<a href="https://colab.research.google.com/github/feliperattes/soccer_pred/blob/pipeline/New_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Summary of the Project**

The prior objective of the project is to create a supervised machine learning algorithm that predicts football matches results based on the statistics of the matches

-- Steps of the project --
1. Web scrapping robot to pick all the information of the matches
2. Automatize the process of Web scrapping to all the season matches
3. Create a supervised machine learning model to predict the outcome of the matches
4. Evaluate the models

-- Metrics used -- 
Accuracy

-- 

# Preparing the Data

In [6]:
import pandas as pd
import numpy as np

In [7]:
data = pd.read_csv('soccer_final_123.csv', index_col=False)
pd.set_option('display.max_columns', 100)

In [8]:
data.tail(5)

Unnamed: 0.1,Unnamed: 0,HT,Score,AT,attendance,referee,HomeGoals,AwayGoals,match_report_link,Home_Poss,Away_Poss,Home_PA,Away_PA,Home_ShoT,Away_ShoT,Home_Saves,Away_Saves,HomeFouls,HomeCorners,HomeCrosses,HomeTouches,HomeTackles,HomeInterceptions,HomeAerials,HomeClearances,HomeOffsides,HomeGoalKicks,HomeThrowIns,HomeLongBalls,AwayFouls,AwayCorners,AwayCrosses,AwayTouches,AwayTackles,AwayInterceptions,AwayAerials,AwayClearances,AwayOffsides,AwayGoalKicks,AwayThrowIns,AwayLongBalls
1353,1353,Corinthians,0–0,Grêmio,,Caio Max Augusto Vieira,0,0,https://fbref.com/en/matches/1764b47f/Corinthi...,32%,68%,74%,91%,14%,20%,100%,100%,12.0,4.0,10.0,473.0,20.0,13.0,16.0,21.0,3.0,12.0,10.0,46.0,14.0,12.0,37.0,821.0,10.0,14.0,26.0,15.0,1.0,5.0,19.0,49.0
1354,1354,Sport Recife,0–1,Atl Goianiense,,Edina Alves Batista,0,1,https://fbref.com/en/matches/4f786530/Sport-Re...,63%,37%,85%,78%,9%,33%,75%,100%,17.0,10.0,39.0,626.0,15.0,12.0,16.0,15.0,1.0,5.0,21.0,43.0,16.0,5.0,17.0,464.0,14.0,23.0,16.0,34.0,2.0,10.0,20.0,45.0
1355,1355,Ceará,1–1,São Paulo,,Wagner do Nascimento Magalhaes,1,1,https://fbref.com/en/matches/9e4e8ba1/Ceara-Sa...,32%,68%,76%,89%,55%,26%,60%,83%,20.0,5.0,14.0,477.0,16.0,20.0,13.0,18.0,2.0,9.0,20.0,50.0,13.0,5.0,21.0,834.0,9.0,12.0,16.0,10.0,1.0,5.0,12.0,36.0
1356,1356,Atlético Mineiro,2–1,Botafogo (RJ),,Savio Pereira,2,1,https://fbref.com/en/matches/ab6f4914/Atletico...,54%,46%,87%,82%,55%,75%,66%,66%,25.0,10.0,21.0,631.0,11.0,15.0,12.0,6.0,2.0,6.0,15.0,33.0,19.0,2.0,7.0,548.0,14.0,9.0,9.0,12.0,4.0,4.0,18.0,67.0
1357,1357,Coritiba,0–1,Corinthians,,Leandro Pedro Vuaden,0,1,https://fbref.com/en/matches/b729e3e6/Coritiba...,53%,47%,82%,78%,27%,17%,50%,100%,13.0,4.0,25.0,657.0,11.0,13.0,24.0,17.0,1.0,7.0,22.0,59.0,25.0,3.0,18.0,593.0,12.0,13.0,22.0,20.0,2.0,10.0,23.0,69.0


In [9]:
data.isnull().sum()

Unnamed: 0             0
HT                     0
Score                  0
AT                     0
attendance           221
referee                0
HomeGoals              0
AwayGoals              0
match_report_link      0
Home_Poss              2
Away_Poss              2
Home_PA                2
Away_PA                2
Home_ShoT              6
Away_ShoT              6
Home_Saves            15
Away_Saves            15
HomeFouls             16
HomeCorners           17
HomeCrosses           15
HomeTouches            2
HomeTackles           17
HomeInterceptions     17
HomeAerials           17
HomeClearances        17
HomeOffsides          17
HomeGoalKicks         17
HomeThrowIns          17
HomeLongBalls         17
AwayFouls             16
AwayCorners           17
AwayCrosses           15
AwayTouches            2
AwayTackles           17
AwayInterceptions     17
AwayAerials           17
AwayClearances        17
AwayOffsides          17
AwayGoalKicks         17
AwayThrowIns          17


In [10]:
data["Home_Poss"] = data["Home_Poss"].str.rstrip("%")
data["Home_Poss"] = pd.to_numeric(data["Home_Poss"])
data["Home_Poss"] = data["Home_Poss"] / 100.0
data["Away_Poss"] = data["Away_Poss"].str.rstrip("%")
data["Away_Poss"] = pd.to_numeric(data["Away_Poss"])
data["Away_Poss"] = data["Away_Poss"] / 100.0

data["attendance"] = data["attendance"].str.replace(",", "")
data["attendance"] = pd.to_numeric(data["attendance"])

data["Home_PA"] = data["Home_PA"].str.rstrip("%")
data["Away_PA"] = data["Away_PA"].str.rstrip("%")
data["Home_PA"] = pd.to_numeric(data["Home_PA"])
data["Home_PA"] = data["Home_PA"] / 100.0
data["Away_PA"] = pd.to_numeric(data["Away_PA"])
data["Away_PA"] = data["Away_PA"] / 100.0

data["Home_ShoT"] = data["Home_ShoT"].str.rstrip("%")
data["Away_ShoT"] = data["Away_ShoT"].str.rstrip("%")
data["Home_ShoT"] = pd.to_numeric(data["Home_ShoT"])
data["Home_ShoT"] = data["Home_ShoT"] / 100.0
data["Away_ShoT"] = pd.to_numeric(data["Away_ShoT"])
data["Away_ShoT"] = data["Away_ShoT"] / 100.0

data["Home_Saves"] = data["Home_Saves"].str.rstrip("%")
data["Away_Saves"] = data["Away_Saves"].str.rstrip("%")
data["Home_Saves"] = pd.to_numeric(data["Home_Saves"])
data["Home_Saves"] = data["Home_Saves"] / 100.0
data["Away_Saves"] = pd.to_numeric(data["Away_Saves"])
data["Away_Saves"] = data["Away_Saves"] / 100.0

In [11]:
data.rename(columns={data.columns[0]: 'Index'}, inplace=True)

In [12]:
data['DifferenceGoals'] = data['HomeGoals'] - data['AwayGoals']

In [13]:
data['home_win'] = np.where(data['DifferenceGoals'] > 0, 1, 0)
data['away_win'] = np.where(data['DifferenceGoals'] < 0, 1, 0)

In [14]:
conditions1 = [
    (data['DifferenceGoals'] > 0),
    (data['DifferenceGoals'] < 0),
    (data['DifferenceGoals'] == 0)
    ]

values1 = [3, 0, 1]

conditions2 = [
    (data['DifferenceGoals'] > 0),
    (data['DifferenceGoals'] < 0),
    (data['DifferenceGoals'] == 0)
    ]

values2 = [0, 3, 1]

In [15]:
data['points_result_home'] = np.select(conditions1, values1)
data['points_result_away'] = np.select(conditions2, values2)
#data['result'] = np.select(conditions3, values3)

In [16]:
data = data.drop(['attendance'], axis=1)

In [17]:
# Maybe it's better dropping Saves!!

data.dropna(inplace=True)

In [18]:
data.reset_index(inplace=True)

In [19]:
data.shape

(1288, 46)

In [20]:
data.tail(20)

Unnamed: 0,index,Index,HT,Score,AT,referee,HomeGoals,AwayGoals,match_report_link,Home_Poss,Away_Poss,Home_PA,Away_PA,Home_ShoT,Away_ShoT,Home_Saves,Away_Saves,HomeFouls,HomeCorners,HomeCrosses,HomeTouches,HomeTackles,HomeInterceptions,HomeAerials,HomeClearances,HomeOffsides,HomeGoalKicks,HomeThrowIns,HomeLongBalls,AwayFouls,AwayCorners,AwayCrosses,AwayTouches,AwayTackles,AwayInterceptions,AwayAerials,AwayClearances,AwayOffsides,AwayGoalKicks,AwayThrowIns,AwayLongBalls,DifferenceGoals,home_win,away_win,points_result_home,points_result_away
1268,1338,1338,Fortaleza,2–3,São Paulo,Marcelo de Lima Henrique,2,3,https://fbref.com/en/matches/cfd8bb6b/Fortalez...,0.5,0.5,0.83,0.84,0.5,0.63,0.4,0.5,15.0,2.0,16.0,563.0,9.0,9.0,16.0,20.0,1.0,8.0,17.0,76.0,18.0,3.0,15.0,557.0,14.0,6.0,17.0,14.0,2.0,4.0,25.0,31.0,-1,0,1,0,3
1269,1339,1339,Palmeiras,2–0,Fluminense,Leandro Pedro Vuaden,2,0,https://fbref.com/en/matches/8752863e/Palmeira...,0.51,0.5,0.84,0.85,0.56,0.33,1.0,0.6,15.0,3.0,19.0,624.0,16.0,17.0,15.0,19.0,3.0,9.0,20.0,71.0,8.0,5.0,20.0,625.0,19.0,7.0,15.0,9.0,3.0,6.0,20.0,44.0,2,1,0,3,0
1270,1340,1340,Flamengo,1–1,Atl Goianiense,Rafael Traci,1,1,https://fbref.com/en/matches/0f8a74fb/Flamengo...,0.66,0.34,0.88,0.78,0.37,0.5,0.8,0.71,15.0,11.0,37.0,748.0,8.0,11.0,15.0,14.0,1.0,6.0,20.0,46.0,16.0,4.0,16.0,470.0,10.0,10.0,16.0,19.0,2.0,10.0,19.0,61.0,0,0,0,1,1
1271,1341,1341,Coritiba,1–2,Bahia,Igor Junior Benevenuto,1,2,https://fbref.com/en/matches/57dd0af8/Coritiba...,0.53,0.47,0.88,0.86,0.13,0.33,0.6,0.5,12.0,7.0,25.0,616.0,7.0,9.0,7.0,11.0,1.0,10.0,16.0,42.0,12.0,5.0,23.0,569.0,8.0,10.0,9.0,22.0,1.0,9.0,21.0,50.0,-1,0,1,0,3
1272,1342,1342,Botafogo (RJ),1–2,Bragantino,Rodolpho Toski Marques,1,2,https://fbref.com/en/matches/0a96aff0/Botafogo...,0.61,0.39,0.82,0.7,0.25,0.55,0.33,0.66,16.0,7.0,19.0,621.0,9.0,9.0,17.0,14.0,2.0,6.0,28.0,72.0,18.0,3.0,19.0,456.0,13.0,15.0,15.0,21.0,0.0,6.0,19.0,49.0,-1,0,1,0,3
1273,1343,1343,Atlético Mineiro,0–2,Atl Paranaense,Dyorgines José Padovani Andrade,0,2,https://fbref.com/en/matches/b6ee4cd1/Atletico...,0.64,0.36,0.87,0.77,0.27,0.6,0.66,1.0,7.0,9.0,27.0,750.0,10.0,17.0,7.0,5.0,1.0,3.0,26.0,70.0,23.0,1.0,6.0,487.0,21.0,7.0,9.0,38.0,1.0,11.0,22.0,57.0,-2,0,1,0,3
1274,1344,1344,Vasco da Gama,0–0,Fortaleza,Jean Pierre Goncalves Lima,0,0,https://fbref.com/en/matches/02194ba6/Vasco-da...,0.48,0.53,0.82,0.82,0.45,0.28,1.0,1.0,11.0,4.0,23.0,558.0,8.0,14.0,20.0,32.0,1.0,6.0,21.0,40.0,20.0,11.0,26.0,573.0,13.0,10.0,21.0,24.0,1.0,10.0,16.0,47.0,0,0,0,1,1
1275,1345,1345,Bragantino,4–0,Bahia,Anderson Daronco,4,0,https://fbref.com/en/matches/e8c56b12/Braganti...,0.53,0.48,0.88,0.83,0.58,0.13,1.0,0.42,16.0,5.0,12.0,613.0,13.0,13.0,14.0,26.0,1.0,7.0,13.0,37.0,13.0,9.0,27.0,538.0,12.0,7.0,9.0,8.0,0.0,6.0,24.0,45.0,4,1,0,3,0
1276,1346,1346,Flamengo,3–1,Coritiba,Ricardo Marques Ribeiro,3,1,https://fbref.com/en/matches/5d5bae5e/Flamengo...,0.62,0.39,0.88,0.82,0.39,0.17,0.0,0.66,16.0,4.0,12.0,831.0,19.0,12.0,12.0,10.0,0.0,2.0,16.0,55.0,18.0,4.0,13.0,563.0,12.0,8.0,17.0,12.0,1.0,10.0,13.0,48.0,2,1,0,3,0
1277,1347,1347,Atl Paranaense,1–0,Santos,Marcelo de Lima Henrique,1,0,https://fbref.com/en/matches/d15b7fd2/Atletico...,0.6,0.4,0.84,0.82,0.31,0.67,1.0,0.75,22.0,9.0,15.0,686.0,15.0,10.0,24.0,13.0,2.0,7.0,21.0,60.0,13.0,2.0,13.0,542.0,11.0,22.0,10.0,16.0,1.0,8.0,22.0,57.0,1,1,0,3,0


## Feature Engineering

### Function to get the information about the last 3 matches of the team 

In [21]:
# -------- Function to get the information about the last 3 matches of the team ----------
# We need to create 2 functions (Home / Away) because of the Series HT and AT

def home_average_last_3(variavel):
    '''
    Description: Its picks a variable(str, as Fouls, Corners, etc) and generates the 3 last games mean of this variable until the last game played.
    
    Input:
        - Variable (Str)
    Output: 
        - Last 3 games Variable Mean until the new game (Int)    
    
    '''
    media = []
    for i, j in zip(data["Index"],data["HT"]):
        oi = data.loc[data["Index"]<i]
        oi = oi.loc[oi["HT"]==j]
        oi= oi.reset_index(drop=True)
        oi = oi[-3:]
        oi = oi[variavel].mean()
        #print(oi)
        media.append(oi)
        
        
    #print(len(media))
    data["last_3_home_avrg_"+variavel] = pd.DataFrame(media)
    
    
def away_average_last_3(variavel):
    '''
    Description: Its picks a variable(str, as Fouls, Corners, etc) and generates the 3 last games mean of this variable until the last game played.
    
    Input:
        - Variable (Str)
    Output: 
        - Last 3 games Variable Mean until the new game (Int)    
    
    '''
    media = []
    for i, j in zip(data["Index"],data["AT"]):
        oi = data.loc[data["Index"]<i]
        oi = oi.loc[oi["AT"]==j]
        oi= oi.reset_index(drop=True)
        oi = oi[-3:]
        oi = oi[variavel].mean()
        #print(oi)
        media.append(oi)

    data["last_3_away_avrg_"+variavel] = pd.DataFrame(media)

### Function to get the information about the whole matches of the team

In [22]:
# -------- Function to get the information about the whole matches of the team ----------

def home_average_season(variavel):
    '''
    Description: Its picks a variable(str) and generates the mean of this variable until the last game played.
    
    Input:
        - Variable (Str)
    Output: 
        - Season Mean Variable until the new game (Int)    
    
    '''
    media = []
    for i, j in zip(data["Index"],data["HT"]):
        oi = data.loc[data["Index"]<i]
        oi = oi.loc[oi["HT"]==j]
        oi = oi[variavel].mean()
        media.append(oi)

    data["home_avrg_"+variavel] = pd.DataFrame(media)
    

def away_average_season(variavel):
    '''
    Description: Its picks a variable(str) and generates the mean of this variable until the last game played.
    
    Input:
        - Variable (Str)
    Output: 
        - Season Mean Variable until the new game (Int)    
    
    '''
    media = []
    for i, j in zip(data["Index"],data["AT"]):
        oi = data.loc[data["Index"]<i]
        oi = oi.loc[oi["AT"]==j]
        oi = oi[variavel].mean()
        media.append(oi)

    data["away_avrg_"+variavel] = pd.DataFrame(media)


### Function to get number of points scored in the last 5 matches

In [23]:
def home_sequence_5():
    '''
    Description: Picks the last 5 games and calculates how many points the team scored, victory = 3
    loss = 0, draw = 1.
    
    Input:
        - None
    Output: 
        - Sequence of the last 5 games points    
    
    '''
    sequences = []
    for i, j in zip(data["Index"],data["HT"]):
        oi = data.loc[data["Index"]<i]
        oi = oi.loc[oi["HT"]==j]
        oi= oi.reset_index(drop=True)
        oi = oi[-5:]
        oi = oi['points_result_home'].rolling(5, min_periods=1).sum()
        sequences.append(oi.values[-1:])
    data["home_pnts_lst_5"] = pd.DataFrame(sequences)
    
def away_sequence_5():
    '''
    Description: Picks the last 5 games and calculates how many points the team scored, victory = 3
    loss = 0, draw = 1.
    
    Input:
        - None
    Output: 
        - Sequence of the last 5 games points    
    
    '''
    sequences = []
    for i, j in zip(data["Index"],data["AT"]):
        oi = data.loc[data["Index"]<i]
        oi = oi.loc[oi["AT"]==j]
        oi= oi.reset_index(drop=True)
        oi = oi[-5:]
        oi = oi['points_result_away'].rolling(5, min_periods=1).sum()
        sequences.append(oi.values[-1:])
    data["away_pnts_lst_5"] = pd.DataFrame(sequences)

### Function to get number of points scored in the last 3 matches

In [24]:
def home_sequence_3():
    '''
    Description: Picks the last 3 games and calculates how many points the team scored, victory = 3
    loss = 0, draw = 1.
    
    Input:
        - None
    Output: 
        - Sequence of the last 3 games points    
    
    '''
    sequences = []
    for i, j in zip(data["Index"],data["HT"]):
        oi = data.loc[data["Index"]<i]
        oi = oi.loc[oi["HT"]==j]
        oi= oi.reset_index(drop=True)
        oi = oi[-3:]
        oi = oi['points_result_home'].rolling(3, min_periods=1).sum()
        sequences.append(oi.values[-1:])
    data["home_pnts_lst_3"] = pd.DataFrame(sequences)

def away_sequence_3():
    '''
    Description: Picks the last 3 games and calculates how many points the team scored, victory = 3
    loss = 0, draw = 1.
    
    Input:
        - None
    Output: 
        - Sequence of the last 3 games points    
    
    '''
    sequences = []
    for i, j in zip(data["Index"],data["AT"]):
        oi = data.loc[data["Index"]<i]
        oi = oi.loc[oi["AT"]==j]
        oi= oi.reset_index(drop=True)
        oi = oi[-3:]
        oi = oi['points_result_away'].rolling(3, min_periods=1).sum()
        sequences.append(oi.values[-1:])
        #print(sequences)
    data["away_pnts_lst_3"] = pd.DataFrame(sequences)

### Function to get number of points scored in the last match

In [25]:
def home_sequence_1():
    '''
    Description: Picks the last game and calculates how many points the team scored, victory = 3
    loss = 0, draw = 1.
    
    Input:
        - None
    Output: 
        - Sequence of the last game points    
    
    '''
    sequences = []
    for i, j in zip(data["Index"],data["HT"]):
        oi = data.loc[data["Index"]<i]
        oi = oi.loc[oi["HT"]==j]
        oi= oi.reset_index(drop=True)
        oi = oi[-1:]
        oi = oi['points_result_home'].rolling(1).sum()
        sequences.append(oi.values)
    data["home_pnts_lst_game"] = pd.DataFrame(sequences)
    

def away_sequence_1():
    '''
    Description: Picks the last game and calculates how many points the team scored, victory = 3
    loss = 0, draw = 1.
    
    Input:
        - None
    Output: 
        - Sequence of the last game points    
    
    '''
    sequences = []
    for i, j in zip(data["Index"],data["AT"]):
        oi = data.loc[data["Index"]<i]
        oi = oi.loc[oi["AT"]==j]
        oi= oi.reset_index(drop=True)
        oi = oi[-1:]
        oi = oi['points_result_away'].rolling(1).sum()
        sequences.append(oi.values)
    data["away_pnts_lst_game"] = pd.DataFrame(sequences)

In [26]:
# Running the function to get the data

home_features = ['Home_Poss', 'Home_PA', 'Home_ShoT', 'Home_Saves', 'HomeFouls', 'HomeCorners', 'HomeCrosses', 'HomeTouches', 
                 'HomeTackles', 'HomeInterceptions', 'HomeAerials', 'HomeClearances', 'HomeOffsides', 'HomeGoalKicks', 
                 'HomeThrowIns', 'HomeLongBalls'] 
away_features = ['Away_Poss', 'Away_PA', 'Away_ShoT', 'Away_Saves', 'AwayFouls', 'AwayCorners', 'AwayCrosses', 'AwayTouches', 
                 'AwayTackles', 'AwayInterceptions', 'AwayAerials', 'AwayClearances', 'AwayOffsides', 'AwayGoalKicks', 
                 'AwayThrowIns', 'AwayLongBalls']

for i in home_features:
    home_average_season(i)
    home_average_last_3(i)
    
for i in away_features:
    away_average_season(i)
    away_average_last_3(i)

In [27]:
home_sequence_5()
away_sequence_5()
home_sequence_3()
away_sequence_3()
home_sequence_1()
away_sequence_1()

In [28]:
pd.set_option('display.max_columns', 200)
data.head()

Unnamed: 0,index,Index,HT,Score,AT,referee,HomeGoals,AwayGoals,match_report_link,Home_Poss,Away_Poss,Home_PA,Away_PA,Home_ShoT,Away_ShoT,Home_Saves,Away_Saves,HomeFouls,HomeCorners,HomeCrosses,HomeTouches,HomeTackles,HomeInterceptions,HomeAerials,HomeClearances,HomeOffsides,HomeGoalKicks,HomeThrowIns,HomeLongBalls,AwayFouls,AwayCorners,AwayCrosses,AwayTouches,AwayTackles,AwayInterceptions,AwayAerials,AwayClearances,AwayOffsides,AwayGoalKicks,AwayThrowIns,AwayLongBalls,DifferenceGoals,home_win,away_win,points_result_home,points_result_away,home_avrg_Home_Poss,last_3_home_avrg_Home_Poss,home_avrg_Home_PA,last_3_home_avrg_Home_PA,home_avrg_Home_ShoT,last_3_home_avrg_Home_ShoT,home_avrg_Home_Saves,last_3_home_avrg_Home_Saves,home_avrg_HomeFouls,last_3_home_avrg_HomeFouls,home_avrg_HomeCorners,last_3_home_avrg_HomeCorners,home_avrg_HomeCrosses,last_3_home_avrg_HomeCrosses,home_avrg_HomeTouches,last_3_home_avrg_HomeTouches,home_avrg_HomeTackles,last_3_home_avrg_HomeTackles,home_avrg_HomeInterceptions,last_3_home_avrg_HomeInterceptions,home_avrg_HomeAerials,last_3_home_avrg_HomeAerials,home_avrg_HomeClearances,last_3_home_avrg_HomeClearances,home_avrg_HomeOffsides,last_3_home_avrg_HomeOffsides,home_avrg_HomeGoalKicks,last_3_home_avrg_HomeGoalKicks,home_avrg_HomeThrowIns,last_3_home_avrg_HomeThrowIns,home_avrg_HomeLongBalls,last_3_home_avrg_HomeLongBalls,away_avrg_Away_Poss,last_3_away_avrg_Away_Poss,away_avrg_Away_PA,last_3_away_avrg_Away_PA,away_avrg_Away_ShoT,last_3_away_avrg_Away_ShoT,away_avrg_Away_Saves,last_3_away_avrg_Away_Saves,away_avrg_AwayFouls,last_3_away_avrg_AwayFouls,away_avrg_AwayCorners,last_3_away_avrg_AwayCorners,away_avrg_AwayCrosses,last_3_away_avrg_AwayCrosses,away_avrg_AwayTouches,last_3_away_avrg_AwayTouches,away_avrg_AwayTackles,last_3_away_avrg_AwayTackles,away_avrg_AwayInterceptions,last_3_away_avrg_AwayInterceptions,away_avrg_AwayAerials,last_3_away_avrg_AwayAerials,away_avrg_AwayClearances,last_3_away_avrg_AwayClearances,away_avrg_AwayOffsides,last_3_away_avrg_AwayOffsides,away_avrg_AwayGoalKicks,last_3_away_avrg_AwayGoalKicks,away_avrg_AwayThrowIns,last_3_away_avrg_AwayThrowIns,away_avrg_AwayLongBalls,last_3_away_avrg_AwayLongBalls,home_pnts_lst_5,away_pnts_lst_5,home_pnts_lst_3,away_pnts_lst_3,home_pnts_lst_game,away_pnts_lst_game
0,0,0,Flamengo,1–1,Atlético Mineiro,Jailson Macêdo Freitas,1,1,https://fbref.com/en/matches/d4af3198/Flamengo...,0.42,0.58,0.77,0.82,0.27,0.27,0.25,0.33,7.0,4.0,21.0,537.0,19.0,24.0,10.0,18.0,3.0,10.0,18.0,57.0,10.0,6.0,15.0,701.0,15.0,18.0,10.0,27.0,1.0,11.0,26.0,54.0,0,0,0,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1,1,Corinthians,1–1,Chapecoense,Elmo Alves Resende Cunha,1,1,https://fbref.com/en/matches/eaf952cd/Corinthi...,0.67,0.33,0.84,0.74,0.4,0.07,0.0,0.5,10.0,1.0,15.0,810.0,19.0,9.0,14.0,12.0,1.0,10.0,25.0,54.0,12.0,7.0,22.0,496.0,17.0,22.0,17.0,16.0,3.0,4.0,24.0,43.0,0,0,0,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2,2,Fluminense,3–2,Santos,Wagner Reway,3,2,https://fbref.com/en/matches/2b4ec09f/Fluminen...,0.38,0.62,0.86,0.89,0.56,0.35,0.71,0.4,15.0,5.0,27.0,483.0,12.0,11.0,12.0,28.0,2.0,10.0,7.0,54.0,18.0,8.0,21.0,684.0,13.0,4.0,12.0,22.0,2.0,5.0,22.0,24.0,1,1,0,3,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,3,3,Palmeiras,4–0,Vasco da Gama,Rodolpho Toski Marques,4,0,https://fbref.com/en/matches/97072c2b/Palmeira...,0.51,0.49,0.86,0.84,0.56,0.31,1.0,0.55,19.0,3.0,18.0,590.0,16.0,16.0,9.0,18.0,5.0,11.0,13.0,47.0,16.0,2.0,16.0,562.0,18.0,17.0,6.0,11.0,2.0,5.0,22.0,41.0,4,1,0,3,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,4,4,Bahia,6–2,Atl Paranaense,Flavio Rodrigues De Souza,6,2,https://fbref.com/en/matches/20e0f0ca/Bahia-At...,0.4,0.6,0.81,0.86,0.37,0.42,0.6,0.3,11.0,14.0,30.0,487.0,22.0,7.0,7.0,16.0,4.0,4.0,20.0,44.0,11.0,3.0,10.0,669.0,15.0,12.0,11.0,30.0,1.0,10.0,21.0,35.0,4,1,0,3,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [29]:
conditions3 = [
    (data['DifferenceGoals'] > 0),
    (data['DifferenceGoals'] < 0),
    (data['DifferenceGoals'] == 0)
    ]

values3 = [1, 0, 2]

In [30]:
data['result'] = np.select(conditions3, values3)

In [31]:
data['number_of_goals'] = data['HomeGoals'] + data['AwayGoals']
data['number_of_corners'] = data['HomeCorners'] + data['AwayCorners']

In [32]:
data.tail()

Unnamed: 0,index,Index,HT,Score,AT,referee,HomeGoals,AwayGoals,match_report_link,Home_Poss,Away_Poss,Home_PA,Away_PA,Home_ShoT,Away_ShoT,Home_Saves,Away_Saves,HomeFouls,HomeCorners,HomeCrosses,HomeTouches,HomeTackles,HomeInterceptions,HomeAerials,HomeClearances,HomeOffsides,HomeGoalKicks,HomeThrowIns,HomeLongBalls,AwayFouls,AwayCorners,AwayCrosses,AwayTouches,AwayTackles,AwayInterceptions,AwayAerials,AwayClearances,AwayOffsides,AwayGoalKicks,AwayThrowIns,AwayLongBalls,DifferenceGoals,home_win,away_win,points_result_home,points_result_away,home_avrg_Home_Poss,last_3_home_avrg_Home_Poss,home_avrg_Home_PA,last_3_home_avrg_Home_PA,home_avrg_Home_ShoT,last_3_home_avrg_Home_ShoT,home_avrg_Home_Saves,last_3_home_avrg_Home_Saves,home_avrg_HomeFouls,last_3_home_avrg_HomeFouls,home_avrg_HomeCorners,last_3_home_avrg_HomeCorners,home_avrg_HomeCrosses,last_3_home_avrg_HomeCrosses,home_avrg_HomeTouches,last_3_home_avrg_HomeTouches,home_avrg_HomeTackles,last_3_home_avrg_HomeTackles,home_avrg_HomeInterceptions,last_3_home_avrg_HomeInterceptions,home_avrg_HomeAerials,last_3_home_avrg_HomeAerials,home_avrg_HomeClearances,last_3_home_avrg_HomeClearances,home_avrg_HomeOffsides,last_3_home_avrg_HomeOffsides,home_avrg_HomeGoalKicks,last_3_home_avrg_HomeGoalKicks,home_avrg_HomeThrowIns,last_3_home_avrg_HomeThrowIns,home_avrg_HomeLongBalls,last_3_home_avrg_HomeLongBalls,away_avrg_Away_Poss,last_3_away_avrg_Away_Poss,away_avrg_Away_PA,last_3_away_avrg_Away_PA,away_avrg_Away_ShoT,last_3_away_avrg_Away_ShoT,away_avrg_Away_Saves,last_3_away_avrg_Away_Saves,away_avrg_AwayFouls,last_3_away_avrg_AwayFouls,away_avrg_AwayCorners,last_3_away_avrg_AwayCorners,away_avrg_AwayCrosses,last_3_away_avrg_AwayCrosses,away_avrg_AwayTouches,last_3_away_avrg_AwayTouches,away_avrg_AwayTackles,last_3_away_avrg_AwayTackles,away_avrg_AwayInterceptions,last_3_away_avrg_AwayInterceptions,away_avrg_AwayAerials,last_3_away_avrg_AwayAerials,away_avrg_AwayClearances,last_3_away_avrg_AwayClearances,away_avrg_AwayOffsides,last_3_away_avrg_AwayOffsides,away_avrg_AwayGoalKicks,last_3_away_avrg_AwayGoalKicks,away_avrg_AwayThrowIns,last_3_away_avrg_AwayThrowIns,away_avrg_AwayLongBalls,last_3_away_avrg_AwayLongBalls,home_pnts_lst_5,away_pnts_lst_5,home_pnts_lst_3,away_pnts_lst_3,home_pnts_lst_game,away_pnts_lst_game,result,number_of_goals,number_of_corners
1283,1353,1353,Corinthians,0–0,Grêmio,Caio Max Augusto Vieira,0,0,https://fbref.com/en/matches/1764b47f/Corinthi...,0.32,0.68,0.74,0.91,0.14,0.2,1.0,1.0,12.0,4.0,10.0,473.0,20.0,13.0,16.0,21.0,3.0,12.0,10.0,46.0,14.0,12.0,37.0,821.0,10.0,14.0,26.0,15.0,1.0,5.0,19.0,49.0,0,0,0,1,1,0.543548,0.446667,0.833387,0.75,0.347419,0.27,0.736774,0.38,13.629032,20.666667,5.790323,7.333333,21.129032,22.0,672.387097,560.333333,14.435484,11.333333,11.967742,13.0,16.241935,17.0,19.129032,15.666667,1.274194,0.666667,9.241935,7.666667,22.967742,26.0,51.096774,47.333333,0.499839,0.376667,0.821452,0.803333,0.394516,0.246667,0.74129,0.666667,13.725806,13.0,4.145161,4.666667,13.467742,14.0,643.016129,488.333333,16.645161,18.666667,12.048387,14.0,14.612903,8.666667,22.193548,10.0,1.370968,1.666667,9.016129,6.666667,18.870968,19.666667,53.290323,46.666667,5.0,7.0,1.0,7.0,0.0,3.0,2,0,16.0
1284,1354,1354,Sport Recife,0–1,Atl Goianiense,Edina Alves Batista,0,1,https://fbref.com/en/matches/4f786530/Sport-Re...,0.63,0.37,0.85,0.78,0.09,0.33,0.75,1.0,17.0,10.0,39.0,626.0,15.0,12.0,16.0,15.0,1.0,5.0,21.0,43.0,16.0,5.0,17.0,464.0,14.0,23.0,16.0,34.0,2.0,10.0,20.0,45.0,-1,0,1,0,3,0.5175,0.51,0.795,0.79,0.338958,0.31,0.706458,0.646667,15.1875,12.333333,6.145833,2.666667,26.458333,22.0,612.8125,629.0,16.520833,14.0,14.208333,12.0,17.354167,13.0,17.729167,18.0,1.583333,2.0,7.0,7.666667,21.791667,19.666667,48.604167,53.666667,0.441667,0.483333,0.767,0.77,0.337333,0.33,0.650667,0.57,18.0,18.333333,4.6,6.0,19.3,20.0,555.233333,562.0,14.6,12.0,13.966667,8.666667,14.733333,16.666667,19.7,15.0,1.733333,1.333333,9.066667,8.666667,22.5,25.666667,53.466667,58.666667,6.0,5.0,3.0,4.0,0.0,1.0,0,1,15.0
1285,1355,1355,Ceará,1–1,São Paulo,Wagner do Nascimento Magalhaes,1,1,https://fbref.com/en/matches/9e4e8ba1/Ceara-Sa...,0.32,0.68,0.76,0.89,0.55,0.26,0.6,0.83,20.0,5.0,14.0,477.0,16.0,20.0,13.0,18.0,2.0,9.0,20.0,50.0,13.0,5.0,21.0,834.0,9.0,12.0,16.0,10.0,1.0,5.0,12.0,36.0,0,0,0,1,1,0.463261,0.456667,0.796087,0.766667,0.312609,0.316667,0.721304,0.776667,14.891304,16.666667,6.652174,5.333333,22.521739,21.0,569.152174,551.0,15.934783,12.666667,10.913043,18.0,16.717391,14.333333,18.847826,17.666667,1.673913,2.333333,7.673913,8.0,20.326087,20.0,50.26087,53.666667,0.525231,0.5,0.818769,0.84,0.324923,0.52,0.704308,0.783333,15.784615,17.666667,4.676923,3.666667,18.323077,14.0,632.830769,599.0,16.015385,17.666667,11.476923,8.333333,17.046154,13.666667,21.892308,20.666667,1.446154,0.666667,8.369231,6.333333,21.415385,16.666667,46.723077,46.0,9.0,11.0,5.0,9.0,1.0,3.0,2,2,10.0
1286,1356,1356,Atlético Mineiro,2–1,Botafogo (RJ),Savio Pereira,2,1,https://fbref.com/en/matches/ab6f4914/Atletico...,0.54,0.46,0.87,0.82,0.55,0.75,0.66,0.66,25.0,10.0,21.0,631.0,11.0,15.0,12.0,6.0,2.0,6.0,15.0,33.0,19.0,2.0,7.0,548.0,14.0,9.0,9.0,12.0,4.0,4.0,18.0,67.0,1,1,0,3,0,0.562812,0.53,0.826562,0.816667,0.36625,0.463333,0.691094,0.773333,16.0625,13.0,7.125,6.666667,24.0,21.0,671.75,651.0,17.25,17.666667,12.15625,14.666667,15.21875,9.333333,16.46875,14.333333,1.71875,0.666667,7.078125,6.333333,22.90625,22.0,53.234375,62.666667,0.466271,0.43,0.792542,0.8,0.314407,0.283333,0.702203,0.493333,13.627119,15.0,3.915254,5.333333,16.610169,20.0,582.050847,550.666667,15.067797,12.666667,12.661017,11.0,15.966102,13.0,25.525424,18.0,1.084746,0.333333,10.542373,11.666667,19.576271,17.666667,54.728814,47.333333,10.0,5.0,4.0,3.0,0.0,0.0,1,3,12.0
1287,1357,1357,Coritiba,0–1,Corinthians,Leandro Pedro Vuaden,0,1,https://fbref.com/en/matches/b729e3e6/Coritiba...,0.53,0.47,0.82,0.78,0.27,0.17,0.5,1.0,13.0,4.0,25.0,657.0,11.0,13.0,24.0,17.0,1.0,7.0,22.0,59.0,25.0,3.0,18.0,593.0,12.0,13.0,22.0,20.0,2.0,10.0,23.0,69.0,-1,0,1,0,3,0.482222,0.453333,0.78037,0.803333,0.319259,0.223333,0.689259,0.533333,16.148148,17.333333,4.851852,5.333333,19.666667,16.666667,565.222222,538.666667,16.62963,13.333333,13.296296,10.0,13.814815,10.333333,23.407407,16.333333,1.777778,0.666667,8.851852,8.666667,22.62963,21.0,57.740741,49.333333,0.521231,0.436667,0.826923,0.74,0.354615,0.353333,0.747385,0.843333,13.676923,19.0,4.446154,3.666667,17.446154,17.333333,634.4,547.666667,14.753846,13.666667,10.969231,11.333333,16.969231,17.666667,22.661538,23.666667,1.353846,1.666667,10.169231,8.666667,21.4,25.666667,53.661538,58.666667,5.0,8.0,3.0,7.0,0.0,1.0,0,1,7.0


In [33]:
data.shape

(1288, 119)

In [34]:
data.to_csv('featured_data.csv')

# Preprocessing and Models

## Preprocessing features

In [35]:
import pandas as pd

In [36]:
Home_teams_encoded = pd.get_dummies(data['HT'], dtype=np.int64)
Away_teams_encoded = pd.get_dummies(data['AT'], dtype=np.int64)

In [37]:
Home_teams_encoded.columns = [str(col) + '_HT' for col in Home_teams_encoded.columns]
Away_teams_encoded.columns = [str(col) + '_AT' for col in Away_teams_encoded.columns]

In [38]:
Home_teams_encoded.head()

Unnamed: 0,América (MG)_HT,Atl Goianiense_HT,Atl Paranaense_HT,Atlético Mineiro_HT,Avaí_HT,Bahia_HT,Botafogo (RJ)_HT,Bragantino_HT,CSA_HT,Ceará_HT,Chapecoense_HT,Corinthians_HT,Coritiba_HT,Cruzeiro_HT,Flamengo_HT,Fluminense_HT,Fortaleza_HT,Goiás_HT,Grêmio_HT,Internacional_HT,Palmeiras_HT,Paraná_HT,Ponte Preta_HT,Santos_HT,Sport Recife_HT,São Paulo_HT,Vasco da Gama_HT,Vitória_HT
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [39]:
Away_teams_encoded.head()

Unnamed: 0,América (MG)_AT,Atl Goianiense_AT,Atl Paranaense_AT,Atlético Mineiro_AT,Avaí_AT,Bahia_AT,Botafogo (RJ)_AT,Bragantino_AT,CSA_AT,Ceará_AT,Chapecoense_AT,Corinthians_AT,Coritiba_AT,Cruzeiro_AT,Flamengo_AT,Fluminense_AT,Fortaleza_AT,Goiás_AT,Grêmio_AT,Internacional_AT,Palmeiras_AT,Paraná_AT,Ponte Preta_AT,Santos_AT,Sport Recife_AT,São Paulo_AT,Vasco da Gama_AT,Vitória_AT
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [40]:
Away_teams_encoded.shape

(1288, 28)

In [41]:
data.columns

Index(['index', 'Index', 'HT', 'Score', 'AT', 'referee', 'HomeGoals',
       'AwayGoals', 'match_report_link', 'Home_Poss',
       ...
       'last_3_away_avrg_AwayLongBalls', 'home_pnts_lst_5', 'away_pnts_lst_5',
       'home_pnts_lst_3', 'away_pnts_lst_3', 'home_pnts_lst_game',
       'away_pnts_lst_game', 'result', 'number_of_goals', 'number_of_corners'],
      dtype='object', length=119)

In [42]:
# In this case, clean also the target you are not going to use

clean_data_result = data.drop(['index', 'Index', 'HT', 'Score', 'AT', 'referee', 'HomeGoals', 'AwayGoals', 'match_report_link',
                               'Home_Poss', 'Away_Poss', 'Home_PA', 'Away_PA', 'Home_ShoT', 'Away_ShoT', 'Home_Saves', 
                               'Away_Saves', 'HomeFouls', 'HomeCorners', 'HomeCrosses', 'HomeTouches', 'HomeTackles', 
                               'HomeInterceptions', 'HomeAerials', 'HomeClearances', 'HomeOffsides', 'HomeGoalKicks', 
                               'HomeThrowIns', 'HomeLongBalls', 'AwayFouls', 'AwayCorners', 'AwayCrosses', 'AwayTouches',
                               'AwayTackles', 'AwayInterceptions', 'AwayAerials', 'AwayClearances', 'AwayOffsides', 
                               'AwayGoalKicks', 'AwayThrowIns', 'AwayLongBalls', 'DifferenceGoals', 'home_win', 'away_win', 
                               'points_result_home', 'points_result_away'], axis=1)

## Dummifying teams and dropping nans

In [43]:
a = clean_data_result.columns[:-3]
a

Index(['home_avrg_Home_Poss', 'last_3_home_avrg_Home_Poss',
       'home_avrg_Home_PA', 'last_3_home_avrg_Home_PA', 'home_avrg_Home_ShoT',
       'last_3_home_avrg_Home_ShoT', 'home_avrg_Home_Saves',
       'last_3_home_avrg_Home_Saves', 'home_avrg_HomeFouls',
       'last_3_home_avrg_HomeFouls', 'home_avrg_HomeCorners',
       'last_3_home_avrg_HomeCorners', 'home_avrg_HomeCrosses',
       'last_3_home_avrg_HomeCrosses', 'home_avrg_HomeTouches',
       'last_3_home_avrg_HomeTouches', 'home_avrg_HomeTackles',
       'last_3_home_avrg_HomeTackles', 'home_avrg_HomeInterceptions',
       'last_3_home_avrg_HomeInterceptions', 'home_avrg_HomeAerials',
       'last_3_home_avrg_HomeAerials', 'home_avrg_HomeClearances',
       'last_3_home_avrg_HomeClearances', 'home_avrg_HomeOffsides',
       'last_3_home_avrg_HomeOffsides', 'home_avrg_HomeGoalKicks',
       'last_3_home_avrg_HomeGoalKicks', 'home_avrg_HomeThrowIns',
       'last_3_home_avrg_HomeThrowIns', 'home_avrg_HomeLongBalls',
       'l

In [44]:
b = Home_teams_encoded.columns
c = Away_teams_encoded.columns

In [45]:
clean_data_result.tail()

Unnamed: 0,home_avrg_Home_Poss,last_3_home_avrg_Home_Poss,home_avrg_Home_PA,last_3_home_avrg_Home_PA,home_avrg_Home_ShoT,last_3_home_avrg_Home_ShoT,home_avrg_Home_Saves,last_3_home_avrg_Home_Saves,home_avrg_HomeFouls,last_3_home_avrg_HomeFouls,home_avrg_HomeCorners,last_3_home_avrg_HomeCorners,home_avrg_HomeCrosses,last_3_home_avrg_HomeCrosses,home_avrg_HomeTouches,last_3_home_avrg_HomeTouches,home_avrg_HomeTackles,last_3_home_avrg_HomeTackles,home_avrg_HomeInterceptions,last_3_home_avrg_HomeInterceptions,home_avrg_HomeAerials,last_3_home_avrg_HomeAerials,home_avrg_HomeClearances,last_3_home_avrg_HomeClearances,home_avrg_HomeOffsides,last_3_home_avrg_HomeOffsides,home_avrg_HomeGoalKicks,last_3_home_avrg_HomeGoalKicks,home_avrg_HomeThrowIns,last_3_home_avrg_HomeThrowIns,home_avrg_HomeLongBalls,last_3_home_avrg_HomeLongBalls,away_avrg_Away_Poss,last_3_away_avrg_Away_Poss,away_avrg_Away_PA,last_3_away_avrg_Away_PA,away_avrg_Away_ShoT,last_3_away_avrg_Away_ShoT,away_avrg_Away_Saves,last_3_away_avrg_Away_Saves,away_avrg_AwayFouls,last_3_away_avrg_AwayFouls,away_avrg_AwayCorners,last_3_away_avrg_AwayCorners,away_avrg_AwayCrosses,last_3_away_avrg_AwayCrosses,away_avrg_AwayTouches,last_3_away_avrg_AwayTouches,away_avrg_AwayTackles,last_3_away_avrg_AwayTackles,away_avrg_AwayInterceptions,last_3_away_avrg_AwayInterceptions,away_avrg_AwayAerials,last_3_away_avrg_AwayAerials,away_avrg_AwayClearances,last_3_away_avrg_AwayClearances,away_avrg_AwayOffsides,last_3_away_avrg_AwayOffsides,away_avrg_AwayGoalKicks,last_3_away_avrg_AwayGoalKicks,away_avrg_AwayThrowIns,last_3_away_avrg_AwayThrowIns,away_avrg_AwayLongBalls,last_3_away_avrg_AwayLongBalls,home_pnts_lst_5,away_pnts_lst_5,home_pnts_lst_3,away_pnts_lst_3,home_pnts_lst_game,away_pnts_lst_game,result,number_of_goals,number_of_corners
1283,0.543548,0.446667,0.833387,0.75,0.347419,0.27,0.736774,0.38,13.629032,20.666667,5.790323,7.333333,21.129032,22.0,672.387097,560.333333,14.435484,11.333333,11.967742,13.0,16.241935,17.0,19.129032,15.666667,1.274194,0.666667,9.241935,7.666667,22.967742,26.0,51.096774,47.333333,0.499839,0.376667,0.821452,0.803333,0.394516,0.246667,0.74129,0.666667,13.725806,13.0,4.145161,4.666667,13.467742,14.0,643.016129,488.333333,16.645161,18.666667,12.048387,14.0,14.612903,8.666667,22.193548,10.0,1.370968,1.666667,9.016129,6.666667,18.870968,19.666667,53.290323,46.666667,5.0,7.0,1.0,7.0,0.0,3.0,2,0,16.0
1284,0.5175,0.51,0.795,0.79,0.338958,0.31,0.706458,0.646667,15.1875,12.333333,6.145833,2.666667,26.458333,22.0,612.8125,629.0,16.520833,14.0,14.208333,12.0,17.354167,13.0,17.729167,18.0,1.583333,2.0,7.0,7.666667,21.791667,19.666667,48.604167,53.666667,0.441667,0.483333,0.767,0.77,0.337333,0.33,0.650667,0.57,18.0,18.333333,4.6,6.0,19.3,20.0,555.233333,562.0,14.6,12.0,13.966667,8.666667,14.733333,16.666667,19.7,15.0,1.733333,1.333333,9.066667,8.666667,22.5,25.666667,53.466667,58.666667,6.0,5.0,3.0,4.0,0.0,1.0,0,1,15.0
1285,0.463261,0.456667,0.796087,0.766667,0.312609,0.316667,0.721304,0.776667,14.891304,16.666667,6.652174,5.333333,22.521739,21.0,569.152174,551.0,15.934783,12.666667,10.913043,18.0,16.717391,14.333333,18.847826,17.666667,1.673913,2.333333,7.673913,8.0,20.326087,20.0,50.26087,53.666667,0.525231,0.5,0.818769,0.84,0.324923,0.52,0.704308,0.783333,15.784615,17.666667,4.676923,3.666667,18.323077,14.0,632.830769,599.0,16.015385,17.666667,11.476923,8.333333,17.046154,13.666667,21.892308,20.666667,1.446154,0.666667,8.369231,6.333333,21.415385,16.666667,46.723077,46.0,9.0,11.0,5.0,9.0,1.0,3.0,2,2,10.0
1286,0.562812,0.53,0.826562,0.816667,0.36625,0.463333,0.691094,0.773333,16.0625,13.0,7.125,6.666667,24.0,21.0,671.75,651.0,17.25,17.666667,12.15625,14.666667,15.21875,9.333333,16.46875,14.333333,1.71875,0.666667,7.078125,6.333333,22.90625,22.0,53.234375,62.666667,0.466271,0.43,0.792542,0.8,0.314407,0.283333,0.702203,0.493333,13.627119,15.0,3.915254,5.333333,16.610169,20.0,582.050847,550.666667,15.067797,12.666667,12.661017,11.0,15.966102,13.0,25.525424,18.0,1.084746,0.333333,10.542373,11.666667,19.576271,17.666667,54.728814,47.333333,10.0,5.0,4.0,3.0,0.0,0.0,1,3,12.0
1287,0.482222,0.453333,0.78037,0.803333,0.319259,0.223333,0.689259,0.533333,16.148148,17.333333,4.851852,5.333333,19.666667,16.666667,565.222222,538.666667,16.62963,13.333333,13.296296,10.0,13.814815,10.333333,23.407407,16.333333,1.777778,0.666667,8.851852,8.666667,22.62963,21.0,57.740741,49.333333,0.521231,0.436667,0.826923,0.74,0.354615,0.353333,0.747385,0.843333,13.676923,19.0,4.446154,3.666667,17.446154,17.333333,634.4,547.666667,14.753846,13.666667,10.969231,11.333333,16.969231,17.666667,22.661538,23.666667,1.353846,1.666667,10.169231,8.666667,21.4,25.666667,53.661538,58.666667,5.0,8.0,3.0,7.0,0.0,1.0,0,1,7.0


In [46]:
Home_teams_encoded.head()

Unnamed: 0,América (MG)_HT,Atl Goianiense_HT,Atl Paranaense_HT,Atlético Mineiro_HT,Avaí_HT,Bahia_HT,Botafogo (RJ)_HT,Bragantino_HT,CSA_HT,Ceará_HT,Chapecoense_HT,Corinthians_HT,Coritiba_HT,Cruzeiro_HT,Flamengo_HT,Fluminense_HT,Fortaleza_HT,Goiás_HT,Grêmio_HT,Internacional_HT,Palmeiras_HT,Paraná_HT,Ponte Preta_HT,Santos_HT,Sport Recife_HT,São Paulo_HT,Vasco da Gama_HT,Vitória_HT
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [47]:
Away_teams_encoded.head()

Unnamed: 0,América (MG)_AT,Atl Goianiense_AT,Atl Paranaense_AT,Atlético Mineiro_AT,Avaí_AT,Bahia_AT,Botafogo (RJ)_AT,Bragantino_AT,CSA_AT,Ceará_AT,Chapecoense_AT,Corinthians_AT,Coritiba_AT,Cruzeiro_AT,Flamengo_AT,Fluminense_AT,Fortaleza_AT,Goiás_AT,Grêmio_AT,Internacional_AT,Palmeiras_AT,Paraná_AT,Ponte Preta_AT,Santos_AT,Sport Recife_AT,São Paulo_AT,Vasco da Gama_AT,Vitória_AT
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [48]:
frames = [clean_data_result, Home_teams_encoded, Away_teams_encoded]

In [49]:
result = pd.concat(frames, axis=1)

In [50]:
result.reset_index(inplace=True, drop=True)

In [51]:
result.tail()

Unnamed: 0,home_avrg_Home_Poss,last_3_home_avrg_Home_Poss,home_avrg_Home_PA,last_3_home_avrg_Home_PA,home_avrg_Home_ShoT,last_3_home_avrg_Home_ShoT,home_avrg_Home_Saves,last_3_home_avrg_Home_Saves,home_avrg_HomeFouls,last_3_home_avrg_HomeFouls,home_avrg_HomeCorners,last_3_home_avrg_HomeCorners,home_avrg_HomeCrosses,last_3_home_avrg_HomeCrosses,home_avrg_HomeTouches,last_3_home_avrg_HomeTouches,home_avrg_HomeTackles,last_3_home_avrg_HomeTackles,home_avrg_HomeInterceptions,last_3_home_avrg_HomeInterceptions,home_avrg_HomeAerials,last_3_home_avrg_HomeAerials,home_avrg_HomeClearances,last_3_home_avrg_HomeClearances,home_avrg_HomeOffsides,last_3_home_avrg_HomeOffsides,home_avrg_HomeGoalKicks,last_3_home_avrg_HomeGoalKicks,home_avrg_HomeThrowIns,last_3_home_avrg_HomeThrowIns,home_avrg_HomeLongBalls,last_3_home_avrg_HomeLongBalls,away_avrg_Away_Poss,last_3_away_avrg_Away_Poss,away_avrg_Away_PA,last_3_away_avrg_Away_PA,away_avrg_Away_ShoT,last_3_away_avrg_Away_ShoT,away_avrg_Away_Saves,last_3_away_avrg_Away_Saves,away_avrg_AwayFouls,last_3_away_avrg_AwayFouls,away_avrg_AwayCorners,last_3_away_avrg_AwayCorners,away_avrg_AwayCrosses,last_3_away_avrg_AwayCrosses,away_avrg_AwayTouches,last_3_away_avrg_AwayTouches,away_avrg_AwayTackles,last_3_away_avrg_AwayTackles,away_avrg_AwayInterceptions,last_3_away_avrg_AwayInterceptions,away_avrg_AwayAerials,last_3_away_avrg_AwayAerials,away_avrg_AwayClearances,last_3_away_avrg_AwayClearances,away_avrg_AwayOffsides,last_3_away_avrg_AwayOffsides,away_avrg_AwayGoalKicks,last_3_away_avrg_AwayGoalKicks,away_avrg_AwayThrowIns,last_3_away_avrg_AwayThrowIns,away_avrg_AwayLongBalls,last_3_away_avrg_AwayLongBalls,home_pnts_lst_5,away_pnts_lst_5,home_pnts_lst_3,away_pnts_lst_3,home_pnts_lst_game,away_pnts_lst_game,result,number_of_goals,number_of_corners,América (MG)_HT,Atl Goianiense_HT,Atl Paranaense_HT,Atlético Mineiro_HT,Avaí_HT,Bahia_HT,Botafogo (RJ)_HT,Bragantino_HT,CSA_HT,Ceará_HT,Chapecoense_HT,Corinthians_HT,Coritiba_HT,Cruzeiro_HT,Flamengo_HT,Fluminense_HT,Fortaleza_HT,Goiás_HT,Grêmio_HT,Internacional_HT,Palmeiras_HT,Paraná_HT,Ponte Preta_HT,Santos_HT,Sport Recife_HT,São Paulo_HT,Vasco da Gama_HT,Vitória_HT,América (MG)_AT,Atl Goianiense_AT,Atl Paranaense_AT,Atlético Mineiro_AT,Avaí_AT,Bahia_AT,Botafogo (RJ)_AT,Bragantino_AT,CSA_AT,Ceará_AT,Chapecoense_AT,Corinthians_AT,Coritiba_AT,Cruzeiro_AT,Flamengo_AT,Fluminense_AT,Fortaleza_AT,Goiás_AT,Grêmio_AT,Internacional_AT,Palmeiras_AT,Paraná_AT,Ponte Preta_AT,Santos_AT,Sport Recife_AT,São Paulo_AT,Vasco da Gama_AT,Vitória_AT
1283,0.543548,0.446667,0.833387,0.75,0.347419,0.27,0.736774,0.38,13.629032,20.666667,5.790323,7.333333,21.129032,22.0,672.387097,560.333333,14.435484,11.333333,11.967742,13.0,16.241935,17.0,19.129032,15.666667,1.274194,0.666667,9.241935,7.666667,22.967742,26.0,51.096774,47.333333,0.499839,0.376667,0.821452,0.803333,0.394516,0.246667,0.74129,0.666667,13.725806,13.0,4.145161,4.666667,13.467742,14.0,643.016129,488.333333,16.645161,18.666667,12.048387,14.0,14.612903,8.666667,22.193548,10.0,1.370968,1.666667,9.016129,6.666667,18.870968,19.666667,53.290323,46.666667,5.0,7.0,1.0,7.0,0.0,3.0,2,0,16.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1284,0.5175,0.51,0.795,0.79,0.338958,0.31,0.706458,0.646667,15.1875,12.333333,6.145833,2.666667,26.458333,22.0,612.8125,629.0,16.520833,14.0,14.208333,12.0,17.354167,13.0,17.729167,18.0,1.583333,2.0,7.0,7.666667,21.791667,19.666667,48.604167,53.666667,0.441667,0.483333,0.767,0.77,0.337333,0.33,0.650667,0.57,18.0,18.333333,4.6,6.0,19.3,20.0,555.233333,562.0,14.6,12.0,13.966667,8.666667,14.733333,16.666667,19.7,15.0,1.733333,1.333333,9.066667,8.666667,22.5,25.666667,53.466667,58.666667,6.0,5.0,3.0,4.0,0.0,1.0,0,1,15.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1285,0.463261,0.456667,0.796087,0.766667,0.312609,0.316667,0.721304,0.776667,14.891304,16.666667,6.652174,5.333333,22.521739,21.0,569.152174,551.0,15.934783,12.666667,10.913043,18.0,16.717391,14.333333,18.847826,17.666667,1.673913,2.333333,7.673913,8.0,20.326087,20.0,50.26087,53.666667,0.525231,0.5,0.818769,0.84,0.324923,0.52,0.704308,0.783333,15.784615,17.666667,4.676923,3.666667,18.323077,14.0,632.830769,599.0,16.015385,17.666667,11.476923,8.333333,17.046154,13.666667,21.892308,20.666667,1.446154,0.666667,8.369231,6.333333,21.415385,16.666667,46.723077,46.0,9.0,11.0,5.0,9.0,1.0,3.0,2,2,10.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1286,0.562812,0.53,0.826562,0.816667,0.36625,0.463333,0.691094,0.773333,16.0625,13.0,7.125,6.666667,24.0,21.0,671.75,651.0,17.25,17.666667,12.15625,14.666667,15.21875,9.333333,16.46875,14.333333,1.71875,0.666667,7.078125,6.333333,22.90625,22.0,53.234375,62.666667,0.466271,0.43,0.792542,0.8,0.314407,0.283333,0.702203,0.493333,13.627119,15.0,3.915254,5.333333,16.610169,20.0,582.050847,550.666667,15.067797,12.666667,12.661017,11.0,15.966102,13.0,25.525424,18.0,1.084746,0.333333,10.542373,11.666667,19.576271,17.666667,54.728814,47.333333,10.0,5.0,4.0,3.0,0.0,0.0,1,3,12.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1287,0.482222,0.453333,0.78037,0.803333,0.319259,0.223333,0.689259,0.533333,16.148148,17.333333,4.851852,5.333333,19.666667,16.666667,565.222222,538.666667,16.62963,13.333333,13.296296,10.0,13.814815,10.333333,23.407407,16.333333,1.777778,0.666667,8.851852,8.666667,22.62963,21.0,57.740741,49.333333,0.521231,0.436667,0.826923,0.74,0.354615,0.353333,0.747385,0.843333,13.676923,19.0,4.446154,3.666667,17.446154,17.333333,634.4,547.666667,14.753846,13.666667,10.969231,11.333333,16.969231,17.666667,22.661538,23.666667,1.353846,1.666667,10.169231,8.666667,21.4,25.666667,53.661538,58.666667,5.0,8.0,3.0,7.0,0.0,1.0,0,1,7.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [52]:
result.shape

(1288, 129)

In [53]:
result = result.dropna()
result.reset_index(inplace=True, drop=True)

In [54]:
result.shape

(1248, 129)

In [55]:
result.head()

Unnamed: 0,home_avrg_Home_Poss,last_3_home_avrg_Home_Poss,home_avrg_Home_PA,last_3_home_avrg_Home_PA,home_avrg_Home_ShoT,last_3_home_avrg_Home_ShoT,home_avrg_Home_Saves,last_3_home_avrg_Home_Saves,home_avrg_HomeFouls,last_3_home_avrg_HomeFouls,home_avrg_HomeCorners,last_3_home_avrg_HomeCorners,home_avrg_HomeCrosses,last_3_home_avrg_HomeCrosses,home_avrg_HomeTouches,last_3_home_avrg_HomeTouches,home_avrg_HomeTackles,last_3_home_avrg_HomeTackles,home_avrg_HomeInterceptions,last_3_home_avrg_HomeInterceptions,home_avrg_HomeAerials,last_3_home_avrg_HomeAerials,home_avrg_HomeClearances,last_3_home_avrg_HomeClearances,home_avrg_HomeOffsides,last_3_home_avrg_HomeOffsides,home_avrg_HomeGoalKicks,last_3_home_avrg_HomeGoalKicks,home_avrg_HomeThrowIns,last_3_home_avrg_HomeThrowIns,home_avrg_HomeLongBalls,last_3_home_avrg_HomeLongBalls,away_avrg_Away_Poss,last_3_away_avrg_Away_Poss,away_avrg_Away_PA,last_3_away_avrg_Away_PA,away_avrg_Away_ShoT,last_3_away_avrg_Away_ShoT,away_avrg_Away_Saves,last_3_away_avrg_Away_Saves,away_avrg_AwayFouls,last_3_away_avrg_AwayFouls,away_avrg_AwayCorners,last_3_away_avrg_AwayCorners,away_avrg_AwayCrosses,last_3_away_avrg_AwayCrosses,away_avrg_AwayTouches,last_3_away_avrg_AwayTouches,away_avrg_AwayTackles,last_3_away_avrg_AwayTackles,away_avrg_AwayInterceptions,last_3_away_avrg_AwayInterceptions,away_avrg_AwayAerials,last_3_away_avrg_AwayAerials,away_avrg_AwayClearances,last_3_away_avrg_AwayClearances,away_avrg_AwayOffsides,last_3_away_avrg_AwayOffsides,away_avrg_AwayGoalKicks,last_3_away_avrg_AwayGoalKicks,away_avrg_AwayThrowIns,last_3_away_avrg_AwayThrowIns,away_avrg_AwayLongBalls,last_3_away_avrg_AwayLongBalls,home_pnts_lst_5,away_pnts_lst_5,home_pnts_lst_3,away_pnts_lst_3,home_pnts_lst_game,away_pnts_lst_game,result,number_of_goals,number_of_corners,América (MG)_HT,Atl Goianiense_HT,Atl Paranaense_HT,Atlético Mineiro_HT,Avaí_HT,Bahia_HT,Botafogo (RJ)_HT,Bragantino_HT,CSA_HT,Ceará_HT,Chapecoense_HT,Corinthians_HT,Coritiba_HT,Cruzeiro_HT,Flamengo_HT,Fluminense_HT,Fortaleza_HT,Goiás_HT,Grêmio_HT,Internacional_HT,Palmeiras_HT,Paraná_HT,Ponte Preta_HT,Santos_HT,Sport Recife_HT,São Paulo_HT,Vasco da Gama_HT,Vitória_HT,América (MG)_AT,Atl Goianiense_AT,Atl Paranaense_AT,Atlético Mineiro_AT,Avaí_AT,Bahia_AT,Botafogo (RJ)_AT,Bragantino_AT,CSA_AT,Ceará_AT,Chapecoense_AT,Corinthians_AT,Coritiba_AT,Cruzeiro_AT,Flamengo_AT,Fluminense_AT,Fortaleza_AT,Goiás_AT,Grêmio_AT,Internacional_AT,Palmeiras_AT,Paraná_AT,Ponte Preta_AT,Santos_AT,Sport Recife_AT,São Paulo_AT,Vasco da Gama_AT,Vitória_AT
0,0.6,0.6,0.76,0.76,0.63,0.63,0.66,0.66,13.0,13.0,10.0,10.0,28.0,28.0,615.0,615.0,33.0,33.0,16.0,16.0,17.0,17.0,8.0,8.0,3.0,3.0,11.0,11.0,27.0,27.0,57.0,57.0,0.4,0.4,0.73,0.73,0.33,0.33,0.66,0.66,23.0,23.0,5.0,5.0,14.0,14.0,503.0,503.0,11.0,11.0,19.0,19.0,8.0,8.0,40.0,40.0,4.0,4.0,14.0,14.0,18.0,18.0,72.0,72.0,3.0,3.0,3.0,3.0,3.0,3.0,1,5,10.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,0.47,0.47,0.84,0.84,0.23,0.23,0.75,0.75,13.0,13.0,3.0,3.0,20.0,20.0,596.0,596.0,15.0,15.0,19.0,19.0,15.0,15.0,17.0,17.0,0.0,0.0,5.0,5.0,22.0,22.0,74.0,74.0,0.53,0.53,0.8,0.8,0.48,0.48,0.66,0.66,17.0,17.0,4.0,4.0,19.0,19.0,578.0,578.0,10.0,10.0,18.0,18.0,10.0,10.0,12.0,12.0,2.0,2.0,7.0,7.0,21.0,21.0,55.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,15.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.6,0.6,0.81,0.81,0.16,0.16,0.6,0.6,18.0,18.0,12.0,12.0,43.0,43.0,644.0,644.0,17.0,17.0,16.0,16.0,17.0,17.0,10.0,10.0,3.0,3.0,6.0,6.0,24.0,24.0,63.0,63.0,0.6,0.6,0.89,0.89,0.29,0.29,0.66,0.66,12.0,12.0,6.0,6.0,30.0,30.0,834.0,834.0,13.0,13.0,11.0,11.0,9.0,9.0,8.0,8.0,1.0,1.0,4.0,4.0,21.0,21.0,47.0,47.0,0.0,0.0,0.0,0.0,0.0,0.0,2,4,12.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0.47,0.47,0.77,0.77,0.43,0.43,1.0,1.0,24.0,24.0,3.0,3.0,15.0,15.0,542.0,542.0,16.0,16.0,16.0,16.0,12.0,12.0,27.0,27.0,1.0,1.0,12.0,12.0,24.0,24.0,50.0,50.0,0.56,0.56,0.77,0.77,0.29,0.29,0.8,0.8,14.0,14.0,7.0,7.0,19.0,19.0,638.0,638.0,16.0,16.0,11.0,11.0,15.0,15.0,15.0,15.0,2.0,2.0,10.0,10.0,23.0,23.0,52.0,52.0,3.0,1.0,3.0,1.0,3.0,1.0,0,1,15.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0.44,0.44,0.75,0.75,0.5,0.5,0.8,0.8,20.0,20.0,5.0,5.0,14.0,14.0,554.0,554.0,13.0,13.0,26.0,26.0,23.0,23.0,25.0,25.0,1.0,1.0,11.0,11.0,17.0,17.0,49.0,49.0,0.4,0.4,0.81,0.81,0.67,0.67,1.0,1.0,16.0,16.0,3.0,3.0,20.0,20.0,533.0,533.0,20.0,20.0,13.0,13.0,13.0,13.0,21.0,21.0,3.0,3.0,12.0,12.0,16.0,16.0,55.0,55.0,1.0,3.0,1.0,3.0,1.0,3.0,1,7,18.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [56]:
result.to_csv('dataset_for_all_features.csv')

## Scalling and Running the model

In [57]:
from sklearn.preprocessing import MinMaxScaler

norm = MinMaxScaler()
norm_features = norm.fit_transform(result[a])
x = pd.DataFrame(norm_features, columns=a)

In [58]:
final_data_norm = pd.concat([x, result[b], result[c]], axis=1)

In [59]:
final_data_norm.isnull().sum()

home_avrg_Home_Poss           0
last_3_home_avrg_Home_Poss    0
home_avrg_Home_PA             0
last_3_home_avrg_Home_PA      0
home_avrg_Home_ShoT           0
                             ..
Santos_AT                     0
Sport Recife_AT               0
São Paulo_AT                  0
Vasco da Gama_AT              0
Vitória_AT                    0
Length: 126, dtype: int64

In [60]:
final_data_norm.head()

Unnamed: 0,home_avrg_Home_Poss,last_3_home_avrg_Home_Poss,home_avrg_Home_PA,last_3_home_avrg_Home_PA,home_avrg_Home_ShoT,last_3_home_avrg_Home_ShoT,home_avrg_Home_Saves,last_3_home_avrg_Home_Saves,home_avrg_HomeFouls,last_3_home_avrg_HomeFouls,home_avrg_HomeCorners,last_3_home_avrg_HomeCorners,home_avrg_HomeCrosses,last_3_home_avrg_HomeCrosses,home_avrg_HomeTouches,last_3_home_avrg_HomeTouches,home_avrg_HomeTackles,last_3_home_avrg_HomeTackles,home_avrg_HomeInterceptions,last_3_home_avrg_HomeInterceptions,home_avrg_HomeAerials,last_3_home_avrg_HomeAerials,home_avrg_HomeClearances,last_3_home_avrg_HomeClearances,home_avrg_HomeOffsides,last_3_home_avrg_HomeOffsides,home_avrg_HomeGoalKicks,last_3_home_avrg_HomeGoalKicks,home_avrg_HomeThrowIns,last_3_home_avrg_HomeThrowIns,home_avrg_HomeLongBalls,last_3_home_avrg_HomeLongBalls,away_avrg_Away_Poss,last_3_away_avrg_Away_Poss,away_avrg_Away_PA,last_3_away_avrg_Away_PA,away_avrg_Away_ShoT,last_3_away_avrg_Away_ShoT,away_avrg_Away_Saves,last_3_away_avrg_Away_Saves,away_avrg_AwayFouls,last_3_away_avrg_AwayFouls,away_avrg_AwayCorners,last_3_away_avrg_AwayCorners,away_avrg_AwayCrosses,last_3_away_avrg_AwayCrosses,away_avrg_AwayTouches,last_3_away_avrg_AwayTouches,away_avrg_AwayTackles,last_3_away_avrg_AwayTackles,away_avrg_AwayInterceptions,last_3_away_avrg_AwayInterceptions,away_avrg_AwayAerials,last_3_away_avrg_AwayAerials,away_avrg_AwayClearances,last_3_away_avrg_AwayClearances,away_avrg_AwayOffsides,last_3_away_avrg_AwayOffsides,away_avrg_AwayGoalKicks,last_3_away_avrg_AwayGoalKicks,away_avrg_AwayThrowIns,last_3_away_avrg_AwayThrowIns,away_avrg_AwayLongBalls,last_3_away_avrg_AwayLongBalls,home_pnts_lst_5,away_pnts_lst_5,home_pnts_lst_3,away_pnts_lst_3,home_pnts_lst_game,away_pnts_lst_game,América (MG)_HT,Atl Goianiense_HT,Atl Paranaense_HT,Atlético Mineiro_HT,Avaí_HT,Bahia_HT,Botafogo (RJ)_HT,Bragantino_HT,CSA_HT,Ceará_HT,Chapecoense_HT,Corinthians_HT,Coritiba_HT,Cruzeiro_HT,Flamengo_HT,Fluminense_HT,Fortaleza_HT,Goiás_HT,Grêmio_HT,Internacional_HT,Palmeiras_HT,Paraná_HT,Ponte Preta_HT,Santos_HT,Sport Recife_HT,São Paulo_HT,Vasco da Gama_HT,Vitória_HT,América (MG)_AT,Atl Goianiense_AT,Atl Paranaense_AT,Atlético Mineiro_AT,Avaí_AT,Bahia_AT,Botafogo (RJ)_AT,Bragantino_AT,CSA_AT,Ceará_AT,Chapecoense_AT,Corinthians_AT,Coritiba_AT,Cruzeiro_AT,Flamengo_AT,Fluminense_AT,Fortaleza_AT,Goiás_AT,Grêmio_AT,Internacional_AT,Palmeiras_AT,Paraná_AT,Ponte Preta_AT,Santos_AT,Sport Recife_AT,São Paulo_AT,Vasco da Gama_AT,Vitória_AT
0,0.815789,0.726562,0.409091,0.391892,1.0,0.961538,0.66,0.66,0.266667,0.410714,0.692308,0.692308,0.516129,0.504425,0.418224,0.46946,1.0,1.0,0.409091,0.48,0.684211,0.448276,0.041667,0.10101,0.6,0.6,0.692308,0.692308,0.774194,0.638298,0.630435,0.566038,0.305556,0.268293,0.190476,0.346154,0.433333,0.40625,0.66,0.66,0.904762,0.890625,0.416667,0.416667,0.219512,0.3125,0.23908,0.312327,0.277778,0.217391,0.810811,0.625,0.182927,0.130435,1.0,0.69863,1.0,0.705882,0.836735,0.780488,0.304348,0.407407,0.842105,0.842105,0.2,0.2,0.333333,0.333333,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,0.473684,0.421875,0.772727,0.716216,0.2,0.192308,0.75,0.75,0.266667,0.410714,0.153846,0.153846,0.258065,0.292035,0.373832,0.428977,0.28,0.325,0.545455,0.6,0.578947,0.37931,0.416667,0.373737,0.0,0.0,0.230769,0.230769,0.580645,0.478723,1.0,0.886792,0.666667,0.585366,0.52381,0.615385,0.683333,0.640625,0.66,0.66,0.619048,0.609375,0.333333,0.333333,0.463415,0.5,0.411494,0.468144,0.222222,0.173913,0.756757,0.583333,0.304878,0.217391,0.176471,0.123288,0.5,0.352941,0.265306,0.268293,0.434783,0.518519,0.54386,0.54386,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.815789,0.726562,0.636364,0.594595,0.06,0.057692,0.6,0.6,0.6,0.678571,0.846154,0.846154,1.0,0.902655,0.485981,0.53125,0.36,0.4,0.409091,0.48,0.684211,0.448276,0.125,0.161616,0.6,0.6,0.307692,0.307692,0.658065,0.542553,0.76087,0.679245,0.861111,0.756098,0.952381,0.961538,0.366667,0.34375,0.66,0.66,0.380952,0.375,0.5,0.5,1.0,0.9125,1.0,1.0,0.388889,0.304348,0.378378,0.291667,0.243902,0.173913,0.058824,0.041096,0.25,0.176471,0.020408,0.04878,0.434783,0.518519,0.403509,0.403509,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0.473684,0.421875,0.454545,0.432432,0.6,0.576923,1.0,1.0,1.0,1.0,0.153846,0.153846,0.096774,0.159292,0.247664,0.31392,0.32,0.3625,0.409091,0.48,0.421053,0.275862,0.833333,0.676768,0.2,0.2,0.769231,0.769231,0.658065,0.542553,0.478261,0.433962,0.75,0.658537,0.380952,0.5,0.366667,0.34375,0.8,0.8,0.47619,0.46875,0.583333,0.583333,0.463415,0.5,0.549425,0.592798,0.555556,0.434783,0.378378,0.291667,0.609756,0.434783,0.264706,0.184932,0.5,0.352941,0.510204,0.487805,0.521739,0.592593,0.491228,0.491228,0.2,0.066667,0.333333,0.111111,1.0,0.333333,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0.394737,0.351562,0.363636,0.351351,0.74,0.711538,0.8,0.8,0.733333,0.785714,0.307692,0.307692,0.064516,0.132743,0.275701,0.339489,0.2,0.25,0.863636,0.88,1.0,0.655172,0.75,0.616162,0.2,0.2,0.692308,0.692308,0.387097,0.319149,0.456522,0.415094,0.305556,0.268293,0.571429,0.653846,1.0,0.9375,1.0,1.0,0.571429,0.5625,0.25,0.25,0.512195,0.5375,0.308046,0.374654,0.777778,0.608696,0.486486,0.375,0.487805,0.347826,0.441176,0.308219,0.75,0.529412,0.673469,0.634146,0.217391,0.333333,0.54386,0.54386,0.066667,0.2,0.111111,0.333333,0.333333,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [61]:
X = final_data_norm
y = result['result']

In [62]:
y

0       1
1       0
2       2
3       0
4       1
       ..
1243    2
1244    0
1245    2
1246    1
1247    0
Name: result, Length: 1248, dtype: int64

In [63]:
# We dont need to encode. It's already encoded!!

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
result['encoded_result'] = le.fit_transform(result['result'])
result['encoded_result']

0       1
1       0
2       2
3       0
4       1
       ..
1243    2
1244    0
1245    2
1246    1
1247    0
Name: encoded_result, Length: 1248, dtype: int64

In [64]:
final_final = pd.concat([final_data_norm, y], axis=1)

In [65]:
final_final.head()

Unnamed: 0,home_avrg_Home_Poss,last_3_home_avrg_Home_Poss,home_avrg_Home_PA,last_3_home_avrg_Home_PA,home_avrg_Home_ShoT,last_3_home_avrg_Home_ShoT,home_avrg_Home_Saves,last_3_home_avrg_Home_Saves,home_avrg_HomeFouls,last_3_home_avrg_HomeFouls,home_avrg_HomeCorners,last_3_home_avrg_HomeCorners,home_avrg_HomeCrosses,last_3_home_avrg_HomeCrosses,home_avrg_HomeTouches,last_3_home_avrg_HomeTouches,home_avrg_HomeTackles,last_3_home_avrg_HomeTackles,home_avrg_HomeInterceptions,last_3_home_avrg_HomeInterceptions,home_avrg_HomeAerials,last_3_home_avrg_HomeAerials,home_avrg_HomeClearances,last_3_home_avrg_HomeClearances,home_avrg_HomeOffsides,last_3_home_avrg_HomeOffsides,home_avrg_HomeGoalKicks,last_3_home_avrg_HomeGoalKicks,home_avrg_HomeThrowIns,last_3_home_avrg_HomeThrowIns,home_avrg_HomeLongBalls,last_3_home_avrg_HomeLongBalls,away_avrg_Away_Poss,last_3_away_avrg_Away_Poss,away_avrg_Away_PA,last_3_away_avrg_Away_PA,away_avrg_Away_ShoT,last_3_away_avrg_Away_ShoT,away_avrg_Away_Saves,last_3_away_avrg_Away_Saves,away_avrg_AwayFouls,last_3_away_avrg_AwayFouls,away_avrg_AwayCorners,last_3_away_avrg_AwayCorners,away_avrg_AwayCrosses,last_3_away_avrg_AwayCrosses,away_avrg_AwayTouches,last_3_away_avrg_AwayTouches,away_avrg_AwayTackles,last_3_away_avrg_AwayTackles,away_avrg_AwayInterceptions,last_3_away_avrg_AwayInterceptions,away_avrg_AwayAerials,last_3_away_avrg_AwayAerials,away_avrg_AwayClearances,last_3_away_avrg_AwayClearances,away_avrg_AwayOffsides,last_3_away_avrg_AwayOffsides,away_avrg_AwayGoalKicks,last_3_away_avrg_AwayGoalKicks,away_avrg_AwayThrowIns,last_3_away_avrg_AwayThrowIns,away_avrg_AwayLongBalls,last_3_away_avrg_AwayLongBalls,home_pnts_lst_5,away_pnts_lst_5,home_pnts_lst_3,away_pnts_lst_3,home_pnts_lst_game,away_pnts_lst_game,América (MG)_HT,Atl Goianiense_HT,Atl Paranaense_HT,Atlético Mineiro_HT,Avaí_HT,Bahia_HT,Botafogo (RJ)_HT,Bragantino_HT,CSA_HT,Ceará_HT,Chapecoense_HT,Corinthians_HT,Coritiba_HT,Cruzeiro_HT,Flamengo_HT,Fluminense_HT,Fortaleza_HT,Goiás_HT,Grêmio_HT,Internacional_HT,Palmeiras_HT,Paraná_HT,Ponte Preta_HT,Santos_HT,Sport Recife_HT,São Paulo_HT,Vasco da Gama_HT,Vitória_HT,América (MG)_AT,Atl Goianiense_AT,Atl Paranaense_AT,Atlético Mineiro_AT,Avaí_AT,Bahia_AT,Botafogo (RJ)_AT,Bragantino_AT,CSA_AT,Ceará_AT,Chapecoense_AT,Corinthians_AT,Coritiba_AT,Cruzeiro_AT,Flamengo_AT,Fluminense_AT,Fortaleza_AT,Goiás_AT,Grêmio_AT,Internacional_AT,Palmeiras_AT,Paraná_AT,Ponte Preta_AT,Santos_AT,Sport Recife_AT,São Paulo_AT,Vasco da Gama_AT,Vitória_AT,result
0,0.815789,0.726562,0.409091,0.391892,1.0,0.961538,0.66,0.66,0.266667,0.410714,0.692308,0.692308,0.516129,0.504425,0.418224,0.46946,1.0,1.0,0.409091,0.48,0.684211,0.448276,0.041667,0.10101,0.6,0.6,0.692308,0.692308,0.774194,0.638298,0.630435,0.566038,0.305556,0.268293,0.190476,0.346154,0.433333,0.40625,0.66,0.66,0.904762,0.890625,0.416667,0.416667,0.219512,0.3125,0.23908,0.312327,0.277778,0.217391,0.810811,0.625,0.182927,0.130435,1.0,0.69863,1.0,0.705882,0.836735,0.780488,0.304348,0.407407,0.842105,0.842105,0.2,0.2,0.333333,0.333333,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0.473684,0.421875,0.772727,0.716216,0.2,0.192308,0.75,0.75,0.266667,0.410714,0.153846,0.153846,0.258065,0.292035,0.373832,0.428977,0.28,0.325,0.545455,0.6,0.578947,0.37931,0.416667,0.373737,0.0,0.0,0.230769,0.230769,0.580645,0.478723,1.0,0.886792,0.666667,0.585366,0.52381,0.615385,0.683333,0.640625,0.66,0.66,0.619048,0.609375,0.333333,0.333333,0.463415,0.5,0.411494,0.468144,0.222222,0.173913,0.756757,0.583333,0.304878,0.217391,0.176471,0.123288,0.5,0.352941,0.265306,0.268293,0.434783,0.518519,0.54386,0.54386,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.815789,0.726562,0.636364,0.594595,0.06,0.057692,0.6,0.6,0.6,0.678571,0.846154,0.846154,1.0,0.902655,0.485981,0.53125,0.36,0.4,0.409091,0.48,0.684211,0.448276,0.125,0.161616,0.6,0.6,0.307692,0.307692,0.658065,0.542553,0.76087,0.679245,0.861111,0.756098,0.952381,0.961538,0.366667,0.34375,0.66,0.66,0.380952,0.375,0.5,0.5,1.0,0.9125,1.0,1.0,0.388889,0.304348,0.378378,0.291667,0.243902,0.173913,0.058824,0.041096,0.25,0.176471,0.020408,0.04878,0.434783,0.518519,0.403509,0.403509,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2
3,0.473684,0.421875,0.454545,0.432432,0.6,0.576923,1.0,1.0,1.0,1.0,0.153846,0.153846,0.096774,0.159292,0.247664,0.31392,0.32,0.3625,0.409091,0.48,0.421053,0.275862,0.833333,0.676768,0.2,0.2,0.769231,0.769231,0.658065,0.542553,0.478261,0.433962,0.75,0.658537,0.380952,0.5,0.366667,0.34375,0.8,0.8,0.47619,0.46875,0.583333,0.583333,0.463415,0.5,0.549425,0.592798,0.555556,0.434783,0.378378,0.291667,0.609756,0.434783,0.264706,0.184932,0.5,0.352941,0.510204,0.487805,0.521739,0.592593,0.491228,0.491228,0.2,0.066667,0.333333,0.111111,1.0,0.333333,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0.394737,0.351562,0.363636,0.351351,0.74,0.711538,0.8,0.8,0.733333,0.785714,0.307692,0.307692,0.064516,0.132743,0.275701,0.339489,0.2,0.25,0.863636,0.88,1.0,0.655172,0.75,0.616162,0.2,0.2,0.692308,0.692308,0.387097,0.319149,0.456522,0.415094,0.305556,0.268293,0.571429,0.653846,1.0,0.9375,1.0,1.0,0.571429,0.5625,0.25,0.25,0.512195,0.5375,0.308046,0.374654,0.777778,0.608696,0.486486,0.375,0.487805,0.347826,0.441176,0.308219,0.75,0.529412,0.673469,0.634146,0.217391,0.333333,0.54386,0.54386,0.066667,0.2,0.111111,0.333333,0.333333,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1


In [66]:
final_final.to_csv('featured_final_data.csv')

# Running the model

In [138]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


X = final_data_norm

### LOGISTIC REGRESSION FOR THE OUTCOME


In [142]:
y_result = result['result']

X_train, X_test, y_result_train, y_result_test = train_test_split(X, y_result, test_size=0.30)

In [143]:
model_lr_outcome = LogisticRegression(C = 0.1)

model_lr_outcome.fit(X_train, y_result_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### LOGISTIC REGRESSION FOR THE NUMBER OF CORNERS

In [72]:
data_corners = result.copy()

In [73]:
data_corners['over_8.5'] = np.where(data_corners['number_of_corners'] > 8.5, 1, 0)
data_corners['over_9.5'] = np.where(data_corners['number_of_corners'] > 9.5, 1, 0)
data_corners['over_10.5'] = np.where(data_corners['number_of_corners'] > 10.5, 1, 0)
data_corners['over_11.5'] = np.where(data_corners['number_of_corners'] > 11.5, 1, 0)
data_corners['over_12.5'] = np.where(data_corners['number_of_corners'] > 12.5, 1, 0)
data_corners['over_13.5'] = np.where(data_corners['number_of_corners'] > 13.5, 1, 0)

y_cor_85 = data_corners['over_8.5']
y_cor_95 = data_corners['over_9.5']
y_cor_105 = data_corners['over_10.5']
y_cor_115 = data_corners['over_11.5']
y_cor_125 = data_corners['over_12.5']
y_cor_135 = data_corners['over_13.5']

# train and test data for each model
X_train, X_test, y_cor_85_train, y_cor_85_test = train_test_split(X, y_cor_85, test_size=0.30)
X_train, X_test, y_cor_95_train, y_cor_95_test = train_test_split(X, y_cor_95, test_size=0.30)
X_train, X_test, y_cor_105_train, y_cor_105_test = train_test_split(X, y_cor_105, test_size=0.30)
X_train, X_test, y_cor_115_train, y_cor_115_test = train_test_split(X, y_cor_115, test_size=0.30)
X_train, X_test, y_cor_125_train, y_cor_125_test = train_test_split(X, y_cor_125, test_size=0.30)
X_train, X_test, y_cor_135_train, y_cor_135_test = train_test_split(X, y_cor_135, test_size=0.30)

In [122]:
# instantiate and fit each model
model_lr_cor_85 = LogisticRegression(max_iter=500)
model_lr_cor_85.fit(X_train, y_cor_85_train)

model_lr_cor_95 = LogisticRegression(max_iter=500)
model_lr_cor_95.fit(X_train, y_cor_95_train)

model_lr_cor_105 = LogisticRegression(max_iter=500)
model_lr_cor_105.fit(X_train, y_cor_105_train)

model_lr_cor_115 = LogisticRegression(max_iter=500)
model_lr_cor_115.fit(X_train, y_cor_115_train)

model_lr_cor_125 = LogisticRegression(max_iter=500)
model_lr_cor_125.fit(X_train, y_cor_125_train)

model_lr_cor_135 = LogisticRegression(max_iter=500)
model_lr_cor_135.fit(X_train, y_cor_135_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### LOGISTIC REGRESSION FOR THE NUMBER OF GOALS

In [123]:
data_goals = result.copy()

In [125]:
data_goals['over_0.5'] = np.where(data_goals['number_of_goals'] > 0.5, 1, 0)
data_goals['over_1.5'] = np.where(data_goals['number_of_goals'] > 1.5, 1, 0)
data_goals['over_2.5'] = np.where(data_goals['number_of_goals'] > 2.5, 1, 0)
data_goals['over_3.5'] = np.where(data_goals['number_of_goals'] > 3.5, 1, 0)
data_goals['over_4.5'] = np.where(data_goals['number_of_goals'] > 4.5, 1, 0)

In [126]:
y_g_05 = data_goals['over_0.5']
y_g_15 = data_goals['over_1.5']
y_g_25 = data_goals['over_2.5']
y_g_35 = data_goals['over_3.5'] 
y_g_45 = data_goals['over_4.5'] 

In [127]:
# train and test data for each model
X_train, X_test, y_g_05_train, y_g_05_test = train_test_split(X, y_g_05, test_size=0.30)
X_train, X_test, y_g_15_train, y_g_15_test = train_test_split(X, y_g_15, test_size=0.30)
X_train, X_test, y_g_25_train, y_g_25_test = train_test_split(X, y_g_25, test_size=0.30)
X_train, X_test, y_g_35_train, y_g_35_test = train_test_split(X, y_g_35, test_size=0.30)
X_train, X_test, y_g_45_train, y_g_45_test = train_test_split(X, y_g_45, test_size=0.30)

In [129]:
# instantiate and fit each model
model_lr_g_05 = LogisticRegression(max_iter=500)
model_lr_g_05.fit(X_train, y_g_05_train)

model_lr_g_15 = LogisticRegression(max_iter=500)
model_lr_g_15.fit(X_train, y_g_15_train)

model_lr_g_25 = LogisticRegression(max_iter=500)
model_lr_g_25.fit(X_train, y_g_25_train)

model_lr_g_35 = LogisticRegression(max_iter=500)
model_lr_g_35.fit(X_train, y_g_35_train)

model_lr_g_45 = LogisticRegression(max_iter=500)
model_lr_g_45.fit(X_train, y_g_45_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

------------------------------------------------------------------------------------------------------------------------

### SAMPLE INPUT DATA

In [131]:
input_df = pd.read_csv('data_to_predict_SP.csv')

In [132]:
input_df.drop(columns=['Unnamed: 0'], inplace=True)

In [133]:
# PROBABILITY OF PARTICULAR OUTCOMES
model_lr_outcome.predict_proba(input_df)

array([[0.13847963, 0.61817519, 0.24334518]])

In [134]:
input_df

Unnamed: 0,home_avrg_Home_Poss,last_3_home_avrg_Home_Poss,home_avrg_Home_PA,last_3_home_avrg_Home_PA,home_avrg_Home_ShoT,last_3_home_avrg_Home_ShoT,home_avrg_Home_Saves,last_3_home_avrg_Home_Saves,home_avrg_HomeFouls,last_3_home_avrg_HomeFouls,home_avrg_HomeCorners,last_3_home_avrg_HomeCorners,home_avrg_HomeCrosses,last_3_home_avrg_HomeCrosses,home_avrg_HomeTouches,last_3_home_avrg_HomeTouches,home_avrg_HomeTackles,last_3_home_avrg_HomeTackles,home_avrg_HomeInterceptions,last_3_home_avrg_HomeInterceptions,home_avrg_HomeAerials,last_3_home_avrg_HomeAerials,home_avrg_HomeClearances,last_3_home_avrg_HomeClearances,home_avrg_HomeOffsides,last_3_home_avrg_HomeOffsides,home_avrg_HomeGoalKicks,last_3_home_avrg_HomeGoalKicks,home_avrg_HomeThrowIns,last_3_home_avrg_HomeThrowIns,home_avrg_HomeLongBalls,last_3_home_avrg_HomeLongBalls,away_avrg_Away_Poss,last_3_away_avrg_Away_Poss,away_avrg_Away_PA,last_3_away_avrg_Away_PA,away_avrg_Away_ShoT,last_3_away_avrg_Away_ShoT,away_avrg_Away_Saves,last_3_away_avrg_Away_Saves,away_avrg_AwayFouls,last_3_away_avrg_AwayFouls,away_avrg_AwayCorners,last_3_away_avrg_AwayCorners,away_avrg_AwayCrosses,last_3_away_avrg_AwayCrosses,away_avrg_AwayTouches,last_3_away_avrg_AwayTouches,away_avrg_AwayTackles,last_3_away_avrg_AwayTackles,away_avrg_AwayInterceptions,last_3_away_avrg_AwayInterceptions,away_avrg_AwayAerials,last_3_away_avrg_AwayAerials,away_avrg_AwayClearances,last_3_away_avrg_AwayClearances,away_avrg_AwayOffsides,last_3_away_avrg_AwayOffsides,away_avrg_AwayGoalKicks,last_3_away_avrg_AwayGoalKicks,away_avrg_AwayThrowIns,last_3_away_avrg_AwayThrowIns,away_avrg_AwayLongBalls,last_3_away_avrg_AwayLongBalls,home_pnts_lst_5,away_pnts_lst_5,home_pnts_lst_3,away_pnts_lst_3,home_pnts_lst_game,away_pnts_lst_game,América (MG)_HT,Atl Goianiense_HT,Atl Paranaense_HT,Atlético Mineiro_HT,Avaí_HT,Bahia_HT,Botafogo (RJ)_HT,Bragantino_HT,CSA_HT,Ceará_HT,Chapecoense_HT,Corinthians_HT,Coritiba_HT,Cruzeiro_HT,Flamengo_HT,Fluminense_HT,Fortaleza_HT,Goiás_HT,Grêmio_HT,Internacional_HT,Palmeiras_HT,Paraná_HT,Ponte Preta_HT,Santos_HT,Sport Recife_HT,São Paulo_HT,Vasco da Gama_HT,Vitória_HT,América (MG)_AT,Atl Goianiense_AT,Atl Paranaense_AT,Atlético Mineiro_AT,Avaí_AT,Bahia_AT,Botafogo (RJ)_AT,Bragantino_AT,CSA_AT,Ceará_AT,Chapecoense_AT,Corinthians_AT,Coritiba_AT,Cruzeiro_AT,Flamengo_AT,Fluminense_AT,Fortaleza_AT,Goiás_AT,Grêmio_AT,Internacional_AT,Palmeiras_AT,Paraná_AT,Ponte Preta_AT,Santos_AT,Sport Recife_AT,São Paulo_AT,Vasco da Gama_AT,Vitória_AT
0,0.640769,0.570312,0.709957,0.716216,0.449524,0.365385,0.73127,0.643333,0.475132,0.696429,0.385836,0.25641,0.34767,0.380531,0.476005,0.502841,0.330794,0.25,0.199134,0.16,0.678363,0.37931,0.416005,0.191919,0.339683,0.4,0.362637,0.384615,0.581874,0.5,0.397861,0.358491,0.508961,0.544715,0.47619,0.730769,0.533871,0.494792,0.649355,0.606667,0.620584,0.75,0.387097,0.333333,0.357986,0.2875,0.405043,0.615651,0.553763,0.405797,0.545772,0.361111,0.63336,0.434783,0.604364,0.226027,0.491935,0.529412,0.399605,0.292683,0.424965,0.481481,0.552915,0.526316,0.866667,0.266667,0.777778,0.333333,1.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [135]:
# PROBABILITY OF PARTICULAR NUMBERS OF CORNERS
print(model_lr_cor_85.predict_proba(input_df))
print(model_lr_cor_95.predict_proba(input_df))
print(model_lr_cor_105.predict_proba(input_df))
print(model_lr_cor_115.predict_proba(input_df))
print(model_lr_cor_125.predict_proba(input_df))
print(model_lr_cor_135.predict_proba(input_df))

[[0.20859462 0.79140538]]
[[0.6370642 0.3629358]]
[[0.45348121 0.54651879]]
[[0.80451378 0.19548622]]
[[0.83750887 0.16249113]]
[[0.94999212 0.05000788]]


In [136]:
# PROBABILITY OF PARTICULAR NUMBERS OF GOALS
print(model_lr_g_05.predict_proba(input_df))
print(model_lr_g_15.predict_proba(input_df))
print(model_lr_g_25.predict_proba(input_df))
print(model_lr_g_35.predict_proba(input_df))
print(model_lr_g_45.predict_proba(input_df))

[[0.13612771 0.86387229]]
[[0.2830394 0.7169606]]
[[0.47371991 0.52628009]]
[[0.78751869 0.21248131]]
[[0.90083315 0.09916685]]


# Predict Function

In [None]:
import pandas as pd

In [None]:
final_data = pd.read_csv('featured_final_data.csv')

In [None]:
final_data.rename(columns={final_data.columns[0]: 'index'}, inplace=True)

In [None]:
final_data.tail()

In [None]:
data_pred = pd.DataFrame()

In [None]:
# 5 types of information that are going to be extracted using the function below.

# - Average Statistics for all the seasons
# - Average Statistics for the last 3 matches
# Sum of points scored by the team in the last 5 matches
# Sum of points scored by the team in the last 3 matches
# Sum of points scored by the team in the last match.

## Functions to predict previous matches stats

### Average for all seasons

In [None]:
def home_average_season_pred(data, home_team, away_team, variavel):
    
    i = data["index"].shape[0]
    
    media_home = []
    oi = data.loc[data["index"]<=i]
    oi = oi.loc[oi["HT"]==home_team]
    oi = oi[variavel].mean()
    media_home.append(oi)
    data_pred["home_avrg_"+variavel] = media_home
    #data_pred["home_avrg_"+variavel] = pd.DataFrame(media_home)
    
    
def away_average_season_pred(data, home_team, away_team, variavel):
    
    i = data["index"].shape[0]
    
    media_away = []
    
    oi = data.loc[data["index"]<=i]
    oi = oi.loc[oi["AT"]==away_team]
    oi = oi[variavel].mean()
    media_away.append(oi)

    #data_pred["away_avrg_"+variavel] = pd.DataFrame(media_away)
    data_pred["away_avrg_"+variavel] = media_away

### Features for the last 3 matches

In [None]:
def home_average_last_3_pred(data, home_team, away_team, variavel):

    i = data["index"].shape[0]
    media_home = []
    oi = data.loc[data["index"]<=i]
    oi = oi.loc[oi["HT"]==home_team]
    oi= oi.reset_index(drop=True)
    oi = oi[-3:]
    oi = oi[variavel].mean()
    media_home.append(oi)
     
    data_pred["last_3_home_avrg_"+variavel] = pd.DataFrame(media_home)
    
    
def away_average_last_3_pred(data, home_team, away_team, variavel):

    i = data["index"].shape[0]
    media_away = []
    
    oi = data.loc[data["index"]<=i]
    oi = oi.loc[oi["AT"]==away_team]
    oi= oi.reset_index(drop=True)
    oi = oi[-3:]
    oi = oi[variavel].mean()
    media_away.append(oi)

    data_pred["last_3_away_avrg_"+variavel] = pd.DataFrame(media_away)

### Last Results

In [None]:
def sequence_5_pred(data, home_team, away_team):
    '''
    Description: Picks the last 5 games and calculates how many points the team scored, victory = 3
    loss = 0, draw = 1.
    
    Input:
        - None
    Output: 
        - Sequence of the last 5 games points    
    '''
    i = data["index"].shape[0]
    sequences_home = []
    oi = data.loc[data["index"]<=i]
    oi = oi.loc[oi["HT"]==home_team]
    oi= oi.reset_index(drop=True)
    oi = oi[-5:]
    oi = oi['points_result_home'].rolling(5).sum()
    sequences_home.append(oi.values[-1:])
    #data_pred["home_pnts_lst_5"] = pd.Series(sequences_home)
    data_pred["home_pnts_lst_5"] = sequences_home[0]
    
    sequences_away = []
    oi = data.loc[data["index"]<=i]
    oi = oi.loc[oi["AT"]==away_team]
    oi= oi.reset_index(drop=True)
    oi = oi[-5:]
    oi = oi['points_result_away'].rolling(5).sum()
    sequences_away.append(oi.values[-1:])
    #data_pred["away_pnts_lst_5"] = pd.Series(sequences_away)
    data_pred["away_pnts_lst_5"] = sequences_away[0]

In [None]:
def sequence_3_pred(data, home_team, away_team):
    '''
    Description: Picks the last 3 games and calculates how many points the team scored, victory = 3
    loss = 0, draw = 1.
    
    Input:
        - None
    Output: 
        - Sequence of the last 3 games points    
    '''
    i = data["index"].shape[0]
    sequences_home = []
    oi = data.loc[data["index"]<=i]
    oi = oi.loc[oi["HT"]==home_team]
    oi= oi.reset_index(drop=True)
    oi = oi[-3:]
    oi = oi['points_result_home'].rolling(3).sum()
    sequences_home.append(oi.values[-1:])
    #data_pred["home_pnts_lst_3"] = pd.Series(sequences_home)
    data_pred["home_pnts_lst_3"] = sequences_home[0]
    
    sequences_away = []
    #for i, j in zip(data["Index"],data["AT"]):
    oi = data.loc[data["index"]<=i]
    oi = oi.loc[oi["AT"]==away_team]
    oi= oi.reset_index(drop=True)
    oi = oi[-3:]
    oi = oi['points_result_away'].rolling(3, min_periods=1).sum()
    sequences_away.append(oi.values[-1:])
    #data_pred["away_pnts_lst_3"] = pd.Series(sequences_away)
    data_pred["away_pnts_lst_3"] = sequences_away[0]

In [None]:
def sequence_1_pred(data, home_team, away_team):
    '''
    Description: Picks the last 3 games and calculates how many points the team scored, victory = 3
    loss = 0, draw = 1.
    
    Input:
        - None
    Output: 
        - Sequence of the last 3 games points    
    '''
    i = data["index"].shape[0]
    
    sequences_home = []
    
    oi = data.loc[data["index"]<=i]
    oi = oi.loc[oi["HT"]==home_team]
    oi= oi.reset_index(drop=True)
    oi = oi[-1:]
    oi = oi['points_result_home'].rolling(1).sum()
    sequences_home.append(oi.values[-1:])
    #data_pred["home_pnts_lst_game"] = pd.Series(sequences_home)
    data_pred["home_pnts_lst_game"] = sequences_home[0]
    
    sequences_away = []
    
    oi = data.loc[data["index"]<=i]
    oi = oi.loc[oi["AT"]==away_team]
    oi= oi.reset_index(drop=True)
    oi = oi[-3:]
    oi = oi['points_result_away'].rolling(1).sum()
    sequences_away.append(oi.values[-1:])
    #data_pred["away_pnts_lst_game"] = pd.Series(sequences_away)
    data_pred["away_pnts_lst_game"] = sequences_away[0]

## Run the functions to get data

In [None]:
home_team = 'Flamengo'
away_team = 'São Paulo'

In [None]:
# ONE SINGLE FUNCTION
# Can't run this yet!
# Have to fix the commented part.


home_features = ['Home_Poss', 'Home_PA', 'Home_ShoT', 'Home_Saves', 'HomeFouls', 'HomeCorners', 'HomeCrosses', 'HomeTouches', 
                 'HomeTackles', 'HomeInterceptions', 'HomeAerials', 'HomeClearances', 'HomeOffsides', 'HomeGoalKicks', 
                 'HomeThrowIns', 'HomeLongBalls'] 
away_features = ['Away_Poss', 'Away_PA', 'Away_ShoT', 'Away_Saves', 'AwayFouls', 'AwayCorners', 'AwayCrosses', 'AwayTouches', 
                 'AwayTackles', 'AwayInterceptions', 'AwayAerials', 'AwayClearances', 'AwayOffsides', 'AwayGoalKicks', 
                 'AwayThrowIns', 'AwayLongBalls']

def run_all(home_team, away_team):
    
    for i in home_features:
        home_average_season_pred(data, home_team, away_team, i)
        home_average_last_3_pred(data, home_team, away_team, i)
        
    for i in away_features:
        away_average_season_pred(data, home_team, away_team, i)
        away_average_last_3_pred(data, home_team, away_team, i)
        
    sequence_5_pred(data, home_team, away_team)
    sequence_3_pred(data, home_team, away_team)
    sequence_1_pred(data, home_team, away_team)
    
    # Have to define b and c
    #data_pred = pd.concat([data_pred, result[b], result[c]], axis=1)
    #data_pred.dropna(inplace=True)
    
    teams_encoded(data_pred, home_team, away_team)
    
    #Adjust this part to get info from final_final
    #cols = list(final_final.columns.values)
    
    data_pred = data_pred[cols[:-1]]
    
    return data_pred

In [None]:
home_features = ['Home_Poss', 'Home_PA', 'Home_ShoT', 'Home_Saves', 'HomeFouls', 'HomeCorners', 'HomeCrosses', 'HomeTouches', 
                 'HomeTackles', 'HomeInterceptions', 'HomeAerials', 'HomeClearances', 'HomeOffsides', 'HomeGoalKicks', 
                 'HomeThrowIns', 'HomeLongBalls'] 
away_features = ['Away_Poss', 'Away_PA', 'Away_ShoT', 'Away_Saves', 'AwayFouls', 'AwayCorners', 'AwayCrosses', 'AwayTouches', 
                 'AwayTackles', 'AwayInterceptions', 'AwayAerials', 'AwayClearances', 'AwayOffsides', 'AwayGoalKicks', 
                 'AwayThrowIns', 'AwayLongBalls']

for i in home_features:
    home_average_season_pred(data, home_team, away_team, i)
    home_average_last_3_pred(data, home_team, away_team, i)
    
for i in away_features:
    away_average_season_pred(data, home_team, away_team, i)
    away_average_last_3_pred(data, home_team, away_team, i)

In [None]:
sequence_5_pred(data, home_team, away_team)
sequence_3_pred(data, home_team, away_team)
sequence_1_pred(data, home_team, away_team)

In [None]:
data_pred.head()

In [None]:
data_pred.shape

In [None]:
test = pd.concat([data_pred, result[b], result[c]], axis=1)
test.dropna(inplace=True)

In [None]:
test.head()

In [None]:
test[['América (MG)_HT', 'Atl Goianiense_HT', 'Atl Paranaense_HT', 'Atlético Mineiro_HT', 'Avaí_HT', 'Bahia_HT', 
      'Botafogo (RJ)_HT', 'Bragantino_HT', 'CSA_HT', 'Ceará_HT', 'Chapecoense_HT', 'Corinthians_HT', 'Coritiba_HT',
      'Cruzeiro_HT', 'Flamengo_HT', 'Fluminense_HT', 'Fortaleza_HT', 'Goiás_HT', 'Grêmio_HT', 'Internacional_HT',
      'Palmeiras_HT', 'Paraná_HT', 'Ponte Preta_HT', 'Santos_HT', 'Sport Recife_HT', 'São Paulo_HT', 'Vasco da Gama_HT', 
      'Vitória_HT', 'América (MG)_AT', 'Atl Goianiense_AT', 'Atl Paranaense_AT', 'Atlético Mineiro_AT', 'Avaí_AT', 
      'Bahia_AT', 'Botafogo (RJ)_AT', 'Bragantino_AT', 'CSA_AT', 'Ceará_AT', 'Chapecoense_AT', 'Corinthians_AT', 'Coritiba_AT',
      'Cruzeiro_AT', 'Flamengo_AT', 'Fluminense_AT', 'Fortaleza_AT', 'Goiás_AT', 'Grêmio_AT', 'Internacional_AT', 'Palmeiras_AT',
      'Paraná_AT', 'Ponte Preta_AT', 'Santos_AT', 'Sport Recife_AT', 'São Paulo_AT', 'Vasco da Gama_AT', 'Vitória_AT']] = 0

In [None]:
def teams_encoded(data, home_team, away_team):

    a = f'{home_team}' + '_HT'
    b = f'{away_team}' + '_AT'  
    
    data[[a]] = 1
    data[[b]] = 1

In [None]:
teams_encoded(test, home_team, away_team)

In [None]:
test.head()

In [None]:
cols = list(final_final.columns.values)

In [None]:
# Re-order the test dataframe to be equal to the data

test = test[cols[:-1]]

In [None]:
test.head()

In [None]:
# How to make it work?

from sklearn.preprocessing import MinMaxScaler

norm = MinMaxScaler()
norm_test = norm.fit_transform(test[a])
x = pd.DataFrame(norm_test, columns=a)
test

In [None]:
test.to_csv('data_to_predict.csv')

-------------------------------------------------------------------------------------------

# Predict

In [None]:
knn_model.predict(test)

In [None]:
knn_model.predict_proba(test)

# Tests

In [None]:
data.loc[data["AT"]=='São Paulo']