In [71]:
import pandas as pd
import numpy as np
from datetime import datetime

In [90]:
def _running_rate(successes, opportunities):
    """Cumulative successes / cumulative opportunities per row; 0.0 while no opportunity has occurred yet."""
    successes_so_far = np.cumsum(successes)
    opportunities_so_far = np.cumsum(opportunities)
    rate = np.zeros(len(successes_so_far), dtype=float)
    # `where=` skips the rows whose denominator is still 0, leaving the preset 0.0 in place.
    np.divide(successes_so_far, opportunities_so_far, out=rate, where=opportunities_so_far > 0)
    return rate


def dataTransformer(original_dataset,features):
    
    """
    Build one chronological match table per team from a league results csv.

    Every match is seen from the side of the team being processed: 'Team' is
    that team, 'Opponent' is the other side and 'Home_Indicator' is 1 for a
    home match, 0 for an away match. Four running statistics are inserted
    right after 'Full_Results'; each one INCLUDES the match on its own row:

        Standing             cumulative points (win 3, loss 0, anything else 1)
        Winning_Probability  wins so far / matches so far
        Home_Win_Prob        home wins so far / home matches so far (0 before the first home match)
        Away_Win_Prob        away wins so far / away matches so far (0 before the first away match)

    param:
        @original_dataset: (string) name of the csv file; its first column is used as the index and it
                           must contain 'HomeTeam', 'AwayTeam', a 'Date' column in dd/mm/yy format and
                           every raw column named in @features
        @features: (list) the features need to be extracted from original dataset, in output order.
                   Besides raw csv columns it may name the derived 'Team', 'Opponent' and
                   'Home_Indicator'. It must include 'Date', 'Home_Indicator' and 'FTR'.
    return:
        (dict) team name -> DataFrame sorted by 'Date', with the short codes (FTR, HTHG, ...) renamed
        to their long names. Rows without a team name do not create an entry.
    raises:
        ValueError if @features lacks one of the required columns, or a date does not match dd/mm/yy
    """
    
    required = ('Date', 'Home_Indicator', 'FTR')
    missing = [name for name in required if name not in features]
    if missing:
        raise ValueError('features is missing required column(s): ' + ', '.join(missing))

    # DataFrame.from_csv no longer exists in pandas >= 1.0; read_csv with
    # index_col=0 keeps its "first column becomes the index" layout.
    data = pd.read_csv(original_dataset, index_col=0)
    # Parse the dates once for the whole table instead of once per team.
    data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%y')

    rename_dict = {'FTHG':'Full_Home_Goals','FTAG':'Full_Away_Goals','FTR':'Full_Results','HTHG':'Half_Home_Goals','HTAG':'Half_Away_Goals',
                   'HST':'Home_Shots_on_Target','AST':'Away_Shots_on_Target','AS':'Away_Shots','HS':'Home_Shots'}

    # Missing names are dropped so blank csv rows cannot produce a bogus team.
    all_names = pd.concat([data['HomeTeam'], data['AwayTeam']]).dropna()
    team_store = {}

    for team in sorted(set(all_names)):
        # Plain numpy masks: the csv index may hold repeated labels, so everything is done by position.
        at_home = np.asarray(data['HomeTeam'] == team, dtype=bool)
        involved = at_home | np.asarray(data['AwayTeam'] == team, dtype=bool)

        matches = data.loc[involved].copy()
        home_side = at_home[involved]
        matches['Team'] = np.where(home_side, matches['HomeTeam'], matches['AwayTeam'])
        matches['Opponent'] = np.where(home_side, matches['AwayTeam'], matches['HomeTeam'])
        matches['Home_Indicator'] = home_side.astype(int)

        team_data = matches[features].rename(columns=rename_dict)
        team_data = team_data.sort_values(by='Date')

        played_home = np.asarray(team_data['Home_Indicator'] == 1, dtype=bool)
        home_team_won = np.asarray(team_data['Full_Results'] == 'H', dtype=bool)
        away_team_won = np.asarray(team_data['Full_Results'] == 'A', dtype=bool)

        won_at_home = played_home & home_team_won
        won_away = ~played_home & away_team_won
        won = won_at_home | won_away
        lost = (played_home & away_team_won) | (~played_home & home_team_won)
        # Any result that is neither a win nor a loss is scored as a draw.
        points = np.where(won, 3, np.where(lost, 0, 1))

        every_match = np.ones(len(team_data), dtype=bool)
        running_columns = [
            ('Standing', np.cumsum(points)),
            ('Winning_Probability', _running_rate(won, every_match)),
            ('Home_Win_Prob', _running_rate(won_at_home, played_home)),
            ('Away_Win_Prob', _running_rate(won_away, ~played_home)),
        ]
        first_slot = team_data.columns.get_loc('Full_Results') + 1
        for offset, (label, values) in enumerate(running_columns):
            team_data.insert(first_slot + offset, label, values)

        team_store[team] = team_data
        
    return team_store
        

In [91]:
# Columns kept for each team, in output order; dataTransformer renames the
# short result/goal/shot codes to their long names.
features_lst = [
    'Date', 'Team', 'Opponent', 'Home_Indicator',
    'FTR',
    'HTHG', 'HTAG',
    'HST', 'AST', 'AS', 'HS',
]
# Transform the whole csv, then keep only Milan's table.
Milan = dataTransformer(
    original_dataset='Italy12.csv',
    features=features_lst,
)['Milan']

In [92]:
# Columns 1-3 of Milan (column 0 is skipped), re-indexed 0..n-1.
# With features_lst as defined earlier these are Team, Opponent and Home_Indicator.
a = Milan.iloc[:,1:4].reset_index(drop=True)
# Columns 4 onward for every row except the last, stacked underneath init_row and re-indexed,
# so each of these values lands one row lower than the match it came from.
# NOTE(review): init_row is not defined in any cell visible here — presumably a one-row frame of
# starting values; confirm it is created before this cell, otherwise Restart & Run All raises NameError.
b = pd.concat([init_row,Milan.iloc[:-1,4:]]).reset_index(drop=True)
# Join the two pieces side by side on their shared positional index.
Milan_new = pd.concat([a,b],axis=1)

In [93]:
# Display the assembled table (bare last expression -> rich DataFrame output).
Milan_new

Unnamed: 0,Team,Opponent,Home_Indicator,Away_Shots,Away_Shots_on_Target,Away_Win_Prob,Full_Away_Goals,Full_Home_Goals,Full_Results,Half_Away_Goals,Half_Home_Goals,Home_Shots,Home_Shots_on_Target,Home_Win_Prob,Standing,Winning_Probability
0,Milan,Sampdoria,1,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0
1,Milan,Bologna,0,10,3,0.0,,,A,0,0,23,6,0.0,0,0.0
2,Milan,Atalanta,1,17,6,1.0,,,A,1,1,15,3,0.0,3,0.5
3,Milan,Udinese,0,10,4,1.0,,,A,0,0,17,7,0.0,3,0.333333
4,Milan,Cagliari,1,14,4,0.5,,,H,0,1,15,8,0.0,3,0.25
5,Milan,Parma,0,19,2,0.5,,,H,0,1,17,3,0.333333,6,0.4
6,Milan,Inter,1,17,9,0.333333,,,D,0,0,11,2,0.333333,7,0.333333
7,Milan,Lazio,0,7,3,0.333333,,,A,1,0,20,5,0.25,7,0.285714
8,Milan,Genoa,1,12,6,0.25,,,H,0,2,11,8,0.25,7,0.25
9,Milan,Palermo,0,7,3,0.25,,,H,0,0,12,2,0.4,10,0.333333
