In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from collections import Counter, OrderedDict
import scipy.sparse as sp
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import brier_score_loss

In [2]:
conn_string = 'mysql://{user}:{password}@{host}:{port}/'.format(
    user='root', password='dwdstudent2015', 
    host = '34.225.180.235', port=3306)
engine = create_engine(conn_string, encoding='utf8')

engine.execute('USE ml_football')

<sqlalchemy.engine.result.ResultProxy at 0x7f0ac2803470>

In [113]:
# all seasons for England
df = pd.read_sql('select Date, HomeTeam, AwayTeam, FTR, FTHG, FTAG, HTHG, HTAG, HST, AST, `AS`, `HS` from England', con = engine)

year = []
month = []
results = []
for i in range(df.shape[0]):
    datetime = df['Date'][i].to_pydatetime()
    year.append(datetime.year)
    month.append(datetime.month)
    if df['FTR'][i] == 'D':
        results.append(0)
    elif df['FTR'][i] == 'A':
        results.append(1)
    else:
        results.append(2)
#     df['Date'][i].to_string()
df['Date'] = df['Date'].dt.strftime('%d/%m/%y')
df['results'] = results
df['year'] = year
df['month'] = month
df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,FTHG,FTAG,HTHG,HTAG,HST,AST,AS,HS,results,year,month
0,18/08/12,Arsenal,Sunderland,D,0.0,0.0,0.0,0.0,4.0,2.0,3.0,14.0,0,2012,8
1,18/08/12,Fulham,Norwich,H,5.0,0.0,2.0,0.0,9.0,2.0,4.0,11.0,2,2012,8
2,18/08/12,Newcastle,Tottenham,H,2.0,1.0,0.0,0.0,4.0,6.0,12.0,6.0,2,2012,8
3,18/08/12,QPR,Swansea,A,0.0,5.0,0.0,1.0,11.0,8.0,12.0,20.0,1,2012,8
4,18/08/12,Reading,Stoke,D,1.0,1.0,0.0,1.0,3.0,3.0,6.0,9.0,0,2012,8


In [114]:
# season 2012 for England
df1 = df.loc[df['year'] == 2012].loc[df['month'] > 5]
df2 = df.loc[df['year'] == 2013].loc[df['month'] < 6]
df_eng12 = pd.concat([df1,df2])
df_eng12.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,FTHG,FTAG,HTHG,HTAG,HST,AST,AS,HS,results,year,month
0,18/08/12,Arsenal,Sunderland,D,0.0,0.0,0.0,0.0,4.0,2.0,3.0,14.0,0,2012,8
1,18/08/12,Fulham,Norwich,H,5.0,0.0,2.0,0.0,9.0,2.0,4.0,11.0,2,2012,8
2,18/08/12,Newcastle,Tottenham,H,2.0,1.0,0.0,0.0,4.0,6.0,12.0,6.0,2,2012,8
3,18/08/12,QPR,Swansea,A,0.0,5.0,0.0,1.0,11.0,8.0,12.0,20.0,1,2012,8
4,18/08/12,Reading,Stoke,D,1.0,1.0,0.0,1.0,3.0,3.0,6.0,9.0,0,2012,8


In [133]:
def dataTransformer(data,features):
    
    """
    param:
        @original_dataset: (string) name of the csv file
        @features: (list) the features need to be extracted from original dataset
    """
    
#     data = pd.DataFrame.from_csv(original_dataset)
    team_lst = np.unique(data['HomeTeam'].tolist() + data['AwayTeam'].tolist())
    team_store = {team:0 for team in team_lst} 
    
    for team in team_lst:
        home_data = data[data['HomeTeam']==team]
        col_name = home_data.columns.tolist()  
        col_name.insert(col_name.index('AwayTeam')+1,'Home_Indicator')  
        home_data = home_data.reindex(columns=col_name)  
        home_data['Home_Indicator'] = 1
        home_data = home_data.rename(columns={'HomeTeam': 'Team', 'AwayTeam': 'Opponent'})
        
        away_data = data[data['AwayTeam']==team]
        col_name_2 = away_data.columns.tolist()  
        col_name_2.insert(col_name_2.index('AwayTeam')+1,'Home_Indicator')  
        away_data = away_data.reindex(columns=col_name_2)  
        away_data['Home_Indicator'] = 0
        away_data = away_data.rename(columns={'AwayTeam': 'Team', 'HomeTeam': 'Opponent'})
        
        concat_data = pd.concat([home_data, away_data])
        team_data = concat_data[features]
        
#         rename_dict = {'FTHG':'Full_Home_Goals','FTAG':'Full_Away_Goals','FTR':'Full_Results','HTHG':'Half_Home_Goals','HTAG':'Half_Away_Goals',
#                        'HST':'Home_Shots_on_Target','AST':'Away_Shots_on_Target','AS':'Away_Shots','HS':'Home_Shots'}
        rename_dict = {'FTR':'Full_Results'}
        team_data = team_data.rename(columns=rename_dict)
        
        team_data['Date'] = [datetime.strptime(x, '%d/%m/%y') for x in team_data['Date']]
        team_data = team_data.sort_values(by='Date')
        
        #compute the standing until the last game
        col_name_3 = team_data.columns.tolist()  
        col_name_3.insert(col_name_3.index('Full_Results')+1,'Standing')  
        team_data = team_data.reindex(columns=col_name_3)

        tmp = []      
        for pair in zip(team_data['Home_Indicator'], team_data['Full_Results']):
            if (pair[0]==0 and pair[1]=='A') or (pair[0]==1 and pair[1]=='H'):
                tmp.append(3)
            elif (pair[0]==1 and pair[1]=='A') or (pair[0]==0 and pair[1]=='H'):
                tmp.append(0)
            else:
                tmp.append(1)

        standing_lst = []
        for i in range(len(tmp)+2):
            if i > 1:
                standing_lst.append(np.sum(tmp[:i-1]))

        team_data['Standing'] = standing_lst
        
        #################################################################################
        # goals, shots, shots_on_target last 2-5 games
        for k in [5,4,3,2]:
            col_name_4 = team_data.columns.tolist()  
            col_name_4.insert(col_name_4.index('FTHG')+1,'FTHG_last'+str(k)+'_avg')
            col_name_4.insert(col_name_4.index('FTAG')+1,'FTAG_last'+str(k)+'_avg')
            col_name_4.insert(col_name_4.index('HTHG')+1,'HTHG_last'+str(k)+'_avg')
            col_name_4.insert(col_name_4.index('HTAG')+1,'HTAG_last'+str(k)+'_avg')
            col_name_4.insert(col_name_4.index('HST')+1,'HST_last'+str(k)+'_avg')
            col_name_4.insert(col_name_4.index('AST')+1,'AST_last'+str(k)+'_avg')
            team_data = team_data.reindex(columns=col_name_4)
            fthg = []
            ftag = []
            hthg = []
            htag = []
            hst = []
            ast = []
            for i in range(len(tmp)):
                if i - k + 1 < 0:
                    fthg.append(np.sum(team_data['FTHG'][0:i + 1])/ (i + 1))
                    ftag.append(np.sum(team_data['FTAG'][0:i + 1])/ (i + 1))
                    hthg.append(np.sum(team_data['HTHG'][0:i + 1])/ (i + 1))
                    htag.append(np.sum(team_data['HTAG'][0:i + 1])/ (i + 1))
                    hst.append(np.sum(team_data['HST'][0:i + 1])/ (i + 1))
                    ast.append(np.sum(team_data['AST'][0:i + 1])/ (i + 1))
                else:   
                    fthg.append(np.sum(team_data['FTHG'][i - k + 1:i + 1])/k)
                    ftag.append(np.sum(team_data['FTAG'][i - k + 1:i + 1])/k)
                    hthg.append(np.sum(team_data['HTHG'][i - k + 1:i + 1])/k)
                    htag.append(np.sum(team_data['HTAG'][i - k + 1:i + 1])/k)
                    hst.append(np.sum(team_data['HST'][i - k + 1:i + 1])/k)
                    ast.append(np.sum(team_data['AST'][i - k + 1:i + 1])/k)
            team_data['FTHG_last'+str(k)+'_avg']= fthg
            team_data['FTAG_last'+str(k)+'_avg']= ftag
            team_data['HTHG_last'+str(k)+'_avg']= hthg
            team_data['HTAG_last'+str(k)+'_avg']= htag
            team_data['HST_last'+str(k)+'_avg']= hst
            team_data['AST_last'+str(k)+'_avg']= ast
        #################################################################################
        
        #compute the winning probability
        col_name_4 = team_data.columns.tolist()  
        col_name_4.insert(col_name_4.index('Home_Indicator')+1,'Winning_Probability')
        team_data = team_data.reindex(columns=col_name_4)
        
        col_name_4 = team_data.columns.tolist()
        col_name_4.insert(col_name_4.index('Winning_Probability')+1,'Winning_Probability_last5')
        team_data = team_data.reindex(columns=col_name_4)
        
        tmp_1 = []
        for pair in zip(team_data['Home_Indicator'], team_data['Full_Results']):
            if (pair[0]==0 and pair[1]=='A') or (pair[0]==1 and pair[1]=='H'):  #win
                tmp_1.append(1)
            elif (pair[0]==1 and pair[1]=='A') or (pair[0]==0 and pair[1]=='H'): #lose
                tmp_1.append(0)
            else: #draw
                tmp_1.append(0)
        
        win_lst = []
        win_last5 = []
#         win_last5 = [0,0,0,0,0]
        for i in range(len(tmp_1)):
            win_lst.append(np.sum(tmp_1[:i + 1])/(i + 1))
        for i in range(len(tmp_1)):
            if i > 4:
                win_last5.append(np.sum(tmp_1[i-4:i + 1])/5)
            else:
                win_last5.append(np.sum(tmp_1[0 : i + 1])/ (i + 1))
        team_data['Winning_Probability']= win_lst
        team_data['Winning_Probability_last5'] = win_last5
        
        #################################################################################
        
        #compute the winning probability of being home/away team
        home_wprob_lst = []
        away_wprob_lst = []
        
        for i in range(len(team_data)):
            tmp_data = team_data[:i+1]
            tmp_df = tmp_data[tmp_data['Home_Indicator']==1]
            if len(tmp_df)==0:
                home_wprob_lst.append(0)
            else:
                home_wprob = (len(tmp_df[tmp_df['Full_Results']=='H']))/len(tmp_df)
                home_wprob_lst.append(home_wprob)

            tmp_df1 = tmp_data[tmp_data['Home_Indicator']==0]
            if len(tmp_df1)==0:
                away_wprob_lst.append(0)
            else:
                away_wprob = (len(tmp_df1[tmp_df1['Full_Results']=='A']))/len(tmp_df1)
                away_wprob_lst.append(away_wprob)
        
        col_name = team_data.columns.tolist() 
        col_name.insert(col_name.index('Winning_Probability_last5')+1,'Home_Win_Prob')
        team_data = team_data.reindex(columns=col_name)
        team_data['Home_Win_Prob'] = home_wprob_lst

        col_name = team_data.columns.tolist() 
        col_name.insert(col_name.index('Home_Win_Prob')+1,'Away_Win_Prob')
        team_data = team_data.reindex(columns=col_name)
        team_data['Away_Win_Prob'] = away_wprob_lst
        
        rename_dict = {'FTHG':'Full_Home_Goals','FTAG':'Full_Away_Goals','HTHG':'Half_Home_Goals','HTAG':'Half_Away_Goals',
                       'HST':'Home_Shots_on_Target','AST':'Away_Shots_on_Target','AS':'Away_Shots','HS':'Home_Shots'}
        team_data = team_data.rename(columns=rename_dict)
        
        team_store[team] = team_data
        
        
        
    return team_store

In [135]:
features_lst = ['Date','Team','Opponent', 'FTR', 'Home_Indicator','FTHG', 'FTAG', 'HTHG','HTAG','HST','AST','AS','HS']
dict_eng12 = dataTransformer(df_eng12, features_lst)
dict_eng12['Chelsea']

Unnamed: 0,Date,Team,Opponent,Full_Results,Standing,Home_Indicator,Winning_Probability,Winning_Probability_last5,Home_Win_Prob,Away_Win_Prob,...,HST_last3_avg,HST_last4_avg,HST_last5_avg,Away_Shots_on_Target,AST_last2_avg,AST_last3_avg,AST_last4_avg,AST_last5_avg,Away_Shots,Home_Shots
8,2012-08-19,Chelsea,Wigan,A,3,0,1.0,1.0,0.0,1.0,...,4.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,5.0,12.0
10,2012-08-22,Chelsea,Reading,H,6,1,1.0,1.0,1.0,1.0,...,7.5,7.5,7.5,5.0,4.0,4.0,4.0,4.0,7.0,23.0
12,2012-08-25,Chelsea,Newcastle,H,9,1,1.0,1.0,1.0,1.0,...,7.0,7.0,7.0,5.0,5.0,4.333333,4.333333,4.333333,11.0,11.0
34,2012-09-15,Chelsea,QPR,D,10,0,0.75,0.75,1.0,0.5,...,7.666667,6.75,6.75,9.0,7.0,6.333333,5.5,5.5,13.0,10.0
39,2012-09-22,Chelsea,Stoke,H,13,1,0.8,0.8,1.0,0.5,...,7.0,8.0,7.2,6.0,7.5,6.666667,6.25,5.6,13.0,17.0
49,2012-09-29,Chelsea,Arsenal,A,16,0,0.833333,0.8,1.0,0.666667,...,6.666667,6.5,7.4,5.0,5.5,6.666667,6.25,6.0,10.0,14.0
59,2012-10-06,Chelsea,Norwich,H,19,1,0.857143,0.8,1.0,0.666667,...,9.0,8.25,7.8,4.0,4.5,5.0,6.0,5.8,7.0,20.0
74,2012-10-20,Chelsea,Tottenham,A,22,0,0.875,0.8,1.0,0.75,...,11.333333,10.75,9.8,7.0,5.5,5.333333,5.5,6.2,11.0,23.0
85,2012-10-28,Chelsea,Man United,A,22,1,0.777778,0.8,0.8,0.75,...,13.333333,11.25,10.8,7.0,7.0,6.0,5.75,5.8,15.0,14.0
93,2012-11-03,Chelsea,Swansea,D,23,0,0.7,0.6,0.8,0.6,...,11.333333,11.75,10.4,7.0,7.0,7.0,6.25,6.0,11.0,9.0


In [186]:
df = pd.DataFrame()
i = 0
for k in dict_eng12.keys():
    i += 1
    tmp = dict_eng12[k]
    a = tmp.iloc[:, 1:4].reset_index(drop = True)
    init_row = pd.DataFrame(columns=tmp.columns[4:]).reset_index(drop=True)
    init_row.loc[0] = [0]*len(tmp.columns[4:])
    b = pd.concat([init_row,tmp.iloc[:-1,4:]]).reset_index(drop=True)
    new = pd.concat([a, b],axis = 1)
    df = pd.concat([df, new], axis = 0)    
df.reset_index(drop = True)
df.to_csv('eng12.csv')
# df.loc[df['Team'] == 'Arsenal']
df

Unnamed: 0,Team,Opponent,Full_Results,Standing,Home_Indicator,Winning_Probability,Winning_Probability_last5,Home_Win_Prob,Away_Win_Prob,Full_Home_Goals,...,HST_last3_avg,HST_last4_avg,HST_last5_avg,Away_Shots_on_Target,AST_last2_avg,AST_last3_avg,AST_last4_avg,AST_last5_avg,Away_Shots,Home_Shots
0,QPR,Swansea,A,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,QPR,Norwich,D,0,1,0,0,0,0,0,...,11,11,11,8,8,8,8,8,12,20
2,QPR,Man City,H,1,0,0,0,0,0,1,...,7.5,7.5,7.5,4,6,6,6,6,6,13
3,QPR,Chelsea,D,1,0,0,0,0,0,3,...,9,9,9,5,4.5,5.66667,5.66667,5.66667,9,19
4,QPR,Tottenham,H,2,1,0,0,0,0,0,...,7.33333,8.25,8.25,9,7,6,6.5,6.5,13,10
5,QPR,West Ham,A,2,0,0,0,0,0,2,...,10.6667,9,9.4,6,7.5,6.66667,6,6.4,9,18
6,QPR,West Brom,H,2,1,0,0,0,0,1,...,9.66667,10.25,9,10,8,8.33333,7.5,6.8,17,13
7,QPR,Everton,D,2,0,0,0,0,0,3,...,10.3333,9.25,9.8,6,8,7.33333,7.75,7.2,13,13
8,QPR,Arsenal,H,3,1,0,0,0,0,1,...,7.66667,9.25,8.6,4,5,6.66667,6.5,7,10,14
9,QPR,Reading,D,3,0,0,0,0,0,1,...,8.33333,8.5,9.6,3,3.5,4.33333,5.75,5.8,4,21


In [179]:
df = df.loc[df['Full_Results'] != 0]
indicator = df["Home_Indicator"].tolist()
result = df["Full_Results"].tolist()
# 0 draw 1 win 2 loss
y = []
for i in range(len(result)):
    if result[i] == 'D':
        y.append(0)
    elif indicator[i] == 1:
        if result[i] == 'H':
            y.append(1)
        else:
            y.append(2)
    elif indicator[i] == 0:
        if result[i] == 'A':
            y.append(1)
        else:
            y.append(2)
X = df.iloc[:, 3:].as_matrix()
c_max = X.max(axis = 0)
c_min = X.min(axis = 0)
X = (X - c_min) / (c_max - c_min)

X_train = X[:-20, :]
X_test = X[-20:, :]
y_train = y[:-20]
y_test = y[-20:]

In [213]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(solver='lbfgs',hidden_layer_sizes=(10,),activation='relu')
for k in range(10):
    mlp.fit(X_train,y_train)
    y_prob = mlp.predict_proba(X_test)
    y_pred = []
    for prob in y_prob:
        idx = list(prob).index(max(prob))
        y_pred.append(idx)
    print(y_pred)
    print(y_test)
    num_error = 0
    for i in range(len(y_pred)):
        if y_pred[i] != y_test[i]:
            num_error += 1
    print(num_error / len(y_pred))

[2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 0, 2, 1, 0, 0, 2, 2, 2]
[2, 0, 1, 1, 2, 0, 1, 2, 2, 1, 1, 2, 2, 2, 0, 2, 0, 2, 2, 2]
0.45
[2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 0, 1, 0, 1, 2, 2, 2, 1]
[2, 0, 1, 1, 2, 0, 1, 2, 2, 1, 1, 2, 2, 2, 0, 2, 0, 2, 2, 2]
0.6
[2, 1, 1, 1, 0, 2, 1, 1, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 1]
[2, 0, 1, 1, 2, 0, 1, 2, 2, 1, 1, 2, 2, 2, 0, 2, 0, 2, 2, 2]
0.5
[2, 0, 0, 0, 2, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2]
[2, 0, 1, 1, 2, 0, 1, 2, 2, 1, 1, 2, 2, 2, 0, 2, 0, 2, 2, 2]
0.55
[2, 0, 0, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2]
[2, 0, 1, 1, 2, 0, 1, 2, 2, 1, 1, 2, 2, 2, 0, 2, 0, 2, 2, 2]
0.35
[2, 1, 0, 2, 0, 2, 1, 1, 2, 2, 0, 2, 1, 2, 1, 0, 2, 2, 2, 1]
[2, 0, 1, 1, 2, 0, 1, 2, 2, 1, 1, 2, 2, 2, 0, 2, 0, 2, 2, 2]
0.65
[2, 1, 2, 1, 2, 2, 1, 1, 1, 0, 2, 2, 0, 1, 1, 0, 1, 2, 2, 2]
[2, 0, 1, 1, 2, 0, 1, 2, 2, 1, 1, 2, 2, 2, 0, 2, 0, 2, 2, 2]
0.6
[0, 1, 2, 0, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1]
[2, 0, 1, 1, 2, 0, 1, 2, 2, 1, 1, 2, 2, 2, 0, 2, 0, 2