In [0]:
import pandas as pd
import xgboost as xgb

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from IPython.display import display

In [2]:
data = pd.read_csv('http://football-data.co.uk/new/BRA.csv')

display(data.head())

Unnamed: 0,Country,League,Season,Date,Time,Home,Away,HG,AG,Res,PH,PD,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA
0,Brazil,Serie A,2012,19/05/2012,22:30,Palmeiras,Portuguesa,1.0,1.0,D,1.75,3.86,5.25,1.76,3.87,5.31,1.69,3.5,4.9
1,Brazil,Serie A,2012,19/05/2012,22:30,Sport Recife,Flamengo RJ,1.0,1.0,D,2.83,3.39,2.68,2.83,3.42,2.7,2.59,3.23,2.58
2,Brazil,Serie A,2012,20/05/2012,01:00,Figueirense,Nautico,2.0,1.0,H,1.6,4.04,6.72,1.67,4.05,7.22,1.59,3.67,5.64
3,Brazil,Serie A,2012,20/05/2012,20:00,Botafogo RJ,Sao Paulo,4.0,2.0,H,2.49,3.35,3.15,2.49,3.39,3.15,2.35,3.26,2.84
4,Brazil,Serie A,2012,20/05/2012,20:00,Corinthians,Fluminense,0.0,1.0,A,1.96,3.53,4.41,1.96,3.53,4.41,1.89,3.33,3.89


DATA EXPLORATION

In [10]:
n_matches = data.shape[0]

n_features = data.shape[1] - 1

n_homewins = len(data[data.Res == 'H'])

win_rate = (float(n_homewins) / (n_matches)) * 100

print("Total number of matches: {}".format(n_matches))
print("Number of features: {}".format(n_features))
print("Number of matches won by home team: {}".format(n_homewins))
print("Win rate of home team: {:.2f}%".format(win_rate))

Total number of matches: 2878
Number of features: 18
Number of matches won by home team: 1447
Win rate of home team: 50.28%


In [0]:
from pandas.plotting import scatter_matrix

scatter_matrix(data[['PH','PD','PA','MaxH','MaxD','MaxA','AvgH','AvgD','AvgA']], figsize=(10,10))

Preparing Data


In [0]:
x_all = data.drop(['Res'],1)
y_all = data['Res']

from sklearn.preprocessing import scale

cols = [['PH','PD','PA','MaxH','MaxD','MaxA','AvgH','AvgD','AvgA']]
for col in cols:
  x_all[col] = scale(x_all[col])

In [30]:
x_all.AvgH = x_all.AvgH.astype('str')
#x_all.HM2 = x_all.HM2.astype('str')
#x_all.HM3 = x_all.HM3.astype('str')
x_all.AvgA = x_all.AvgA.astype('str')
#x_all.AM2 = x_all.AM2.astype('str')
#x_all.AM3 = x_all.AM3.astype('str')

def preprocess_features(X):
    ''' Preprocesses the football data and converts catagorical variables into dummy variables. '''
    
    # Initialize new output DataFrame
    output = pd.DataFrame(index = X.index)

    # Investigate each feature column for the data
    for col, col_data in X.iteritems():

        # If data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix = col)
                    
        # Collect the revised columns
        output = output.join(col_data)
    
    return output

x_all = preprocess_features(x_all)
print("Processed feature columns ({} total features):\n{}".format(len(x_all.columns), list(x_all.columns)))

print("\nFeature values:")
display(X_all.head())

Processed feature columns (1916 total features):
['Country_Brazil', 'League_Serie A', 'Season', 'Date_01/05/2018', 'Date_01/05/2019', 'Date_01/06/2013', 'Date_01/06/2014', 'Date_01/06/2016', 'Date_01/06/2018', 'Date_01/06/2019', 'Date_01/07/2012', 'Date_01/07/2015', 'Date_01/07/2016', 'Date_01/07/2017', 'Date_01/08/2013', 'Date_01/08/2015', 'Date_01/08/2017', 'Date_01/09/2012', 'Date_01/09/2013', 'Date_01/09/2018', 'Date_01/09/2019', 'Date_01/10/2013', 'Date_01/10/2016', 'Date_01/10/2017', 'Date_01/10/2019', 'Date_01/11/2014', 'Date_01/11/2015', 'Date_01/12/2012', 'Date_01/12/2013', 'Date_01/12/2018', 'Date_02/05/2019', 'Date_02/06/2013', 'Date_02/06/2016', 'Date_02/06/2018', 'Date_02/06/2019', 'Date_02/07/2015', 'Date_02/07/2016', 'Date_02/07/2017', 'Date_02/08/2013', 'Date_02/08/2014', 'Date_02/08/2015', 'Date_02/08/2017', 'Date_02/09/2012', 'Date_02/09/2015', 'Date_02/09/2017', 'Date_02/09/2018', 'Date_02/10/2013', 'Date_02/10/2014', 'Date_02/10/2016', 'Date_02/10/2018', 'Date_02/11

Unnamed: 0,Country_Brazil,League_Serie A,Season,Date_01/05/2018,Date_01/05/2019,Date_01/06/2013,Date_01/06/2014,Date_01/06/2016,Date_01/06/2018,Date_01/06/2019,Date_01/07/2012,Date_01/07/2015,Date_01/07/2016,Date_01/07/2017,Date_01/08/2013,Date_01/08/2015,Date_01/08/2017,Date_01/09/2012,Date_01/09/2013,Date_01/09/2018,Date_01/09/2019,Date_01/10/2013,Date_01/10/2016,Date_01/10/2017,Date_01/10/2019,Date_01/11/2014,Date_01/11/2015,Date_01/12/2012,Date_01/12/2013,Date_01/12/2018,Date_02/05/2019,Date_02/06/2013,Date_02/06/2016,Date_02/06/2018,Date_02/06/2019,Date_02/07/2015,Date_02/07/2016,Date_02/07/2017,Date_02/08/2013,Date_02/08/2014,...,AvgA_2.9915338142874055,AvgA_3.0113649072781503,AvgA_3.046069320011954,AvgA_3.0857315059934436,AvgA_3.105562598984189,AvgA_3.110520372231875,AvgA_3.125393691974933,AvgA_3.1700136512041093,AvgA_3.1898447441948545,AvgA_3.199760290690227,AvgA_3.204718063937913,AvgA_3.2245491569286573,AvgA_3.2443802499194025,AvgA_3.31378907538701,AvgA_3.3980712205976755,AvgA_3.4228600868361063,AvgA_3.4278178600837923,AvgA_3.4476489530745376,AvgA_3.502184458799086,AvgA_3.5220155517898313,AvgA_3.5319310982852032,AvgA_3.606297697000497,AvgA_3.6410021097343006,AvgA_3.660833202725045,AvgA_3.665790975972731,AvgA_3.7351998014403383,AvgA_3.9335107313477873,AvgA_4.017792876558453,AvgA_4.057455062539943,AvgA_4.19131494022747,AvgA_4.285512631933508,AvgA_4.404499189877978,AvgA_4.419372509621036,AvgA_4.612725666280799,AvgA_4.78128995670213,AvgA_4.7961632764451885,AvgA_4.934980927380403,AvgA_5.143207403783224,AvgA_5.158080723526282,AvgA_5.584449222827298
0,1,1,2012,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,2012,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,2012,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,1,2012,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,1,2012,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [32]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_all,y_all,test_size = 50,random_state = 2,stratify = y_all)

ValueError: ignored

In [0]:
from time import time 


from sklearn.metrics import f1_score

def train_classifier(clf, x_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(x_train, y_train)
    end = time()
    
    # Print the results
    print("Trained model in {:.4f} seconds".format(end - start))

    
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    
    end = time()
    # Print and return results
    print("Made predictions in {:.4f} seconds.".format(end - start))
    
    return f1_score(target, y_pred, pos_label='H'), sum(target == y_pred) / float(len(y_pred))


def train_predict(clf, x_train, y_train, x_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    print("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(x_train)))
    
    # Train the classifier
    train_classifier(clf, x_train, y_train)
    
    # Print the results of prediction for both training and testing
    f1, acc = predict_labels(clf, x_train, y_train)
    print(f1, acc)
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))
    
    f1, acc = predict_labels(clf, x_test, y_test)
    print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))

In [40]:

clf_A = LogisticRegression(random_state = 42)
clf_B = SVC(random_state = 912, kernel='rbf')

clf_C = xgb.XGBClassifier(seed = 82)

train_predict(clf_A, x_train, y_train, x_test, y_test)
print('')
train_predict(clf_B, x_train, y_train, x_test, y_test)
print('')
train_predict(clf_C, x_train, y_train, x_test, y_test)
print('')

NameError: ignored

In [0]:
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer


parameters = { 'learning_rate' : [0.1],
               'n_estimators' : [40],
               'max_depth': [3],
               'min_child_weight': [3],
               'gamma':[0.4],
               'subsample' : [0.8],
               'colsample_bytree' : [0.8],
               'scale_pos_weight' : [1],
               'reg_alpha':[1e-5]
             }  

clf = xgb.XGBClassifier(seed=2)

f1_scorer = make_scorer(f1_score,pos_label='H')

grid_obj = GridSearchCV(clf,
                        scoring=f1_scorer,
                        param_grid=parameters,
                        cv=5)

grid_obj = grid_obj.fit(X_train,y_train)

clf = grid_obj.best_estimator_
print clf

f1, acc = predict_labels(clf, X_train, y_train)
print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))
    
f1, acc = predict_labels(clf, X_test, y_test)
print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))