In [1]:
# import basic packages and load the data

import numpy as np
import pandas as pd

# Tournament game results and per-season tournament seeds, read from the
# local data/ directory (paths are relative to the notebook's working directory).
tour_comp = pd.read_csv("data/TourneyCompactResults.csv")
seeds = pd.read_csv("data/TourneySeeds.csv")

In [2]:
# Preview the last rows of the tournament results table.
tour_comp.tail()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot
2045,2016,146,1314,88,1323,74,N,0
2046,2016,146,1393,68,1438,62,N,0
2047,2016,152,1314,83,1393,66,N,0
2048,2016,152,1437,95,1328,51,N,0
2049,2016,154,1437,77,1314,74,N,0


In [3]:
# Preview the last rows of the seeds table.
seeds.tail()

Unnamed: 0,Season,Seed,Team
2145,2017,Z12,1292
2146,2017,Z13,1457
2147,2017,Z14,1245
2148,2017,Z15,1297
2149,2017,Z16,1411


In [4]:
# The seeds table already includes the 2017 season, while the tournament results
# shown above end at 2016.
# Seed difference will be the model feature, so only the numeric part of each
# seed code is needed.
seeds.shape

(2150, 3)

In [5]:
# Reduce each seed code to its numeric part, e.g. "Z12" -> 12.
# Characters 1-2 of the code hold the seed number; the leading letter and
# anything after the third character are dropped.
# The whole column is assigned in one vectorized step: the previous element-wise
# chained assignment (seeds.Seed[i] = ...) raised SettingWithCopyWarning and
# depended on a hard-coded row count.
seeds["Seed"] = seeds["Seed"].str[1:3].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [6]:
# Check that the Seed column now holds plain numbers.
seeds.tail()

Unnamed: 0,Season,Seed,Team
2145,2017,12,1292
2146,2017,13,1457
2147,2017,14,1245
2148,2017,15,1297
2149,2017,16,1411


In [7]:
# 2. manipulate the tour results table so that the team with the smaller id is the first team
tour_comp.shape

(2050, 8)

In [8]:
# Winner/loser columns become neutral team1/team2 slots; the rows are reordered
# by team id in a later cell.
tour_comp = tour_comp.rename(
    columns={
        "Wteam": "team1",
        "Lteam": "team2",
    }
)

In [9]:
# add a column of game results
# team1 was renamed from Wteam, so at this point team1 is the winner of every
# game and every row starts with result = 1. Assigning a scalar broadcasts it to
# all rows, so no hard-coded row count is needed.
tour_comp["result"] = 1

In [10]:
# Put the smaller team id in team1. Wherever the ids are swapped, team1 is no
# longer the winner, so result flips to 0.
# Vectorized with a boolean mask: DataFrame.set_value is deprecated (removed in
# pandas 1.0) and the old loop hard-coded the number of rows.
needs_swap = tour_comp["team1"] > tour_comp["team2"]
# .values strips the column labels so the assignment is positional; otherwise
# pandas would align on column names and undo the swap.
tour_comp.loc[needs_swap, ["team1", "team2"]] = (
    tour_comp.loc[needs_swap, ["team2", "team1"]].values
)
tour_comp.loc[needs_swap, "result"] = 0

In [11]:
# Verify the reordering: team1 should hold the smaller team id in each row.
tour_comp.tail()

Unnamed: 0,Season,Daynum,team1,Wscore,team2,Lscore,Wloc,Numot,result
2045,2016,146,1314,88,1323,74,N,0,1
2046,2016,146,1393,68,1438,62,N,0,1
2047,2016,152,1314,83,1393,66,N,0,1
2048,2016,152,1328,95,1437,51,N,0,0
2049,2016,154,1314,77,1437,74,N,0,0


In [12]:
# 3. merge the two tables to get the seed information of both teams
# Keep only the columns needed for modelling, selected by label. Integer lists
# such as [0, 2, 4, 8] are treated as column labels by current pandas and raise
# KeyError on string-labelled columns; they also break silently if the column
# order changes. .copy() makes the result independent of tour_comp.
data = tour_comp[["Season", "team1", "team2", "result"]].copy()
data.tail()

Unnamed: 0,Season,team1,team2,result
2045,2016,1314,1323,1
2046,2016,1393,1438,1
2047,2016,1314,1393,1
2048,2016,1328,1437,0
2049,2016,1314,1437,0


In [13]:
# Attach team1's seed first, then team2's. Because both joins bring in columns
# named Seed and Team, pandas suffixes them: *_x belongs to team1, *_y to team2.
# Left joins keep every game row.
data = (
    data
    .merge(seeds, how="left", left_on=["Season", "team1"], right_on=["Season", "Team"])
    .merge(seeds, how="left", left_on=["Season", "team2"], right_on=["Season", "Team"])
)

In [14]:
# Inspect the merged frame: Seed_x/Team_x come from team1, Seed_y/Team_y from team2.
data.tail()

Unnamed: 0,Season,team1,team2,result,Seed_x,Team_x,Seed_y,Team_y
2045,2016,1314,1323,1,1,1314,6,1323
2046,2016,1393,1438,1,10,1393,1,1438
2047,2016,1314,1393,1,1,1314,10,1393
2048,2016,1328,1437,0,2,1328,2,1437
2049,2016,1314,1437,0,1,1314,2,1437


In [15]:
# get the seed_diff
# seed_diff = team1's seed minus team2's seed, computed for the whole column at
# once. This replaces a row-by-row loop built on DataFrame.set_value (deprecated,
# removed in pandas 1.0) and a hard-coded row count. astype(int) keeps the
# feature an integer column even if the seed columns are stored as objects.
data["seed_diff"] = (data["Seed_x"] - data["Seed_y"]).astype(int)

In [16]:
# Confirm seed_diff equals Seed_x - Seed_y on the last rows.
data.tail()

Unnamed: 0,Season,team1,team2,result,Seed_x,Team_x,Seed_y,Team_y,seed_diff
2045,2016,1314,1323,1,1,1314,6,1323,-5
2046,2016,1393,1438,1,10,1393,1,1438,9
2047,2016,1314,1393,1,1,1314,10,1393,-9
2048,2016,1328,1437,0,2,1328,2,1437,0
2049,2016,1314,1437,0,1,1314,2,1437,-1


In [17]:
# drop some columns
# Keep the identifiers, the label and the single feature, selected by name.
# Positional lists such as [0, 1, 2, 3, 8] are interpreted as column labels by
# current pandas and fail on string-labelled columns.
data0 = data[["Season", "team1", "team2", "result", "seed_diff"]]
data0.tail()

Unnamed: 0,Season,team1,team2,result,seed_diff
2045,2016,1314,1323,1,-5
2046,2016,1393,1438,1,9
2047,2016,1314,1393,1,-9
2048,2016,1328,1437,0,0
2049,2016,1314,1437,0,-1


In [18]:
# finally we are ready to make some prediction
# import packages
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.model_selection import cross_val_score

In [19]:
# Hold out a test set using train_test_split's default proportions;
# random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data0["seed_diff"],
    data0["result"],
    random_state=0,
)

In [20]:
# scikit-learn expects a 2-D feature matrix, so the single feature is reshaped
# into one column. Series.reshape is deprecated (and removed in later pandas),
# so convert to a NumPy array first; np.asarray also keeps this cell re-runnable
# once x_train / x_test are already arrays.
x_train = np.asarray(x_train).reshape(-1, 1)
x_test = np.asarray(x_test).reshape(-1, 1)

# Fit on the training split and report accuracy on the held-out split.
log = LogisticRegression().fit(x_train, y_train)
print(log.score(x_test, y_test))

0.717348927875


  if __name__ == '__main__':
  from ipykernel import kernelapp as app


In [21]:
# the accuracy seems to be good; also look at the confusion matrix,
# precision and recall on the held-out split
pred = log.predict(x_test)
confusion = confusion_matrix(y_test, pred)
print(confusion)
print(precision_score(y_test, pred))
print(recall_score(y_test, pred))

[[191  81]
 [ 64 177]]
0.686046511628
0.734439834025


In [22]:
# cross validation
# 5-fold cross-validation over the full data set. The feature is passed as an
# (n_samples, 1) array obtained by selecting the column with a list, because
# Series.reshape is deprecated and no longer available in later pandas.
logreg = LogisticRegression()
scores = cross_val_score(logreg, data0[["seed_diff"]].values, data0["result"], cv=5)
print(scores)

[ 0.7080292   0.76097561  0.67560976  0.72926829  0.68215159]


  app.launch_new_instance()


In [23]:
from sklearn.metrics import log_loss

# Refit a fresh logistic regression on the training split, then score its
# predicted class probabilities on the test split with log loss.
log = LogisticRegression()
log.fit(x_train, y_train)
pred_prob = log.predict_proba(x_test)
log_loss(y_test, pred_prob)

0.55587855992964441