In [4]:
import numpy as np # calculations with arrays
import pandas as pd # user-friendly DataFrames for data representation
import sklearn # machine learning algorithms
from sklearn import ensemble, linear_model
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt # import plot functions
# necessary to plot in jupyter notebook:
%matplotlib inline
import seaborn as sns # make plots beautiful

In [5]:
train = pd.read_csv('competition_zero/train2.csv')
test = pd.read_csv('competition_zero/test2.csv')
sample_submission = pd.read_csv('competition_zero/sample_submission.csv')

In [6]:
# print first row
train[:10]

Unnamed: 0,year,day,team1,team2,score1,score2,target
0,2998,19,317,131,336,278,True
1,2998,28,61,29,301,259,True
2,2998,28,110,141,359,267,True
3,2998,28,352,146,309,410,False
4,2998,28,229,91,332,220,True
5,2998,28,164,238,236,278,False
6,2998,28,184,243,181,224,False
7,2998,28,245,23,216,185,True
8,2998,28,300,349,402,321,True
9,2998,30,61,110,259,325,False


In [7]:
for c in train.columns:
    print c, train[c].unique()[:15]

year [2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012]
day [19 28 30 31 33 35 37 38 40 42 44 46 47 49 51]
team1 [317  61 110 352 229 164 184 245 300  10  39 205  47 180  53]
team2 [131  29 141 146  91 238 243  23 349 110 245 310  69  43 112]
score1 [336 301 359 309 332 236 181 216 402 259 294 220 282 317 406]
score2 [278 259 267 410 220 224 185 321 325 178 232 352 398 328 297]
target [True False]


In [8]:
# train size
train.shape[0]

101609

In [6]:
?ShuffleSplit()

Object `ShuffleSplit` not found.


In [7]:
from sklearn.cross_validation import ShuffleSplit

for itr, ite in ShuffleSplit(len(train), n_iter=1, train_size= 0.999, test_size=0.0001, random_state=0):
    pass

In [8]:
len(itr), len(ite)

(101507, 11)

In [9]:
features = []
for c in train.columns:
    if c in test.columns and c!='target':
        features += [c]
        print '"{}" is present in test and train'.format(c)
    else:
        print '"{}" is NOT present in test'.format(c)
        
features

"year" is present in test and train
"day" is NOT present in test
"team1" is present in test and train
"team2" is present in test and train
"score1" is NOT present in test
"score2" is NOT present in test
"target" is NOT present in test


['year', 'team1', 'team2']

In [10]:
xtrain = train.loc[itr, features]
ytrain = train.loc[itr, 'target']

xval = train.loc[ite, features]
yval = train.loc[ite, 'target']

In [11]:
xtrain[:3]

Unnamed: 0,year,team1,team2
3613,2998,243,363
90593,3017,261,361
3885,2998,117,363


In [12]:
ytrain[:3]

3613     False
90593     True
3885     False
Name: target, dtype: bool

### КОНСТАНТНОЕ ПРЕДСКАЗАНИЕ

In [13]:
ytrain.mean()

0.50101963411390349

In [14]:
constant_prediction = yval * 0 + train.target.mean()

In [15]:
constant_prediction = constant_prediction.values
constant_prediction


array([ 0.5009694,  0.5009694,  0.5009694,  0.5009694,  0.5009694,
        0.5009694,  0.5009694,  0.5009694,  0.5009694,  0.5009694,
        0.5009694])

In [16]:
log_loss(yval, constant_prediction)

0.69262029448261242

In [17]:
submission = sample_submission.copy()
submission.target = train['target'].mean() # notice here that we can refer to a column 'target' in two ways
submission.to_csv('constant_submission.csv', index=False)

### ЛИНЕЙНОЕ

In [18]:
alg = linear_model.LogisticRegression()
alg.fit(xtrain, ytrain)
liner_prediction = alg.predict_proba(xval)[:,1]
liner_prediction

array([ 0.50937813,  0.51276384,  0.49872675,  0.50393049,  0.51410606,
        0.4892855 ,  0.49054901,  0.51561512,  0.49418991,  0.49431008,
        0.49386847])

In [19]:
log_loss(yval, liner_prediction)

0.69327872113959554

In [20]:
test[:10]

Unnamed: 0,Id,year,team1,team2
0,0,3021,363,161
1,1,3021,286,2
2,2,3020,232,52
3,3,3020,84,11
4,4,3021,305,39
5,5,3020,159,152
6,6,3021,198,181
7,7,3021,353,221
8,8,3020,364,363
9,9,3020,113,105


In [21]:
tmp = pd.get_dummies(train.loc[:101608, 'team1'])
tmp1 = pd.get_dummies(train.loc[:101608, 'team2'])
tmp1.columns = tmp1.columns * -1
tmp1.columns

Int64Index([  -2,   -3,   -4,   -5,   -6,   -7,   -8,  -10,  -11,  -12,
            ...
            -355, -356, -357, -358, -359, -360, -361, -362, -363, -364],
           dtype='int64', length=353)

In [22]:
tmp = pd.concat([train, tmp],axis=1)
tmp = pd.concat([tmp, tmp1],axis=1)
tmp[:10]

Unnamed: 0,year,day,team1,team2,score1,score2,target,2,3,4,...,-355,-356,-357,-358,-359,-360,-361,-362,-363,-364
0,2998,19,317,131,336,278,True,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2998,28,61,29,301,259,True,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2998,28,110,141,359,267,True,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2998,28,352,146,309,410,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2998,28,229,91,332,220,True,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2998,28,164,238,236,278,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2998,28,184,243,181,224,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2998,28,245,23,216,185,True,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2998,28,300,349,402,321,True,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2998,30,61,110,259,325,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
features1 = []
for c in tmp.columns:
    if  c!='target' and c!='score1' and c!='score2' and c!='day' and c!='team1' and c!='team2':
        features1 += [c]

xtrain = tmp.loc[itr, features1]    
xtrain[:12]

Unnamed: 0,year,2,3,4,5,6,7,8,10,11,...,-355,-356,-357,-358,-359,-360,-361,-362,-363,-364
3613,2998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
90593,3017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3885,2998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
15446,3001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86943,3017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62986,3012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87397,3017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73455,3014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92243,3018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12070,3000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
alg = linear_model.LogisticRegression()
alg.fit(xtrain, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
xval = tmp.loc[ite, features1]    
xval[:12]

Unnamed: 0,year,2,3,4,5,6,7,8,10,11,...,-355,-356,-357,-358,-359,-360,-361,-362,-363,-364
37078,3006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101474,3019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29858,3005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61674,3012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1049,2998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20330,3002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24134,3003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
345,2998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63342,3012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16783,3002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
liner_prediction = alg.predict_proba(xval)[:,1]
liner_prediction

array([ 0.64440308,  0.33771205,  0.62706102,  0.82721741,  0.52073131,
        0.66979734,  0.59079784,  0.09494236,  0.43491479,  0.3701703 ,
        0.54582124])

In [27]:
log_loss(yval, liner_prediction)

0.60851074942226802

In [28]:
itmp = pd.get_dummies(test.loc[:125207, 'team1'])
itmp1 = pd.get_dummies(test.loc[:125207, 'team2'])
itmp1.columns = itmp1.columns * -1
itmp = pd.concat([test, itmp],axis=1)
itmp = pd.concat([itmp, itmp1],axis=1)

In [29]:
xtest = itmp.loc[:125206, features1]    
xtest[:7]

Unnamed: 0,year,2,3,4,5,6,7,8,10,11,...,-355,-356,-357,-358,-359,-360,-361,-362,-363,-364
0,3021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,3021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,3020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,3020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,3021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
5,3020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
6,3021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [30]:
xtest.fillna(0, inplace = True)


In [31]:
liner_prediction1 = alg.predict_proba(xtest)[:,1]

In [32]:
len(liner_prediction1)

125207

In [33]:
submission = sample_submission.copy()
submission.target =  liner_prediction1 # notice here that we can refer to a column 'target' in two ways
submission.to_csv('liner_submission.csv', index=False)

### forest

In [62]:
alg = ensemble.RandomForestClassifier(15, n_jobs=4)
alg.fit(xtrain, ytrain)
prediction = alg.predict_proba(xval)[:,1]

In [63]:
submission = sample_submission.copy()
submission.target = prediction[0] # notice here that we can refer to a column 'target' in two ways
submission.to_csv('forest_submission.csv', index=False)

### xgboost решение

In [65]:

import xgboost
param = {}
param['max_depth'] = 8
param['booster'] = 'gbtree'
param['objective'] = 'binary:logistic'
param['eval_metric'] = 'logloss'
param['eta'] = 0.1

numround = 100

Xdatatrain = xgboost.DMatrix(data = xtrain, label = ytrain)
Xdatatest = xgboost.DMatrix(data = xval, label = yval)

plst = list(param.items())
watchlist = [(Xdatatrain, 'train'), (Xdatatest, 'eval')]            

bst = xgboost.train(plst, Xdatatrain, numround, evals = watchlist, verbose_eval = 10)
# ypredxgb_tr = bst.predict(Xdatatrain)


ss = sample_submission.copy()

ss.target = bst.predict(xgboost.DMatrix(test[features]))
ss.to_csv('mighty_xgboost.csv', index=False)

[0]	train-logloss:0.689826	eval-logloss:0.691345
[10]	train-logloss:0.670759	eval-logloss:0.681402
[20]	train-logloss:0.658005	eval-logloss:0.67538
[30]	train-logloss:0.647507	eval-logloss:0.669858
[40]	train-logloss:0.638459	eval-logloss:0.66616
[50]	train-logloss:0.628218	eval-logloss:0.661736
[60]	train-logloss:0.619701	eval-logloss:0.658285
[70]	train-logloss:0.6126	eval-logloss:0.655751
[80]	train-logloss:0.604423	eval-logloss:0.652431
[90]	train-logloss:0.597331	eval-logloss:0.649311
