In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import log_loss
from collections import Counter

In [2]:
def parseSeed(x):
    """Return the numeric part of a seed code as an int.

    The first character of ``x`` is discarded and the remainder is parsed as
    an integer.  When that fails with ``ValueError`` the last character is
    discarded as well and the parse is retried (a second failure propagates).
    """
    tail = x[1:]
    try:
        seed = int(tail)
    except ValueError:
        # Remainder is not purely numeric: retry without its final character.
        seed = int(tail[:-1])
    return seed

In [5]:
def find_seed(season, team):
    """Look up the numeric seed of ``team`` in ``season``.

    Reads the module-level ``seeds`` DataFrame (columns ``Season``, ``Team``
    and ``Seed``), takes the first row matching both values and converts its
    ``Seed`` entry with ``parseSeed``.  Raises ``IndexError`` when no row
    matches.
    """
    is_match = (seeds.Season == season) & (seeds.Team == team)
    raw_seed = seeds[is_match].Seed.values[0]
    return parseSeed(raw_seed)

In [6]:
def helper(x, df):
    """Pair one team with every team listed after it in ``df``.

    Parameters
    ----------
    x : row of ``df`` (anything exposing ``.Seed`` and ``.Team``).
    df : DataFrame with ``Seed`` and ``Team`` columns.  It is only read; the
        caller's index is left untouched.

    Returns
    -------
    DataFrame with one row per later team and four positional columns:
    ``x.Seed``, ``x.Team``, the later team's ``Seed`` and its ``Team``.
    When ``x`` is the last row the result is an empty DataFrame.

    Raises
    ------
    ValueError
        If ``x.Team`` does not occur in ``df.Team``.
    """
    try:
        # Positional lookup of the first occurrence, so no index reset (and no
        # mutation of the caller's frame) is needed.
        position = df.Team.tolist().index(x.Team)
    except ValueError:
        raise ValueError("team %r is not present in df.Team" % (x.Team,))
    later = df.iloc[position + 1:]
    n_later = later.shape[0]
    pairings = zip([x.Seed] * n_later, [x.Team] * n_later, later.Seed, later.Team)
    # Materialise the rows: zip() is lazy on Python 3.
    return pd.DataFrame(list(pairings))

In [7]:
def matrisize_df(df):
    """Build every ordered pairing of a row of ``df`` with the rows after it.

    Calls ``helper`` once per row and stacks the results into one DataFrame.
    The per-row indices are kept as produced by ``helper`` (they are not
    renumbered).  An empty ``df`` yields an empty DataFrame.
    """
    frames = [helper(df.iloc[i], df) for i in range(df.shape[0])]
    if not frames:
        # pd.concat refuses an empty list; nothing to pair up anyway.
        return pd.DataFrame()
    # One concat at the end instead of growing the frame row by row.
    return pd.concat(frames)

In [8]:
#create train and test set
def split_mask(dftouse, split_size = 0.7, random_state=None):
    """Return a boolean mask over the rows of ``dftouse`` selecting a random training subset.

    Parameters
    ----------
    dftouse : object with a ``shape`` attribute; only ``shape[0]`` is used.
    split_size : passed to ``train_test_split`` as ``train_size``.
    random_state : forwarded to ``train_test_split``; pass a fixed value to
        get the same split on every run.  The default ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    numpy boolean array of length ``dftouse.shape[0]``; ``False`` marks the
    rows drawn for the test set, ``True`` everything else.
    """
    n_rows = dftouse.shape[0]
    # np.arange works on both Python 2 and 3 (xrange is Python-2-only).
    itrain, itest = train_test_split(np.arange(n_rows), train_size=split_size,
                                     random_state=random_state)
    mask = np.ones(n_rows, dtype=bool)
    mask[itest] = False
    return mask

In [9]:
def cv_optimize(clf, parameters, X, y, n_jobs, n_folds, score_func, verbose):
    """Grid-search ``clf`` over ``parameters`` with cross-validation.

    Fits a ``GridSearchCV`` (``cv=n_folds``, ``scoring=score_func``) on
    ``X``/``y``, prints the best parameters, the best score and the per-grid
    scores, and returns ``(best_estimator, best_params)``.
    """
    gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs,
                      verbose=verbose, scoring=score_func)
    gs.fit(X, y)
    # A single pre-formatted argument prints the same text whether `print` is
    # the Python 2 statement or the Python 3 function.
    print("BEST %s %s %s" % (gs.best_params_, gs.best_score_, gs.grid_scores_))
    return gs.best_estimator_, gs.best_params_

def do_classify(clf, parameters, X, y, mask=None, score_func='f1', n_folds=5, n_jobs=1, verbose=False):
    """Tune ``clf`` on a training split and report log loss on both splits.

    Parameters
    ----------
    clf, parameters : estimator and parameter grid handed to ``cv_optimize``.
    X, y : features and labels; both are indexed with the boolean ``mask``.
    mask : boolean array, ``True`` for training rows.  When ``None`` a random
        split is drawn with ``split_mask(X)``.
    score_func, n_folds, n_jobs, verbose : forwarded to ``cv_optimize``.

    Returns
    -------
    ``(clf, Xtrain, ytrain, Xtest, ytest, best_params_)`` where ``clf`` is the
    best estimator refit on the training split.

    The printed figures are log losses of ``predict_proba`` (lower is better),
    not accuracies.
    """
    if mask is None:
        mask = split_mask(X)
    Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
    clf, best_params_ = cv_optimize(clf, parameters, Xtrain, ytrain, n_jobs=n_jobs, n_folds=n_folds, verbose=verbose, score_func=score_func)
    clf = clf.fit(Xtrain, ytrain)
    training_log_loss = log_loss(ytrain, clf.predict_proba(Xtrain))
    test_log_loss = log_loss(ytest, clf.predict_proba(Xtest))
    print("############# based on standard predict ################")
    print("Log loss on training data: %0.4f" % (training_log_loss))
    print("Log loss on test data:     %0.4f" % (test_log_loss))
    print("########################################################")
    return clf, Xtrain, ytrain, Xtest, ytest, best_params_

In [10]:
# Directory prefix (relative to the notebook) prepended to every CSV file name read below.
direc = 'march-machine-learning-mania-2016-v2/'

In [11]:
%%time
# Read each competition CSV under `direc` into its own DataFrame.
# In the cells shown here only `data_simple_T` (Season / Wteam / Lteam) and
# `seeds` (Season / Team / Seed) are used afterwards.
seasons = pd.read_csv(direc+"Seasons.csv")
data_simple = pd.read_csv(direc+"RegularSeasonCompactResults.csv")
data_detailed = pd.read_csv(direc+"RegularSeasonDetailedResults.csv")
data_simple_T = pd.read_csv(direc+"TourneyCompactResults.csv")
data_detailed_T = pd.read_csv(direc+"TourneyDetailedResults.csv")
teams = pd.read_csv(direc+"Teams.csv")
seeds = pd.read_csv(direc+"TourneySeeds.csv")
slots = pd.read_csv(direc+"TourneySlots.csv")

CPU times: user 402 ms, sys: 80.7 ms, total: 483 ms
Wall time: 513 ms


In [12]:
%%time
games = []
results = []
for i, game in data_simple_T.iterrows():
    a, b = find_seed(game.Season, game.Wteam), find_seed(game.Season, game.Lteam)
    if a == b:
        print a, game.Season,
        pass
    elif a > b:
        games.append([b,a])
        results.append(0)
    else:
        games.append([a,b])
        results.append(1)
games = pd.DataFrame(games, columns=['HigherSeed', 'LowerSeed'])
results = np.array(results)

1 1985 1 1986 1 1987 1 1988 3 1989 1 1993 1 1993 2 1995 1 1996 1 1997 1 1999 1 1999 16 2001 16 2002 1 2002 16 2003 16 2004 16 2005 1 2005 16 2006 16 2007 1 2007 16 2008 1 2008 1 2008 1 2008 16 2009 16 2010 5 2010 12 2011 16 2011 16 2011 11 2011 14 2012 16 2012 12 2012 16 2012 2 2012 16 2013 11 2013 16 2013 13 2013 4 2013 16 2014 12 2014 16 2014 11 2014 16 2015 11 2015 11 2015 16 2015 1 2015 1 2015CPU times: user 4.21 s, sys: 18.3 ms, total: 4.23 s
Wall time: 4.25 s



In [13]:
# Number of games per (HigherSeed, LowerSeed) pairing, most frequent first;
# ties are ordered by LowerSeed, also descending.
counter = (
    games
    .groupby(['HigherSeed','LowerSeed'])
    .size()
    .reset_index(name='Occurences')
    .sort_values(['Occurences','LowerSeed'], ascending=False)
    .reset_index(drop=True)
)

In [14]:
# Extend every row of `counter` with the percentage of games in which the
# label in `results` is 1 for that seed pairing.
win_pct = []
for i, c in counter.iterrows():
    pairing = (games.HigherSeed ==  c.HigherSeed) & (games.LowerSeed ==  c.LowerSeed)
    # labels of the games with exactly this pairing
    outcomes = results[games[pairing].index]
    s = sum(outcomes)
    l = len(outcomes)
    lst = list(c.values) + [float(s)/l*100]
    win_pct.append(lst)
win_pct = pd.DataFrame(win_pct, columns=list(counter.columns)+['HigherSeed_WinPct'])

In [15]:
# Show the eight most frequent seed pairings (first eight rows of win_pct).
index = range(8)
win_pct.iloc[index]

Unnamed: 0,HigherSeed,LowerSeed,Occurences,HigherSeed_WinPct
0,1,16,124,100.0
1,2,15,124,94.354839
2,3,14,124,83.870968
3,4,13,124,79.83871
4,5,12,124,64.516129
5,6,11,124,65.322581
6,7,10,124,61.290323
7,8,9,124,50.806452


In [16]:
# Hand-picked rows of win_pct to display.  list() keeps the concatenation
# valid when range() does not return a list (Python 3).
index = list(range(8, 13))+[16,18,19]
win_pct.iloc[index]

Unnamed: 0,HigherSeed,LowerSeed,Occurences,HigherSeed_WinPct
8,2,7,73,72.60274
9,4,5,67,55.223881
10,3,6,66,57.575758
11,1,8,65,80.0
12,1,9,62,91.935484
16,2,10,44,59.090909
18,3,11,39,66.666667
19,4,12,33,63.636364


In [17]:
# Another hand-picked selection of win_pct rows to display.
index = [13,14,15,17,20,21,22,25]
win_pct.iloc[index]

Unnamed: 0,HigherSeed,LowerSeed,Occurences,HigherSeed_WinPct
13,1,2,59,55.932203
14,1,4,56,69.642857
15,2,3,47,61.702128
17,1,5,41,82.926829
20,2,6,31,77.419355
21,1,3,28,57.142857
22,1,12,19,100.0
25,2,11,14,92.857143


In [18]:
# Random forest classifier with library defaults.
# NOTE(review): no random_state is passed, so fitted models (and the scores
# printed below) are expected to differ between runs — confirm whether a
# fixed seed is wanted.
clfForest = RandomForestClassifier()
#clfForest = RandomForestRegressor()
# Empty grid: the search below evaluates a single candidate, the default
# configuration.
parameters = {
#    "n_estimators" : [100]
}

In [19]:
%%time
# Fit on a random split of (games, results) with 3-fold cross-validation scored
# by log loss; keep only the fitted model and the best parameters.
clfForest_, _, _, _, _, best_params_ = do_classify(clfForest, parameters, games, results, n_folds = 3, n_jobs = 1, 
                                                   verbose=True, score_func='log_loss')

Fitting 3 folds for each of 1 candidates, totalling 3 fits
BEST {} -0.810324222309 [mean: -0.81032, std: 0.23986, params: {}]
############# based on standard predict ################
Accuracy on training data: 0.5061
Accuracy on test data:     0.6098
########################################################
CPU times: user 208 ms, sys: 5.58 ms, total: 214 ms
Wall time: 215 ms


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s finished


In [20]:
# Predicted probability of label 1 (the smaller seed number wins) for every
# seed pairing t1 <= t2, as [t1, t2, probability] rows in the same order as
# before: t1 ascending, then t2 ascending.
preds = []
for t1 in range(1,17):
    # Identical seeds cannot be told apart by a seed-only model: coin flip.
    preds.append([t1, t1, .5])
    for t2 in range(t1+1, 17):
        preds.append([t1, t2, clfForest_.predict_proba(np.array([t1,t2]).reshape(1, -1))[0][1]])

In [21]:
# Display the predictions as a table; columns 0, 1, 2 are t1, t2 and the predicted probability.
pd.DataFrame(preds)

Unnamed: 0,0,1,2
0,1,1,0.500000
1,1,2,0.635772
2,1,3,0.564997
3,1,4,0.686913
4,1,5,0.846502
5,1,6,0.670000
6,1,7,0.843561
7,1,8,0.833229
8,1,9,0.975806
9,1,10,1.000000


In [22]:
%%time
teams_2015 = seeds[seeds.Season == 2015]
teams_2015.loc[:,'Seed'] = teams_2015.Seed.map(parseSeed)

CPU times: user 39 ms, sys: 2.22 ms, total: 41.3 ms
Wall time: 47.5 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [23]:
# Every pairing of a 2015 team with the teams listed after it in teams_2015
# (positional columns: seed, team, other seed, other team).
df_final = matrisize_df(teams_2015)

In [24]:
def write_preds(filename, year, teams, predictions):
    """Write predictions to ``filename`` as a two-column CSV.

    Parameters
    ----------
    filename : path of the file to create (overwritten if it exists).
    year : value used as the first component of every id.
    teams : iterable of ``(t1, t2)`` pairs, one per prediction.
    predictions : iterable of values, aligned with ``teams``.

    The file starts with the header ``id,pred`` followed by one
    ``<year>_<t1>_<t2>,<prediction>`` line per pair.  Iteration stops at the
    shorter of ``teams`` and ``predictions``.
    """
    with open(filename, "w") as f:
        f.write("id,pred\n")
        # zip() yields (pair, prediction); unpack the pair explicitly.
        for (t1, t2), p in zip(teams, predictions):
            f.write(str(year) + "_" + str(t1) + "_" + str(t2) + "," + str(p) + "\n")