In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import log_loss

In [2]:
# Build a random train/test row mask.
def split_mask(dftouse, split_size = 0.7, random_state=None):
    """Return a boolean mask that randomly marks rows of ``dftouse`` as train.

    Parameters
    ----------
    dftouse : object with a ``shape`` attribute
        Only ``dftouse.shape[0]`` (the number of rows) is used.
    split_size : float or int, default 0.7
        Forwarded to ``train_test_split`` as ``train_size``.
    random_state : optional
        Forwarded to ``train_test_split`` so the split can be reproduced.
        The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    numpy.ndarray of bool, length ``dftouse.shape[0]``
        False for rows drawn into the test split, True for every other row.
    """
    n_rows = dftouse.shape[0]
    # np.arange instead of xrange: works on both Python 2 and Python 3.
    _, itest = train_test_split(np.arange(n_rows), train_size=split_size,
                                random_state=random_state)
    # Start from all-True and clear the test rows; this is what the original
    # ones -> set train to 1 -> set test to 0 -> (== 1) sequence produced.
    mask = np.ones(n_rows, dtype=bool)
    mask[itest] = False
    return mask

In [3]:
def cv_optimize(clf, parameters, X, y, n_jobs, n_folds, score_func, verbose):
    """Grid-search ``parameters`` for ``clf`` and return the best estimator.

    Parameters
    ----------
    clf : estimator handed to ``GridSearchCV``.
    parameters : passed to ``GridSearchCV`` as ``param_grid``.
    X, y : data the grid search is fitted on.
    n_jobs, verbose : forwarded unchanged to ``GridSearchCV``.
    n_folds : passed to ``GridSearchCV`` as ``cv``.
    score_func : passed to ``GridSearchCV`` as ``scoring``.

    Returns
    -------
    tuple
        ``(gs.best_estimator_, gs.best_params_)`` of the fitted search.
    """
    gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs,
                      verbose=verbose, scoring=score_func)
    gs.fit(X, y)
    # Single-argument print(): prints the same space-separated text as the old
    # Python 2 print statement, and is also valid syntax on Python 3.
    print("BEST %s %s %s" % (gs.best_params_, gs.best_score_, gs.grid_scores_))
    return gs.best_estimator_, gs.best_params_

def do_classify(clf, parameters, X, y, mask=None, score_func='f1', n_folds=5, n_jobs=1, verbose=False):
    """Tune ``clf`` on a training split and print log loss for train and test.

    Parameters
    ----------
    clf : estimator forwarded to ``cv_optimize``.
    parameters : parameter grid forwarded to ``cv_optimize``.
    X, y : features and targets; both are indexed with ``mask`` and ``~mask``.
    mask : boolean array or None, default None
        True selects training rows, False selects test rows. When None, a
        random split is drawn with ``split_mask(X)``.
    score_func, n_folds, n_jobs, verbose : forwarded to ``cv_optimize``.

    Returns
    -------
    tuple
        ``(clf, Xtrain, ytrain, Xtest, ytest, best_params_)`` where ``clf`` is
        the tuned estimator fitted on the training rows.
    """
    # Draw a split only when the caller did not provide one. The previous
    # `is not None` test was inverted: it discarded caller-supplied masks and
    # left the default None to fail at `X[mask]` / `~mask`.
    if mask is None:
        mask = split_mask(X)
    Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
    clf, best_params_ = cv_optimize(clf, parameters, Xtrain, ytrain, n_jobs=n_jobs,
                                    n_folds=n_folds, verbose=verbose, score_func=score_func)
    clf = clf.fit(Xtrain, ytrain)
    # NOTE(review): log_loss scores probabilities. This is fine when
    # clf.predict already returns values in [0, 1]; for a classifier,
    # predict_proba would be the appropriate input - confirm for the
    # estimators used with this function.
    train_loss = log_loss(ytrain, clf.predict(Xtrain))
    test_loss = log_loss(ytest, clf.predict(Xtest))
    # The reported metric is log loss (lower is better), so it is labelled as
    # such rather than as "Accuracy".
    print("############# based on standard predict ################")
    print("Log loss on training data: %0.4f" % (train_loss))
    print("Log loss on test data:     %0.4f" % (test_loss))
    print("########################################################")
    return clf, Xtrain, ytrain, Xtest, ytest, best_params_

In [4]:
%%time
seasons = pd.read_csv("Seasons.csv")
data_simple = pd.read_csv("RegularSeasonCompactResults.csv")
data_detailed = pd.read_csv("RegularSeasonDetailedResults.csv")
data_simple_T = pd.read_csv("TourneyCompactResults.csv")
data_detailed_T = pd.read_csv("TourneyDetailedResults.csv")
teams = pd.read_csv("Teams.csv")
seeds = pd.read_csv("TourneySeeds.csv")
slots = pd.read_csv("TourneySlots.csv")

Wall time: 434 ms


In [5]:
def parseSeed(x):
    """Return the integer part of a seed string.

    The first character of ``x`` is dropped and the remainder is parsed as an
    int. If that fails with ValueError, the last character is dropped as well
    and the parse is retried, e.g. ``"W01" -> 1`` and ``"X16a" -> 16``.
    A ValueError from the second attempt propagates to the caller.
    """
    without_prefix = x[1:]
    try:
        seed_number = int(without_prefix)
    except ValueError:
        # Remainder was not purely numeric: retry without its final character.
        seed_number = int(without_prefix[:-1])
    return seed_number

In [6]:
def helper(x, df):
    """Pair the team in row ``x`` with every team listed after it in ``df``.

    Parameters
    ----------
    x : row of ``df`` (for example ``df.iloc[i]``)
        Must expose ``Seed`` and ``Team``.
    df : pandas.DataFrame
        Must have ``Seed`` and ``Team`` columns, with ``x.Team`` occurring in
        ``df.Team`` exactly once. ``df`` is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per team positioned after ``x.Team``, with positional columns
        0..3 holding ``(x.Seed, x.Team, other Seed, other Team)``. The frame
        has no rows and no columns when ``x`` is the last row of ``df``.

    Raises
    ------
    ValueError
        If ``x.Team`` is missing from, or duplicated in, ``df.Team``. The
        previous sum-of-index-labels lookup silently produced a meaningless
        offset in both cases.
    """
    # Locate the row by position so the caller's index never needs resetting
    # (the old in-place reset_index mutated the caller's frame).
    positions = [pos for pos, team in enumerate(df.Team) if team == x.Team]
    if len(positions) != 1:
        raise ValueError("expected exactly one row for team %r, found %d"
                         % (x.Team, len(positions)))
    later = df.iloc[positions[0] + 1:]
    n_later = later.shape[0]
    pairs = zip([x.Seed] * n_later, [x.Team] * n_later, later.Seed, later.Team)
    # list(...) so the constructor receives a concrete sequence on Python 3,
    # where zip is lazy.
    return pd.DataFrame(list(pairs))

In [19]:
def matrisize_df(df):
    """Stack ``helper(row, df)`` for every row of ``df`` into one DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Passed to ``helper`` together with each of its rows (``df.iloc[i]``).

    Returns
    -------
    pandas.DataFrame
        The per-row frames concatenated in row order; each piece keeps its own
        index. An empty DataFrame is returned when ``df`` has no rows, where
        the previous version raised UnboundLocalError.
    """
    # Collect the pieces and concatenate once, instead of growing the result
    # inside the loop and using UnboundLocalError to detect the first pass.
    frames = [helper(df.iloc[i], df) for i in range(df.shape[0])]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [9]:
%%time
teams_2015 = seeds[seeds.Season == 2015]
teams_2015.loc[:,'Seed'] = teams_2015.Seed.map(parseSeed)

Wall time: 69 ms


In [20]:
# Concatenate helper's output for every row of teams_2015 into one frame.
df_final = matrisize_df(teams_2015)

In [22]:
# Preview the first rows; columns 0-3 are the positions of the
# (x.Seed, x.Team, Seed, Team) tuples that helper builds.
df_final.head()

Unnamed: 0,0,1,2,3
0,1,1437,2,1438
1,1,1437,3,1328
2,1,1437,4,1257
3,1,1437,5,1320
4,1,1437,6,1344


In [None]:
def write_preds(filename, year, teams, predictions):
    """Write predictions to ``filename`` as ``id,pred`` CSV lines.

    Parameters
    ----------
    filename : path of the file to create; an existing file is overwritten.
    year : value placed at the start of every id.
    teams : iterable of ``(t1, t2)`` pairs, one pair per prediction.
    predictions : iterable of values, paired with ``teams`` by ``zip``, so
        iteration stops at the shorter of the two.

    Each data line has the form ``<year>_<t1>_<t2>,<prediction>`` and follows
    an ``id,pred`` header line.
    """
    with open(filename, "w") as f:
        f.write("id,pred\n")
        # zip yields (pair, prediction) tuples, so the pair has to be unpacked
        # as a nested target; the former flat `t1,t2,p` target raised
        # ValueError on the first row.
        for (t1, t2), p in zip(teams, predictions):
            f.write(str(year) + "_" + str(t1) + "_" + str(t2) + "," + str(p) + "\n")