In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [40]:
# Load the historical team-performance table and partition it by season:
# rows with YEAR before 2015 go to `df_pre2015`; `df` keeps 2015 and later.
df = pd.read_csv('Data/cbb.csv')
df_pre2015 = df.loc[df["YEAR"].lt(2015)]
df = df.loc[df["YEAR"].ge(2015)]
# Preview the pre-2015 partition.
df_pre2015.head()

Unnamed: 0,TEAM,CONF,G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,...,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB,POSTSEASON,SEED,YEAR
5,Kentucky,SEC,40,29,117.2,96.2,0.9062,49.9,46.0,18.1,...,36.8,50.0,44.9,33.2,32.2,65.9,3.9,2ND,8.0,2014
6,Michigan,B10,38,30,121.5,93.7,0.9522,54.6,48.0,14.6,...,22.7,53.4,47.6,37.9,32.6,64.8,6.2,2ND,4.0,2013
12,Connecticut,Amer,40,32,112.5,91.3,0.9171,51.5,44.6,17.3,...,35.7,48.1,42.2,38.7,33.0,64.8,4.7,Champions,7.0,2014
13,Louisville,BE,40,35,115.9,84.5,0.9743,50.6,44.8,18.3,...,34.9,50.8,43.4,33.3,31.8,67.1,9.0,Champions,1.0,2013
34,Arizona,P12,38,33,116.2,87.4,0.9636,51.7,42.3,15.7,...,34.2,50.7,40.2,36.4,32.0,64.3,9.4,E8,1.0,2014


In [41]:
# Target arrays holding the SEED of every team: `labels` for the 2015+ frame,
# `labels2` for the pre-2015 frame. Teams that were not seeded have NaN in
# SEED; nan_to_num replaces those NaNs with 0.
labels, labels2 = (
    np.nan_to_num(np.asarray(frame['SEED'])) for frame in (df, df_pre2015)
)

In [42]:
# Remove the target column (SEED) so the answer is hidden from the model, plus
# the string-valued columns, which RandomForest cannot consume without being
# encoded as numbers first.
# NOTE(review): 'Unnamed: 0' is not in this list, so it remains in the feature
# matrix -- confirm that is intended.
NON_FEATURE_COLUMNS = ['SEED', 'POSTSEASON', 'TEAM', 'CONF']
df = df.drop(columns=NON_FEATURE_COLUMNS)
df_pre2015 = df_pre2015.drop(columns=NON_FEATURE_COLUMNS)

# DataFrame.size is rows * columns (total number of cells), not the row count.
print(df.size)
print(df_pre2015.size)
# Only the last expression of a cell is rendered, so a bare `df.head()` placed
# before this line would be silently discarded; preview a single frame.
df_pre2015.head()

56500
13960


Unnamed: 0,G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,TORD,ORB,DRB,FTR,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB,YEAR
5,40,29,117.2,96.2,0.9062,49.9,46.0,18.1,16.1,42.0,29.7,51.8,36.8,50.0,44.9,33.2,32.2,65.9,3.9,2014
6,38,30,121.5,93.7,0.9522,54.6,48.0,14.6,18.7,32.5,29.4,28.4,22.7,53.4,47.6,37.9,32.6,64.8,6.2,2013
12,40,32,112.5,91.3,0.9171,51.5,44.6,17.3,19.6,30.4,32.8,38.3,35.7,48.1,42.2,38.7,33.0,64.8,4.7,2014
13,40,35,115.9,84.5,0.9743,50.6,44.8,18.3,27.0,38.2,33.3,40.0,34.9,50.8,43.4,33.3,31.8,67.1,9.0,2013
34,38,33,116.2,87.4,0.9636,51.7,42.3,15.7,19.1,36.4,27.3,41.0,34.2,50.7,40.2,36.4,32.0,64.3,9.4,2014


In [43]:
# Split the pre-2015 features and their seed labels into training and test
# sets, holding out 25% of the rows for testing. The fixed random_state keeps
# the split reproducible.
train_features, test_features, train_labels, test_labels = train_test_split(
    df_pre2015,
    labels2,
    test_size=0.25,
    random_state=42,
)

In [44]:
# Build a random forest of 1000 decision trees (seeded for reproducibility)
# and train it on the training split. fit() returns the estimator itself, so
# the fitted model can be bound to `rf` in one statement.
rf = RandomForestClassifier(n_estimators=1000, random_state=42).fit(
    train_features, train_labels
)

In [45]:
# Ask the fitted forest for a predicted seed for every row of the held-out
# test features.
predictions = rf.predict(X=test_features)

In [46]:
# Fraction of held-out teams whose predicted seed exactly matches the true one.
# NOTE(review): unseeded teams were encoded as seed 0 when the labels were
# built, so correct "not seeded" predictions count toward this score too.
print(
    "Accuracy:",
    metrics.accuracy_score(y_true=test_labels, y_pred=predictions),
)

Accuracy: 0.8342857142857143


In [48]:
# For illustration only: predict the seed of one row taken from df_pre2015,
# the same frame the training split was drawn from, so the model may already
# have seen it. In practice, predictions would be run on data the model has
# not seen before.
# iloc[[1]] (list indexer) selects the SECOND row (position 1) and keeps it as
# a one-row DataFrame, so the column names match those the forest was fitted on
# instead of being stripped by a conversion to a bare NumPy array.
rf.predict(df_pre2015.iloc[[1]])

array([4.])