In [114]:
import os
import pandas as pd
from sklearn import svm
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score

# Practice model on 2010-2016 Data

In [134]:
# Open question: the introduction of the First Four changed how rounds are
# numbered, so two lookup tables are kept until we decide how to reconcile them.
#
# Both tables are keyed by the *string* form of a numeric result code and give
# the human-readable round name. Neither table defines the code "6"
# (and the first one also has no "3").

# Numbering without a First Four round.
tournament_result_key = {
    "0": "Didn't qualify",
    "1": "Round of 64",
    "2": "Round of 32",
    "4": "Sweet 16",
    "5": "Elite 8",
    "7": "Final 4",
    "8": "Last 2",
    "9": "Champion",
}

# Numbering once the First Four exists: "1" becomes the First Four and the
# Round of 64 / Round of 32 shift up by one.
tournament_result_after_2010 = {
    "0": "Didn't qualify",
    "1": "First Four",
    "2": "Round of 64",
    "3": "Round of 32",
    "4": "Sweet 16",
    "5": "Elite 8",
    "7": "Final 4",
    "8": "Last 2",
    "9": "Champion",
}

In [135]:
# Location of the combined season-statistics CSV. The original absolute path is
# kept as the default so existing runs are unaffected; set the
# MARCH_MADNESS_DATA_CSV environment variable to run the notebook on another
# machine without editing this cell.
DATA_CSV_PATH = os.environ.get(
    "MARCH_MADNESS_DATA_CSV",
    "/Users/eddieloyd/DIS/BigData/final/Big-Data-March-Madness/data/2010-2017.csv",
)
df = pd.read_csv(DATA_CSV_PATH)

In [136]:
# Collect every school that has a row in any season from 2010-11 through
# 2016-17. The sorted result fixes a single school ordering that the per-season
# matrices are padded and sorted against.
#
# A set is used for accumulation so each membership check is O(1); the original
# list-based "not in" test was quadratic and was followed by a set() dedupe anyway.
unique_schools = set()

for start_year in range(10, 17):
    season_label = "20{}-{}".format(start_year, start_year + 1)
    # Rows whose "Season" text contains this label, e.g. "2010-11".
    in_season = df["Season"].str.contains(season_label)
    unique_schools.update(df.loc[in_season, "School"].to_numpy())

school_list = sorted(unique_schools)

In [137]:
# Dump the full school list to eyeball the sorted, de-duplicated ordering.
print(school_list)

['abilene-christian', 'air-force', 'akron', 'alabama', 'alabama-state', 'alcorn-state', 'american', 'appalachian-state', 'arizona', 'arizona-state', 'arkansas', 'arkansas-pine-bluff', 'arkansas-state', 'army', 'auburn', 'austin-peay', 'ball-state', 'baylor', 'belmont', 'bethune-cookman', 'binghamton', 'boise-state', 'boston-college', 'boston-university', 'bowling-green-state', 'bradley', 'brigham-young', 'brown', 'bryant', 'bucknell', 'buffalo', 'butler', 'cal-poly', 'cal-state-bakersfield', 'cal-state-fullerton', 'cal-state-northridge', 'california', 'campbell', 'canisius', 'central-arkansas', 'central-connecticut-state', 'central-florida', 'central-michigan', 'charleston-southern', 'charlotte', 'chattanooga', 'chicago-state', 'cincinnati', 'clemson', 'cleveland-state', 'coastal-carolina', 'colgate', 'college-of-charleston', 'colorado', 'colorado-state', 'columbia', 'connecticut', 'coppin-state', 'cornell', 'creighton', 'dartmouth', 'davidson', 'dayton', 'delaware', 'delaware-state', 

In [138]:
# Two-way lookup between a school name and its position in school_list, so
# schools can be referred to by row position instead of by name and positions
# can be turned back into names when checking winners.
school_indices = {name: position for position, name in enumerate(school_list)}
indices_school = dict(enumerate(school_list))

In [139]:
# Build one feature matrix and one target vector per training season
# (2010-11 through 2015-16). Every season is padded to contain exactly the
# schools in school_list and sorted by school name, so row k refers to the same
# school in every season's matrix.
practice_data = []
practice_targets = []
for i in range(10, 16):
    practice_season = "20{}-{}".format(i, i + 1)

    practice_year = df.loc[df["Season"].str.contains(practice_season)]
    # Schools that actually have a row this season (captured before fillna).
    present_schools = set(practice_year["School"].to_numpy())
    practice_year = practice_year.fillna(0)

    # Each school with no row this season gets an all-zero placeholder row.
    # The rows are gathered first and concatenated once, instead of calling
    # pd.concat once per missing school.
    padding_rows = []
    for school in school_list:
        if school not in present_schools:
            row = {k: school if k == "School" else 0 for k in practice_year.columns}
            row["Season"] = practice_season
            padding_rows.append(row)

    if padding_rows:
        padding = pd.DataFrame(padding_rows, columns=practice_year.columns)
        practice_year = pd.concat([practice_year, padding], axis=0)

    practice_year = practice_year.sort_values(by=["School"], axis=0)

    # Target vector: the tournament result code of each school, in sorted order.
    practice_targets.append(practice_year["Tournament-Result"].to_numpy())

    # Feature matrix: every remaining column after dropping identifiers and the label.
    practice_df = practice_year.drop(["School", "Season", "Conference", "Tournament-Result"], axis=1)
    practice_data.append(practice_df.to_numpy())
    

In [140]:
# GaussianNB is trained incrementally: each season's matrix is passed to
# partial_fit on its own, so the years do not have to be stacked into a single
# input first.
#
# Full set of result codes the classifier must know about. It is passed on
# every call so a season that happens to lack some code can still be fitted.
# Code 6 is not part of the encoding.
TOURNAMENT_CLASSES = [0, 1, 2, 3, 4, 5, 7, 8, 9]

clf = GaussianNB()
for season_features, season_targets in zip(practice_data, practice_targets):
    clf.partial_fit(season_features, season_targets, classes=TOURNAMENT_CLASSES)

In [141]:
# Inspect the class labels the fitted classifier knows about.
clf.classes_

array([0, 1, 2, 3, 4, 5, 7, 8, 9])

In [142]:
# Season held out for evaluation; the training loop above stops at 2015-16.
test_season = "2016-17"

In [143]:
# Build the hold-out feature matrix and target vector for test_season.
# NOTE: unlike the training seasons, this frame is neither padded with missing
# schools nor sorted by school name, so row k is in CSV order and is not
# guaranteed to correspond to school_list[k].
test_year = df.loc[df["Season"].str.contains(test_season)]
test_year = test_year.fillna(0)

# Features: every column except the identifiers and the label.
test_df = test_year.drop(["School", "Season", "Conference", "Tournament-Result"], axis=1)
test_array = test_df.to_numpy()

# Targets: the actual tournament result code of each test row.
test_target = test_year["Tournament-Result"].to_numpy()

In [144]:
# Predict the held-out season: one row of per-class probabilities per test row.
# Presumably column j corresponds to clf.classes_[j] — confirm against the
# scikit-learn predict_proba documentation.
predictions = np.array(clf.predict_proba(test_array))

In [145]:
# Maps a probability-column position (as a string) to the result code it
# stands for. Positions and codes diverge from position 6 onwards because the
# code 6 is not used.
_round_codes_in_column_order = [0, 1, 2, 3, 4, 5, 7, 8, 9]
predictions_to_round = {
    str(column): code for column, code in enumerate(_round_codes_in_column_order)
}

In [146]:
# Show the rounded class probabilities next to the school each row belongs to.
# The name is taken from the test frame itself: the test rows are not padded or
# sorted to match school_list, so looking names up by row position in
# indices_school can attach the wrong school to a prediction.
test_schools = test_year["School"].to_numpy()
for school, probabilities in zip(test_schools, predictions):
    print(np.round(probabilities), school)
    

[1. 0. 0. 0. 0. 0. 0. 0. 0.] abilene-christian
[1. 0. 0. 0. 0. 0. 0. 0. 0.] air-force
[0. 1. 0. 0. 0. 0. 0. 0. 0.] akron
[0. 1. 0. 0. 0. 0. 0. 0. 0.] alabama
[1. 0. 0. 0. 0. 0. 0. 0. 0.] alabama-state
[1. 0. 0. 0. 0. 0. 0. 0. 0.] alcorn-state
[1. 0. 0. 0. 0. 0. 0. 0. 0.] american
[1. 0. 0. 0. 0. 0. 0. 0. 0.] appalachian-state
[0. 0. 0. 0. 0. 1. 0. 0. 0.] arizona
[1. 0. 0. 0. 0. 0. 0. 0. 0.] arizona-state
[0. 0. 0. 0. 1. 0. 0. 0. 0.] arkansas
[0. 0. 1. 0. 0. 0. 0. 0. 0.] arkansas-pine-bluff
[1. 0. 0. 0. 0. 0. 0. 0. 0.] arkansas-state
[1. 0. 0. 0. 0. 0. 0. 0. 0.] army
[0. 1. 0. 0. 0. 0. 0. 0. 0.] auburn
[1. 0. 0. 0. 0. 0. 0. 0. 0.] austin-peay
[0. 1. 0. 0. 0. 0. 0. 0. 0.] ball-state
[0. 0. 0. 0. 1. 0. 0. 0. 0.] baylor
[1. 0. 0. 0. 0. 0. 0. 0. 0.] belmont
[1. 0. 0. 0. 0. 0. 0. 0. 0.] bethune-cookman
[1. 0. 0. 0. 0. 0. 0. 0. 0.] binghamton
[0. 0. 1. 0. 0. 0. 0. 0. 0.] boise-state
[1. 0. 0. 0. 0. 0. 0. 0. 0.] boston-college
[1. 0. 0. 0. 0. 0. 0. 0. 0.] boston-university
[1. 0. 0. 0. 0. 0. 0

In [147]:
# Hard class-label predictions for the same test rows.
predict = np.array(clf.predict(test_array))

In [150]:
# Turn each probability row into a single predicted result code (the column
# with the highest probability; the first one wins on ties) and print it as a
# round name next to the school.
#
# School names come from the test frame rather than indices_school: the test
# rows are not aligned to school_list, so a positional lookup can mislabel them.
test_schools = test_year["School"].to_numpy()
results = []
for school, probabilities in zip(test_schools, predictions):
    best_column = int(np.argmax(probabilities))
    ncaa_round = predictions_to_round[str(best_column)]
    results.append(ncaa_round)
    print(school, tournament_result_after_2010[str(ncaa_round)])
    

    


abilene-christian Didn't qualify
air-force Didn't qualify
akron First Four
alabama First Four
alabama-state Didn't qualify
alcorn-state Didn't qualify
american Didn't qualify
appalachian-state Didn't qualify
arizona Elite 8
arizona-state Didn't qualify
arkansas Sweet 16
arkansas-pine-bluff Round of 64
arkansas-state Didn't qualify
army Didn't qualify
auburn First Four
austin-peay Didn't qualify
ball-state First Four
baylor Sweet 16
belmont Didn't qualify
bethune-cookman Didn't qualify
binghamton Didn't qualify
boise-state Round of 64
boston-college Didn't qualify
boston-university Didn't qualify
bowling-green-state Didn't qualify
bradley Didn't qualify
brigham-young First Four
brown Didn't qualify
bryant Didn't qualify
bucknell First Four
buffalo First Four
butler Round of 32
cal-poly Didn't qualify
cal-state-bakersfield First Four
cal-state-fullerton Didn't qualify
cal-state-northridge Didn't qualify
california First Four
campbell Didn't qualify
canisius First Four
central-arkansas Di

In [151]:
# Fraction of test rows whose predicted result code equals the actual one.
# scikit-learn metrics take (y_true, y_pred) in that order; accuracy is
# symmetric so the value is unchanged, but the order matters as soon as an
# asymmetric metric such as precision_score is used alongside it.
accuracy = accuracy_score(test_target, results)
print(accuracy)


0.6775244299674267


Open questions:
>    1. Do we update data prior to 2010 to account for First-Four?
>    2. What model should we be using? (An SVC takes a single two-dimensional feature matrix, so we couldn't pass multiple years to it unless we combined all the data into one big input rather than keeping it separated by year, and then there might be issues with the inputs. Do we want to try a neural network?)
>    3. I'm pretty sure our data contains games from the NCAA tournament itself, so the data we would input isn't actually representative of the data available right before a tournament (the model can probably tell who makes it, and how far they go, based on how many games they've played). Is there a way we could change that?
>    4. How can we increase accuracy score?

In [131]:
# Test SVC while just appending all the data to two dimensions instead of three

In [None]:
#Training model on many years
#clf = svm.SVC(decision_function_shape='ovo', probability = True)
#clf.fit(practice_data, practice_targets)