In [23]:
import os
import pandas as pd
from sklearn import svm
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score
from collections import defaultdict

# Practice model on 2010-2016 Data

In [4]:
# Lookup tables: "Tournament-Result" code (as a string) -> name of the round a team reached.
# Open question from the author: how should the two codings be reconciled, given that the
# introduction of the First Four changed how results are coded in the data?

# Coding without a First Four entry. Codes "3" and "6" do not appear in this table.
tournament_result_key = {"0" : "Didn't qualify", "1": "Round of 64", "2": "Round of 32", "4": "Sweet 16", "5": "Elite 8", "7": "Final 4",
                        "8": "Last 2", "9": "Champion"}
# Coding with a First Four entry (code "1"); Round of 64 and Round of 32 move up to "2" and "3".
# Code "6" does not appear in this table.
tournament_result_after_2010 = {"0" : "Didn't qualify", "1": "First Four", "2": "Round of 64", "3": "Round of 32", "4": "Sweet 16", "5": "Elite 8", "7": "Final 4",
                        "8": "Last 2", "9": "Champion"}

In [28]:
# Directory holding the project's CSV files. The default is the original author's local
# checkout; set MARCH_MADNESS_DATA_DIR to run the notebook from another location.
DATA_DIR = os.environ.get(
    "MARCH_MADNESS_DATA_DIR",
    "/Users/eddieloyd/DIS/BigData/final/Big-Data-March-Madness/data",
)
# Team statistics by season (the cells below read the Season, School, Conference,
# Tournament-Result and G columns, among others).
df = pd.read_csv(os.path.join(DATA_DIR, "2010-2017.csv"))


In [4]:
# Obtaining list of schools in order to normalize vectors:
# every school that appears in any season from 2010-11 through 2016-17, sorted by name.
# A set is used for collection so each membership check is constant-time and the
# result is de-duplicated once.
# NOTE: practice_season / practice_year / practice_schools stay bound to the last
# season after the loop; the per-game scaling cell further down reads practice_year.
school_names = set()

for i in range(10, 17):
    practice_season = "20{}-{}".format(i, i + 1)
    # Rows whose Season label contains this season string, missing values zero-filled
    practice_year = df.loc[df["Season"].str.contains(practice_season)]
    practice_year = practice_year.fillna(0)

    # Get total list of schools to normalize data
    practice_schools = practice_year["School"].to_numpy()
    school_names.update(practice_schools)

school_list = sorted(school_names)

In [5]:
# Show the full, sorted list of school names.
print(school_list)

['abilene-christian', 'air-force', 'akron', 'alabama', 'alabama-state', 'alcorn-state', 'american', 'appalachian-state', 'arizona', 'arizona-state', 'arkansas', 'arkansas-pine-bluff', 'arkansas-state', 'army', 'auburn', 'austin-peay', 'ball-state', 'baylor', 'belmont', 'bethune-cookman', 'binghamton', 'boise-state', 'boston-college', 'boston-university', 'bowling-green-state', 'bradley', 'brigham-young', 'brown', 'bryant', 'bucknell', 'buffalo', 'butler', 'cal-poly', 'cal-state-bakersfield', 'cal-state-fullerton', 'cal-state-northridge', 'california', 'campbell', 'canisius', 'central-arkansas', 'central-connecticut-state', 'central-florida', 'central-michigan', 'charleston-southern', 'charlotte', 'chattanooga', 'chicago-state', 'cincinnati', 'clemson', 'cleveland-state', 'coastal-carolina', 'colgate', 'college-of-charleston', 'colorado', 'colorado-state', 'columbia', 'connecticut', 'coppin-state', 'cornell', 'creighton', 'dartmouth', 'davidson', 'dayton', 'delaware', 'delaware-state', 

In [6]:
# Two-way lookup between a school name and its position in school_list, so the
# models can work on unnamed rows and the winners can be named afterwards.
school_indices = {name: position for position, name in enumerate(school_list)}
indices_school = dict(enumerate(school_list))

In [10]:
# Convert season totals to per-game values so that extra (tournament) games do not
# inflate a team's numbers. Author's assumption: per-game stats do not change much
# during the tournament.
# NOTE(review): practice_year is whatever an earlier cell left bound, and the
# training-data cell re-derives practice_year from df -- confirm this scaling
# actually reaches the model inputs.
# NOTE(review): 'FT%' is already a ratio; dividing it by games looks unintended -- confirm.
per_game_columns = ['SRS','SOS', 'W-Conf', 'L-Conf', 'W-Home', 'L-Home', 'W-Away', 'L-Away',
                    'Pts-Tm.', 'Pts-Opp.', 'MP', 'FG', 'FGA','3P', '3PA',
                    'FT', 'FTA', 'FT%', 'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',]
# 'G' (games played) is not in the list above, so it can be read once up front
games_played = practice_year['G']
for column in per_game_columns:
    practice_year[column] = practice_year[column] / games_played
    

In [16]:
# Build one feature matrix and one target vector per training season (2010-11 .. 2015-16).
# Each season is padded to the full school_list and sorted by school, so row k refers to
# the same school in every season.
practice_data = []
practice_targets = []
for i in range(10, 16):
    practice_season = "20{}-{}".format(i, i + 1)

    practice_year = df.loc[df["Season"].str.contains(practice_season)]
    # Get total list of schools to normalize data (taken before any padding rows exist)
    practice_schools = practice_year["School"].to_numpy()
    present_schools = set(practice_schools)
    practice_year = practice_year.fillna(0)

    # For each school not in that year, we want to add a row of 0's for calculations.
    # The rows are collected first and concatenated once, instead of re-copying the
    # whole frame for every missing school.
    missing_rows = []
    for school in school_list:
        if school not in present_schools:
            row = {k: school if k == "School" else 0 for k in practice_year.columns}
            row["Season"] = practice_season
            missing_rows.append(row)

    if missing_rows:
        padding = pd.DataFrame(missing_rows, columns=practice_year.columns)
        practice_year = pd.concat([practice_year, padding], axis=0)

    practice_year = practice_year.sort_values(by=["School"], axis=0)

    # Set up target vectors
    practice_target = np.array(practice_year["Tournament-Result"].to_numpy())

    # Set up input arrays: everything except identifiers and the target itself
    practice_df = practice_year.drop(["School", "Season", "Conference", "Tournament-Result"], axis=1)
    practice_array = practice_df.to_numpy()

    practice_data.append(practice_array)
    practice_targets.append(practice_target)

# Stack the seasons: first axis is the season, second the school, third the feature
practice_data = np.array(practice_data)
practice_targets = np.array(practice_targets)

In [17]:
# GaussianNB is trained one season at a time through partial_fit.
# NOTE(review): the original remark read "you could pass SVC with partial data";
# presumably "could not" was meant -- confirm.
clf = GaussianNB()
# Every round code the model may see; passed on each call so the class set is fixed
round_codes = [0, 1, 2, 3, 4, 5, 7, 8, 9]
for season_features, season_targets in zip(practice_data, practice_targets):
    clf.partial_fit(season_features, season_targets, classes = round_codes)

In [18]:
# Class labels known to the fitted model.
clf.classes_

array([0, 1, 2, 3, 4, 5, 7, 8, 9])

In [19]:
# Season used for evaluation; the training loop covers 2010-11 through 2015-16 only.
test_season = "2016-17"

In [20]:
# Set up test array and target vector for test_season.
# NOTE(review): unlike the training seasons, this frame is neither padded to school_list
# nor sorted by school -- confirm row order before labelling rows by position.
non_feature_columns = ["School", "Season", "Conference", "Tournament-Result"]

test_year = df.loc[df["Season"].str.contains(test_season)].fillna(0)

test_df = test_year.drop(columns=non_feature_columns)
test_array = np.array([test_df.iloc[position].to_numpy() for position in range(len(test_df))])

# Set up target vectors
test_target = np.array(test_year["Tournament-Result"].to_numpy())

In [21]:
# Predicting model on following year: one row of class probabilities per test row.
predictions = np.array(clf.predict_proba(test_array))

In [22]:
# Maps a probability-column position (as a string) to a Tournament-Result round code.
# Positions 6-8 map to codes 7-9 because code 6 is not among the classes given to the model.
predictions_to_round = {"0": 0, "1": 1, "2": 2, "3":3, "4": 4, "5" : 5, "6": 7, "7": 8, "8": 9}

In [23]:
# Show each team's rounded class probabilities next to its name.
# Names are read from the test frame itself: indices_school maps positions in the padded,
# sorted school_list, which only matches the test rows if the test season happens to
# contain every school in that same order.
test_schools = test_year["School"].to_numpy()
for team_probs, school in zip(predictions, test_schools):
    print(np.round(team_probs), school)
    

[1. 0. 0. 0. 0. 0. 0. 0. 0.] abilene-christian
[1. 0. 0. 0. 0. 0. 0. 0. 0.] air-force
[0. 1. 0. 0. 0. 0. 0. 0. 0.] akron
[0. 1. 0. 0. 0. 0. 0. 0. 0.] alabama
[1. 0. 0. 0. 0. 0. 0. 0. 0.] alabama-state
[1. 0. 0. 0. 0. 0. 0. 0. 0.] alcorn-state
[1. 0. 0. 0. 0. 0. 0. 0. 0.] american
[1. 0. 0. 0. 0. 0. 0. 0. 0.] appalachian-state
[0. 0. 0. 0. 0. 1. 0. 0. 0.] arizona
[1. 0. 0. 0. 0. 0. 0. 0. 0.] arizona-state
[0. 0. 0. 0. 1. 0. 0. 0. 0.] arkansas
[0. 0. 1. 0. 0. 0. 0. 0. 0.] arkansas-pine-bluff
[1. 0. 0. 0. 0. 0. 0. 0. 0.] arkansas-state
[1. 0. 0. 0. 0. 0. 0. 0. 0.] army
[0. 1. 0. 0. 0. 0. 0. 0. 0.] auburn
[1. 0. 0. 0. 0. 0. 0. 0. 0.] austin-peay
[0. 1. 0. 0. 0. 0. 0. 0. 0.] ball-state
[0. 0. 0. 0. 1. 0. 0. 0. 0.] baylor
[1. 0. 0. 0. 0. 0. 0. 0. 0.] belmont
[1. 0. 0. 0. 0. 0. 0. 0. 0.] bethune-cookman
[1. 0. 0. 0. 0. 0. 0. 0. 0.] binghamton
[0. 0. 1. 0. 0. 0. 0. 0. 0.] boise-state
[1. 0. 0. 0. 0. 0. 0. 0. 0.] boston-college
[1. 0. 0. 0. 0. 0. 0. 0. 0.] boston-university
[1. 0. 0. 0. 0. 0. 0

In [24]:
# Hard class prediction (one round code) for every row of the test array.
predict = np.array(clf.predict(test_array))

In [25]:
results = []
# School names taken from the test frame so each label belongs to the row being scored
# (position-based indices_school lookups assume the padded, sorted school_list order).
test_schools = test_year["School"].to_numpy()
# Getting the highest probability as a prediction for each school
for team_probs, school in zip(predictions, test_schools):
    # argmax gives the first position holding the maximum probability
    best_column = int(np.argmax(team_probs))
    ncaa_round = predictions_to_round[str(best_column)]
    results.append(ncaa_round)
    print(school, tournament_result_after_2010[str(ncaa_round)])
    

    


abilene-christian Didn't qualify
air-force Didn't qualify
akron First Four
alabama First Four
alabama-state Didn't qualify
alcorn-state Didn't qualify
american Didn't qualify
appalachian-state Didn't qualify
arizona Elite 8
arizona-state Didn't qualify
arkansas Sweet 16
arkansas-pine-bluff Round of 64
arkansas-state Didn't qualify
army Didn't qualify
auburn First Four
austin-peay Didn't qualify
ball-state First Four
baylor Sweet 16
belmont Didn't qualify
bethune-cookman Didn't qualify
binghamton Didn't qualify
boise-state Round of 64
boston-college Didn't qualify
boston-university Didn't qualify
bowling-green-state Didn't qualify
bradley Didn't qualify
brigham-young First Four
brown Didn't qualify
bryant Didn't qualify
bucknell First Four
buffalo First Four
butler Round of 32
cal-poly Didn't qualify
cal-state-bakersfield First Four
cal-state-fullerton Didn't qualify
cal-state-northridge Didn't qualify
california First Four
campbell Didn't qualify
canisius First Four
central-arkansas Di

In [26]:
# accuracy_score takes (y_true, y_pred). Accuracy itself is symmetric, so the value is
# unchanged, but the documented order keeps this correct if the metric is ever swapped
# for an asymmetric one such as precision.
accuracy = accuracy_score(test_target, results)
print(accuracy)


0.6775244299674267


In [178]:
# Predicting results of tournament game by game instead of predicting a team to a round

In [180]:
# Setting up dictionary to connect teams to their data based on names:
# connecting_data[two-digit ending year as str][school name] -> that school's row of stats.
connecting_data = defaultdict()
for i in range(10, 17):
    practice_season = "20{}-{}".format(i, i + 1)
    year = str(i + 1)
    # Filter to one season, zero-fill gaps and renumber rows from 0.
    # reset_index() is called without drop=True, so the previous index is kept as a column.
    practice_year = (
        df.loc[df["Season"].str.contains(practice_season)]
        .fillna(0)
        .reset_index()
    )
    year_data = defaultdict()
    for ind in range(len(practice_year)):
        stats = practice_year.iloc[ind, :]
        team = stats["School"]
        year_data[team] = stats
    connecting_data[year] = year_data

In [43]:
# Probability-column position -> round code (same mapping as defined in an earlier cell).
predictions_to_round = {"0": 0, "1": 1, "2": 2, "3":3, "4": 4, "5" : 5, "6": 7, "7": 8, "8": 9}
# Inverse mapping: round code -> position in a 9-wide one-hot / probability vector.
round_to_predict = {"0":0, "1":1, "2":2, "3":3, "4":4, "5":5, "7":6, "8":7, "9": 8}

In [44]:
# Preprocessing to try and make it so it is game by game instead
# Note data already includes play in games (so no first four)
# The data directory defaults to the original author's local checkout; set
# MARCH_MADNESS_DATA_DIR to read the CSV from another location.
DATA_DIR = os.environ.get(
    "MARCH_MADNESS_DATA_DIR",
    "/Users/eddieloyd/DIS/BigData/final/Big-Data-March-Madness/data",
)
rounds = pd.read_csv(os.path.join(DATA_DIR, "Big_Dance_CSV.csv"))
# Obtain only years with practice data (2010 through 2017)
rounds = rounds.loc[(rounds["Year"] > 2009) & (rounds["Year"] < 2018)]
# The second team's columns are read in with a ".1" suffix; give them explicit names
rounds = rounds.rename(columns={'Team.1': 'Team2', 'Score.1': 'Score2', 'Seed.1': 'Seed2'})
rounds

Unnamed: 0,Year,Round,Region Number,Region Name,Seed,Score,Team,Team2,Score2,Seed2
1575,2010,1,1,Midwest,1,90,Kansas,Lehigh,74,16
1576,2010,1,1,Midwest,2,68,Ohio St,Santa Barbara,51,15
1577,2010,1,1,Midwest,3,83,Georgetown,Ohio,97,14
1578,2010,1,1,Midwest,4,89,Maryland,Houston,77,13
1579,2010,1,1,Midwest,5,70,Michigan St,New Mexico St,67,12
...,...,...,...,...,...,...,...,...,...,...
2074,2017,4,3,Midwest,1,60,Kansas,Oregon,74,3
2075,2017,4,4,South,1,75,North Carolina,Kentucky,73,2
2076,2017,5,1,Final Four,7,73,South Carolina,Gonzaga,77,1
2077,2017,5,2,Final Four,3,76,Oregon,North Carolina,77,1


In [45]:
# Proceeding without changing the names until the data is fixed - so won't be able to connect stats yet
# (team names here, e.g. "Ohio St", are written differently from the hyphenated school names).
# Only the 2012 tournament games are kept for the bracket experiments.
rounds_2012 = rounds.loc[rounds["Year"] == 2012]


In [113]:
def get_bracket_setup(which_round):
    """Turn one round's game rows into bracket entries.

    Parameters
    ----------
    which_round : pandas.DataFrame
        Game rows with "Team", "Team2", "Seed" and "Region Name" columns and,
        optionally, a "Seed2" column holding the second team's seed.

    Returns
    -------
    list
        One ``[(team1, team2), (seed1, seed2), region_name]`` entry per row, in row order.

    Notes
    -----
    The second seed is read from "Seed2" when that column exists. Deriving it as
    ``17 - seed1`` is only valid for Round-of-64 pairings (1 v 16, 2 v 15, ...), so
    that formula is kept solely as a fallback for frames without "Seed2".
    """
    teams1 = which_round["Team"].tolist()
    teams2 = which_round["Team2"].tolist()
    regions = which_round["Region Name"].tolist()
    seeds1 = which_round["Seed"].tolist()

    if "Seed2" in which_round.columns:
        seeds2 = which_round["Seed2"].tolist()
    else:
        seeds2 = [17 - seed for seed in seeds1]

    return [
        [(team1, team2), (seed1, seed2), region_name]
        for team1, team2, seed1, seed2, region_name in zip(teams1, teams2, seeds1, seeds2, regions)
    ]

In [114]:
# One list of bracket entries per round (Round values 1 through 5) of the 2012 games
bracket = [
    get_bracket_setup(rounds_2012.loc[rounds_2012["Round"] == round_number])
    for round_number in range(1, 6)
]
print(bracket)

[[[('Kentucky', 'Western Kentucky'), (1, 16), 'South'], [('Duke', 'Lehigh'), (2, 15), 'South'], [('Baylor', 'South Dakota St'), (3, 14), 'South'], [('Indiana', 'New Mexico St'), (4, 13), 'South'], [('Wichita St', 'VCU'), (5, 12), 'South'], [('UNLV', 'Colorado'), (6, 11), 'South'], [('Notre Dame', 'Xavier'), (7, 10), 'South'], [('Iowa St', 'Connecticut'), (8, 9), 'South'], [('Michigan St', 'Long Island Brooklyn'), (1, 16), 'West'], [('Missouri', 'Norfolk St'), (2, 15), 'West'], [('Marquette', 'BYU'), (3, 14), 'West'], [('Louisville', 'Davidson'), (4, 13), 'West'], [('New Mexico', 'Long Beach St'), (5, 12), 'West'], [('Murray St', 'Colorado St'), (6, 11), 'West'], [('Florida', 'Virginia'), (7, 10), 'West'], [('Memphis', 'St Louis'), (8, 9), 'West'], [('Syracuse', 'UNC Asheville'), (1, 16), 'East'], [('Ohio St', 'Loyola Maryland'), (2, 15), 'East'], [('Florida St', 'St Bonaventure'), (3, 14), 'East'], [('Wisconsin', 'Montana'), (4, 13), 'East'], [('Vanderbilt', 'Harvard'), (5, 12), 'East'

In [115]:
def simulate_round(single_round, pick_outcome=None):
    """Play every game of a round and return who advances.

    Parameters
    ----------
    single_round : list
        Games shaped ``[(team1, team2), (seed1, seed2), region]``.
    pick_outcome : callable, optional
        Called with one game and expected to return ``(outcome1, outcome2)``; a truthy
        value means that team advances. When omitted, the first-listed team always
        advances, which is the placeholder behaviour until model predictions are wired in.

    Returns
    -------
    list
        One list per game containing ``(team, (region, seed))`` for each advancing team.
    """
    next_round = []
    for game in single_round:
        team1, team2 = game[0]
        seed1, seed2 = game[1]
        region = game[2]

        # Right now this is a fixed outcome by default, but we want this to be predictions
        if pick_outcome is None:
            outcome1, outcome2 = 1, 0
        else:
            outcome1, outcome2 = pick_outcome(game)

        result = []
        if outcome1:
            result.append((team1, (region, seed1)))
        if outcome2:
            result.append((team2, (region, seed2)))

        next_round.append(result)

    return next_round
            

In [116]:
def reduce_first_round(simulated_round):
    """Pair the Round-of-64 winners into Round-of-32 games.

    Parameters
    ----------
    simulated_round : list
        Output of ``simulate_round``: one list per game whose first item is
        ``(team, (region, seed))``.

    Returns
    -------
    list
        16 entries ordered slot by slot, and within a slot South, West, East, Midwest.
        A filled game is ``[(team_a, team_b), (seed_a, seed_b), region]`` with the region
        of the first team placed; a slot with one team is ``[team, seed, region]``; an
        untouched slot stays ``0``.

    Raises
    ------
    ValueError
        If a winner in one of the four regions has a seed outside 1-16. Previously such
        a seed silently reused the slot of the preceding game.

    Notes
    -----
    Winners whose region is not one of the four named regions are ignored.
    """
    # Seeds that meet in the same Round-of-32 game share a slot
    seed_groups = ((1, 8, 9, 16), (2, 7, 10, 15), (3, 6, 11, 14), (4, 5, 12, 13))
    slot_of_seed = {seed: slot for slot, group in enumerate(seed_groups) for seed in group}

    region_order = ("South", "West", "East", "Midwest")
    slots = {region: [0] * len(seed_groups) for region in region_order}

    for game in simulated_round:
        team, (region, seed) = game[0]
        if region not in slots:
            continue
        if seed not in slot_of_seed:
            raise ValueError("unexpected seed {!r} for team {!r}".format(seed, team))

        ind = slot_of_seed[seed]
        region_slots = slots[region]
        current = region_slots[ind]
        if isinstance(current, list):
            # Second team for this slot: turn the single entries into pairs
            current[0] = (current[0], team)
            current[1] = (current[1], seed)
        else:
            region_slots[ind] = [team, seed, region]

    final = []
    for i in range(len(seed_groups)):
        for region in region_order:
            final.append(slots[region][i])
    return final

In [117]:
def reduce_second_round(simulated_round):
    """Pair the Round-of-32 winners into Sweet-16 games.

    Parameters
    ----------
    simulated_round : list
        Output of ``simulate_round``: one list per game whose first item is
        ``(team, (region, seed))``.

    Returns
    -------
    list
        8 entries ordered slot by slot, and within a slot South, West, East, Midwest.
        A filled game is ``[(team_a, team_b), (seed_a, seed_b), region]`` with the region
        of the first team placed; a slot with one team is ``[team, seed, region]``; an
        untouched slot stays ``0``.

    Raises
    ------
    ValueError
        If a winner in one of the four regions has a seed outside 1-16. Previously such
        a seed silently reused the slot of the preceding game.

    Notes
    -----
    Winners whose region is not one of the four named regions are ignored.
    """
    # Each half of a region's bracket feeds one Sweet-16 game
    seed_groups = ((1, 8, 9, 16, 4, 5, 12, 13), (2, 7, 10, 15, 3, 6, 11, 14))
    slot_of_seed = {seed: slot for slot, group in enumerate(seed_groups) for seed in group}

    region_order = ("South", "West", "East", "Midwest")
    slots = {region: [0] * len(seed_groups) for region in region_order}

    for game in simulated_round:
        team, (region, seed) = game[0]
        if region not in slots:
            continue
        if seed not in slot_of_seed:
            raise ValueError("unexpected seed {!r} for team {!r}".format(seed, team))

        ind = slot_of_seed[seed]
        region_slots = slots[region]
        current = region_slots[ind]
        if isinstance(current, list):
            # Second team for this slot: turn the single entries into pairs
            current[0] = (current[0], team)
            current[1] = (current[1], seed)
        else:
            region_slots[ind] = [team, seed, region]

    final = []
    for i in range(len(seed_groups)):
        for region in region_order:
            final.append(slots[region][i])
    return final

In [118]:
def reduce_third_round(simulated_round):
    """Pair the Sweet-16 winners into one Elite-8 game per region.

    Takes the output of ``simulate_round`` (one list per game whose first item is
    ``(team, (region, seed))``) and returns four entries in the order South, West,
    East, Midwest. A filled game is ``[(team_a, team_b), (seed_a, seed_b), region]``;
    a region with one winner is ``[team, seed, region]``; a region with none stays ``0``.
    Winners from any other region are ignored.
    """
    region_order = ("South", "West", "East", "Midwest")
    games_by_region = {name: 0 for name in region_order}

    for game in simulated_round:
        team, (region, seed) = game[0]
        if region not in games_by_region:
            continue

        existing = games_by_region[region]
        if not isinstance(existing, list):
            # First winner seen for this region
            games_by_region[region] = [team, seed, region]
            continue

        # Second winner: convert the single team/seed into pairs
        existing[0] = (existing[0], team)
        existing[1] = (existing[1], seed)

    return [games_by_region[name] for name in region_order]

In [146]:
def reduce_final_four(simulated_round):
    """Pair the four regional champions into the two Final Four games.

    Takes the output of ``simulate_round`` (one list per game whose first item is
    ``(team, (region, seed))``) and returns two entries: the South/East game first,
    the West/Midwest game second. A filled game is
    ``[(team_a, team_b), (seed_a, seed_b), region]`` carrying the region of the first
    team placed; a game with one team is ``[team, seed, region]``; an empty game is ``0``.
    Winners from any other region are ignored.
    """
    # South plays East, West plays Midwest
    semifinal_of_region = {"South": 0, "East": 0, "West": 1, "Midwest": 1}
    final_four = [0, 0]

    for game in simulated_round:
        team, (region, seed) = game[0]
        if region not in semifinal_of_region:
            continue

        position = semifinal_of_region[region]
        existing = final_four[position]
        if not isinstance(existing, list):
            final_four[position] = [team, seed, region]
            continue

        existing[0] = (existing[0], team)
        existing[1] = (existing[1], seed)

    return final_four

In [148]:
def reduce_championship(simulated_round):
    """Combine the Final Four winners into the single championship game.

    Takes the output of ``simulate_round`` and returns a one-element list. With two
    winners the element is ``[(team_a, team_b), (seed_a, seed_b), region]`` using the
    first winner's region; with one winner it is ``[team, seed, region]``; with none
    it is ``0``.
    """
    title_game = 0
    for game in simulated_round:
        team, (region, seed) = game[0]
        if not isinstance(title_game, list):
            # First finalist
            title_game = [team, seed, region]
            continue
        # Later finalist: pair it with what is already there
        title_game[0] = (title_game[0], team)
        title_game[1] = (title_game[1], seed)

    return [title_game]

In [175]:
def simulate_tournament(first_round):
    """Run a whole bracket from the Round of 64 to the championship, printing each round.

    ``first_round`` is a list of games shaped ``[(team1, team2), (seed1, seed2), region]``.
    Each stage is played with ``simulate_round`` and its winners are re-paired with the
    matching ``reduce_*`` helper. Nothing is returned; the winners of every stage are
    printed, each followed by a divider line.
    """
    divider = "---------------------------------"

    def report(heading, winners):
        print(heading)
        print(winners)
        print(divider)

    # May want to update this so that it returns what teams are out each round as well
    round_64_games = simulate_round(first_round)
    round_64_complete = reduce_first_round(round_64_games)
    report("Round of 64 winners: ", round_64_games)

    round_32_games = simulate_round(round_64_complete)
    round_32_complete = reduce_second_round(round_32_games)
    report("Round of 32 winners: ", round_32_games)

    round_16_games = simulate_round(round_32_complete)
    round_16_complete = reduce_third_round(round_16_games)
    report("Sweet 16 winners: ", round_16_games)

    elite_8_games = simulate_round(round_16_complete)
    elite_8_complete = reduce_final_four(elite_8_games)
    report("Elite 8 winners: ", elite_8_games)

    final_four_complete = simulate_round(elite_8_complete)
    report("Final 4 winners: ", final_four_complete)

    championship_round = reduce_championship(final_four_complete)
    championship_complete = simulate_round(championship_round)
    report("Championship results: ", championship_complete)


In [176]:
# Simulate the whole tournament starting from the first round stored in `bracket`.
simulate_tournament(bracket[0])

Round of 64 winners: 
[[('Kentucky', ('South', 1))], [('Duke', ('South', 2))], [('Baylor', ('South', 3))], [('Indiana', ('South', 4))], [('Wichita St', ('South', 5))], [('UNLV', ('South', 6))], [('Notre Dame', ('South', 7))], [('Iowa St', ('South', 8))], [('Michigan St', ('West', 1))], [('Missouri', ('West', 2))], [('Marquette', ('West', 3))], [('Louisville', ('West', 4))], [('New Mexico', ('West', 5))], [('Murray St', ('West', 6))], [('Florida', ('West', 7))], [('Memphis', ('West', 8))], [('Syracuse', ('East', 1))], [('Ohio St', ('East', 2))], [('Florida St', ('East', 3))], [('Wisconsin', ('East', 4))], [('Vanderbilt', ('East', 5))], [('Cincinnati', ('East', 6))], [('Gonzaga', ('East', 7))], [('Kansas St', ('East', 8))], [('North Carolina', ('Midwest', 1))], [('Kansas', ('Midwest', 2))], [('Georgetown', ('Midwest', 3))], [('Michigan', ('Midwest', 4))], [('Temple', ('Midwest', 5))], [('San Diego St', ('Midwest', 6))], [('St Marys', ('Midwest', 7))], [('Creighton', ('Midwest', 8))]]
---

In [177]:
# Code below is for neural network model that classifies to specific round rather than predict team by team

In [179]:
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.optimizers import RMSprop
from tensorflow import keras

In [230]:
# Testing out two different models
# First
# Neural Network that takes in single teams data to predict its success in tournament

In [312]:
# Flatten the per-season training arrays so every (season, team) pair is one row
new_practice = np.array([team_row for season in practice_data for team_row in season])

# One-hot encode every round code into a 9-wide vector; round_to_predict gives the slot
new_prac_target = []
for num in (code for season in practice_targets for code in season):
    new_vec = [0] * 9
    new_vec[round_to_predict[str(num)]] = 1
    new_prac_target.append(new_vec)
new_prac_target = np.array(new_prac_target)

print(new_practice.shape)
print(new_prac_target.shape)

(1842, 48)
(1842, 9)


In [323]:
# One-hot encode the test-season round codes the same way as the training targets.
# The codes are visited through np.ravel, so a flat array of codes works (iterating
# over each scalar element, as before, raises TypeError) and a nested per-season
# layout is still accepted.
new_target = []
for tar in np.ravel(test_target):
    new_vec = [0] * 9
    ind = round_to_predict[str(tar)]
    new_vec[ind] = 1
    new_target.append(new_vec)

new_target = np.array(new_target)

[[1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


In [350]:
# Fully connected classifier: 48 input features -> 512 -> 256 -> 128 -> 9 softmax outputs
# (one output per slot of the one-hot round vector).
model = Sequential([
    Dense(512, input_shape=(48, ), activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(9, activation='softmax'),
])

model.compile(
    optimizer="adam",
    loss='categorical_crossentropy',
    metrics=['categorical_crossentropy', 'accuracy'],
)

In [351]:
# Train on the flattened training rows; no validation data is passed to fit here.
history = model.fit(new_practice, new_prac_target,
          batch_size=15,
          epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [358]:
# Network output for every row of the test array (one 9-wide softmax vector per row).
pred = model.predict(test_array)



In [359]:
results = []
# School names taken from the test frame so each label belongs to the row being scored
# (position-based indices_school lookups assume the padded, sorted school_list order).
test_schools = test_year["School"].to_numpy()
# Getting the highest probability as a prediction for each school
for team_probs, school in zip(pred, test_schools):
    print(team_probs)
    # argmax gives the first position holding the maximum probability
    best_column = int(np.argmax(team_probs))
    ncaa_round = predictions_to_round[str(best_column)]
    results.append(ncaa_round)
    print(school, tournament_result_after_2010[str(ncaa_round)])
    

    

[9.9859089e-01 1.1509868e-03 2.4699594e-04 7.9701922e-06 2.2472000e-06
 8.5851059e-07 4.1418943e-08 1.3530710e-12 7.9809199e-09]
abilene-christian Didn't qualify
[9.83751595e-01 1.07815517e-02 5.03075635e-03 3.08650429e-04
 1.02645674e-04 1.31981978e-05 1.10126775e-05 4.54389486e-08
 5.52059305e-07]
air-force Didn't qualify
[6.4330280e-01 9.7343683e-02 1.8772660e-01 5.5082317e-02 1.0351835e-02
 1.4723731e-03 3.4189948e-03 7.7095680e-04 5.3045916e-04]
akron Didn't qualify
[8.0641150e-01 7.1057566e-02 9.9280924e-02 1.6281338e-02 4.4559333e-03
 5.1623571e-04 1.6517715e-03 1.1864416e-04 2.2613800e-04]
alabama Didn't qualify
[9.9950600e-01 4.3862613e-04 5.4495496e-05 8.1767564e-07 9.7661719e-08
 4.1859880e-09 3.8945731e-09 1.8768290e-14 3.4133675e-11]
alabama-state Didn't qualify
[9.9325949e-01 3.4833117e-03 3.1313056e-03 6.8883717e-05 4.7567883e-05
 3.3971457e-06 5.7219977e-06 2.9047089e-09 2.4661142e-07]
alcorn-state Didn't qualify
[9.9992168e-01 5.0830768e-05 2.6840475e-05 3.2287389e-07 

In [277]:
# Second
# LSTM that takes in a team's stats and predicts its success over the next couple of years (and its tournament results)