In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression

Reorganizing player data.
The objective of this code is to organize the data into the following form for machine learning:  

Inputs :   
    The differences between (averaged) stats of competing players for a specific pair of teams.  
    For example, if the game is Team1 vs Team2, and Team1 will play 5 players who (on average) score [20,15,13,12,11], and Team2 will play 7 players who (on average) score [12,10,11,13,14,9,5], then the SCORE stat will be $\frac{1}{5}\left(20+15+13+12+11\right) - \frac{1}{7}\left(12+10+11+13+14+9+5\right) \approx 3.62857$.  

Output :  
    Target value will be the score margin (so if Team1 scored 67 points and Team2 scored 73, the margin would be -6).

In [2]:
# First, we load in data to select players who will play.
# Every file whose name ends in 'avgs' under `path` is read; its first column
#     holds the player names and the remaining columns are stacked into a
#     matrix. Directories containing '2017' feed the test set, directories
#     containing '2018' are skipped, and everything else feeds the training set.
path = './DATA/'

# Per-file stat matrices; stacked once at the end instead of re-stacking the
#     whole matrix for every file
train_parts = []
test_parts = []
# We'll also save players names as we go
players_train = []
players_test = []

# Walk through player files
for dir_path , dir_name , file_names in os.walk(path) :
    # 2017 will be our testing set
    if '2017' in dir_path :
        stat_parts , player_names = test_parts , players_test
    # Everything else (except 2018) will become our training set
    elif '2018' not in dir_path :
        stat_parts , player_names = train_parts , players_train
    else :
        continue
    for name in file_names :
        # Grab avgs file
        if name[-4:] == 'avgs' :
            data = pd.read_csv(os.path.join(dir_path,name))
            # This puts the players name in order of their row in the matrix
            player_names.extend(data.iloc[:,0].tolist())
            # .values replaces DataFrame.as_matrix(), which newer pandas no longer has
            stat_parts.append(data.drop(['Unnamed: 0'],axis=1).values)

# Row i of train / test belongs to players_train[i] / players_test[i]
train = np.vstack(train_parts)
test = np.vstack(test_parts)

# From the way the data is saved, the last column is whether or not the player
#     is considered a major contributor during the season.
# Only columns 1 and 2 of the stat matrix are used as features.
train_x = train[:,1:3]
train_y = train[:,-1]
test_x = test[:,1:3]
test_y = test[:,-1]

In [3]:
# Fit a default logistic-regression classifier on the training features and
#     predict the contributor flag for every test-set player.
lr = LogisticRegression().fit(train_x,train_y)
res = lr.predict(test_x)
# score() returns the mean accuracy as a fraction in [0, 1]
test_accuracy = lr.score(test_x,test_y)
print('% Correctly Identified Players: '+str(test_accuracy))

% Correctly Identified Players: 0.8671875


In [4]:
# Names of the test-set players the classifier predicts as contributors (label 1)
test_names = np.array(players_test)
playing = list(test_names[res == 1])
# Everyone else is predicted to sit on the bench
num_benched = len(players_test) - len(playing)
print('# of players predicted to sit on the bench: '+str(num_benched))

# of players predicted to sit on the bench: 297


In [5]:
# Confusion counts for the test-set predictions.
# With 0/1 labels, truth + prediction is 2 for a true positive and 0 for a
#     true negative; truth - prediction is 1 for a false negative and -1 for
#     a false positive.
num_preds = len(res)
true_acc = np.array(test_y + res)
false_acc = np.array(test_y - res)
TP = np.sum(true_acc == 2)
TN = np.sum(true_acc == 0)
FN = np.sum(false_acc == 1)
FP = np.sum(false_acc == -1)
# NOTE: both "rates" below are divided by the total number of predictions,
#     not by the number of players who actually played.
report_lines = [
    ('Played, predicted they sat: {}', FN),
    ('Sat, predicted they played: {}', FP),
    ('Total numb. of predictions: {}', num_preds),
    ('False Negative Rate       : {}', FN/num_preds),
    ('True Positive Rate        : {}', TP/num_preds),
]
for template , value in report_lines :
    print(template.format(value))

Played, predicted they sat: 36
Sat, predicted they played: 66
Total numb. of predictions: 768
False Negative Rate       : 0.046875
True Positive Rate        : 0.52734375


In [6]:
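# FIXME: this filter is disabled because it does not work yet.
#     Note that removing entries from players_train / players_test changes each
#     player's position in the list, and the team-averaging cell uses that
#     position as a row index into the stat matrix.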
# Filter out non-contributors
#new_players_train = [players_train[i] for i in range(len(train_y)) if train_y[i] == 1]
#new_players_test = [players_test[i] for i in range(len(res)) if res[i] == 1]
#players_train = new_players_train
#players_test = new_players_test

In [27]:
# This is to condense each team into a single row of statistics.
# For every 'avgs' file under `path` (2018 excluded) the stats of the players
#     found in the matching roster are averaged; if fewer than 5 players are
#     found, the whole file is averaged instead. Once TEAMS_PER_YEAR rows have
#     been collected they are saved to 'Data_2/All_avgs_<year>.csv' and kept in
#     yearly_avgs.
cols = ['MP','ORtg','%Ps','Pts','OR','DR','A','TO','Blk','Stl','PF','2Pt %','3Pt %','FT %','2Pt %Att','3Pt %Att','FT %Att','Pnts-Prev','Marg']
# Number of team rows that triggers a save (presumably one season's field -- TODO confirm)
TEAMS_PER_YEAR = 68
# Collect yearly avgs dataframes for further use
yearly_avgs = []
final_data = []
final_teams = []
# Sets make the per-player membership test cheap
train_roster = set(players_train)
test_roster = set(players_test)
# Walk through player files
for dir_path , dir_name , file_names in os.walk(path) :
    if dir_path[-4:] in ['2013','2014','2015','2016','2017'] :
        year = dir_path[-4:]
    # 2017 directories were loaded as the test set, so their players are listed
    #     in players_test; every other season's players are in players_train
    roster = test_roster if '2017' in dir_path else train_roster
    for name in file_names :
        # Grab avgs file
        if name[-4:] == 'avgs' and '2018' not in dir_path :
            team_name = dir_path[12:]
            final_teams.append(team_name+' '+year)
            data = pd.read_csv(os.path.join(dir_path,name))
            # Take the stats straight from this team's own file (name column and
            #     trailing contributor flag removed). These are the same rows the
            #     train/test matrices were built from, but reading them here
            #     avoids looking a player up by name, which returned the wrong
            #     season's row for repeated names and indexed the wrong matrix
            #     for test-set players.
            stats = data.drop(['Unnamed: 0'],axis=1).values[:,:-1].astype(float)
            # Mark the players of this team that are in the roster
            is_playing = np.array([pl_name in roster for pl_name in data.iloc[:,0]],dtype=bool)
            # If there weren't 5 players picked, grab everyone (each player once)
            if is_playing.sum() < 5 :
                is_playing[:] = True
            # Average the selected players' stats into one row for the team
            total = stats[is_playing].mean(axis=0)
            # Add to final_data
            final_data.append(total)
    if len(final_data) == TEAMS_PER_YEAR :
        # Save it to a csv file
        final = pd.DataFrame(data=np.vstack(final_data),columns=cols,index=final_teams)
        yearly_avgs.append(final)
        final.to_csv('Data_2/All_avgs_'+year+'.csv')
        final_data = []
        final_teams = []

In [55]:
# Row labels of the scouting report (24 entries); not referenced again in this cell.
sc_rows = ['Adj. Efficiency','Adj. Tempo','Avg. Poss. Length','Effective FG%:','Turnover %:','Off. Reb. %:','FTA/FGA:','3P%:','2P%:','FT%:','Block%:','Steal%:','3PA/FGA:','A/FGM:','3-Pointers:','2-Pointers:','Free Throws:','Components:','Overall:','Non-conference:','Bench Minutes:','Experience:','Minutes Continuity:','Average Height:']
# This creates the difference files.
# For every 'NCAA-T' game in a team's schedule, one row is built holding the
#     difference between the two teams' averaged stats plus the result of the game.
# Result: 0 means the second team (the opponent) won, 1 the first
# Three files are written per season: player-stat differences, scouting-report
#     differences, and both combined.
# For each season collected in yearly_avgs
for final in yearly_avgs :
    # Index labels look like '<team> <year>'; rebuild the 4-digit year from the last two characters
    year = '20' + final.index[0][-2:]
    new_data = []
    new_sc_data = []
    # Go through each team once
    for team in final.index :
        # Strip the trailing ' <year>' to get the directory name
        team_name = team[:-5]
        temp_path = path + year + '/' + team_name
        # Get average team player stats
        team_1 = final.loc[team]
        # Get average team stats (scouting report)
        sc_data = pd.read_csv(temp_path+'/Scouting_Report_csv')
        sc_data = sc_data.drop('Unnamed: 0',axis=1)
        # Row 14 is removed before the reshape(24) below
        #     (presumably a non-numeric row -- TODO confirm)
        sc_data = sc_data.drop(14,axis=0)
        sc1_off_data = np.array(sc_data['Offense']).reshape(24)
        sc1_def_data = np.array(sc_data['Defense']).reshape(24)
        # 24 offense values followed by 24 defense values
        sc1_data = np.hstack((sc1_off_data.astype(float),sc1_def_data.astype(float)))
        # Find that team's schedule for the year
        for dir_path, dir_name, filename in os.walk(temp_path) :
            for name in filename :
                # NOTE(review): 'Scouting_Report_csv' also ends in 'csv', so this relies on
                #     the schedule file being listed first -- confirm the file naming
                if name[-3:] == 'csv' :
                    # Read in schedule and filter to just tournament games
                    data = pd.read_csv(temp_path+'/'+name)
                    data = data.drop('Unnamed: 0',axis=1)
                    data = data[data['Conference'] == 'NCAA-T']
                    # For each game, create a row
                    for game in data.index :
                        # First character of the Result field: 'L' is a loss, anything else counts as a win
                        res = data.loc[game]['Result'][0]
                        if res == 'L' :
                            result = np.array(0).reshape(1)
                        else :
                            result = np.array(1).reshape(1)
                        # This is important if they played more than one game in the tournament:
                        #     the first game appends the result (48 -> 49 entries), later games
                        #     overwrite that last slot
                        if sc1_data.shape[0] != 49 :
                            sc1_data = np.hstack((sc1_data,result))
                        else :
                            sc1_data[-1] = result
                        # Get opponent name and stats
                        opponent = data.loc[game]['Opponent'] + ' ' + team[-4:]
                        team_2 = final.loc[opponent]
                        # Save into data: 19 stat differences followed by the result
                        new_info = np.hstack((np.array(team_1-team_2).reshape(19),result))
                        if isinstance(new_data,list) :
                            new_data = new_info
                        else :
                            new_data = np.vstack((new_data,new_info))
                        # Get the opponent's scouting report from the sibling directory
                        #     (temp_path with this team's name replaced by the opponent's)
                        n = len(team_name)
                        sc_data = pd.read_csv(temp_path[:-n]+opponent[:-5]+'/Scouting_Report_csv')
                        sc_data = sc_data.drop('Unnamed: 0',axis=1)
                        sc_data = sc_data.drop(14,axis=0)
                        sc2_off_data = np.array(sc_data['Offense']).reshape(24)
                        sc2_def_data = np.array(sc_data['Defense']).reshape(24)
                        sc2_data = np.hstack((sc2_off_data.astype(float),sc2_def_data.astype(float)))
                        # Pad with 0 so the subtraction leaves the result untouched in the last slot
                        sc2_data = np.hstack((sc2_data,np.array(0).reshape(1)))
                        if isinstance(new_sc_data,list) :
                            new_sc_data = sc1_data - sc2_data
                        else :
                            new_sc_data = np.vstack((new_sc_data,sc1_data-sc2_data))
                    # We only need one schedule per team, so break
                    #     (this leaves the file loop only; os.walk still continues)
                    break
    # Save Data
    reorganized = pd.DataFrame(data=new_data,columns=(cols+['Result']))
    reorganized.to_csv('Data_2/New_Form_Data_'+year)
    reorg_teams = pd.DataFrame(data=new_sc_data)
    reorg_teams.to_csv('Data_2/New_Form_Team_'+year)
    # Scouting differences without their result column, then player differences + result
    # NOTE(review): the 2-D slice assumes more than one game row was collected
    combo = np.hstack((new_sc_data[:,:-1],new_data))
    reorg_combo = pd.DataFrame(data=combo)
    reorg_combo.to_csv('Data_2/New_Form_combo_'+year)

(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,)
(49,