In [24]:
import numpy as np
import pandas as pd
from datetime import datetime, date
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

## Read in Games, Teams, Matchups datasets

In [25]:
game = pd.read_csv('games.csv')
teams = pd.read_csv('teams.csv')
matchups = pd.read_csv('matchups_early.csv')
matchups.reset_index(drop=True).head(2)

Unnamed: 0,Visitor,Home,R-Coefficient,Home_Score,Visitor_Score,Prediction,Winner,Logic
0,ATL,ORL,,,,,ATL,
1,MIL,CHI,,,,,MIL,


## Merge game dataframes, drop rows with NAN values in OREB, PLUS_MINUS columns

In [26]:
game.drop(['TEAM_ID_home','TEAM_ID_away', 'GAME_STATUS_TEXT', 'GAME_DATE_EST'], axis=1, inplace=True)
game.drop_duplicates()
game.head(2)

Unnamed: 0,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,41300405,1610612759,1610612748,2013,104.0,0.474,0.783,0.462,25.0,40.0,87.0,0.4,0.741,0.28,14.0,41.0,1
1,41300404,1610612748,1610612759,2013,86.0,0.451,0.65,0.409,13.0,27.0,107.0,0.571,0.72,0.429,25.0,44.0,0


In [27]:
spread_predict = []
record_predict = []
score_predict = []
winner_predict = []
hscore = []
vscore = []
rf_predictor = []
predictor = []

for x in range(len(matchups)):
     
    home = matchups.iloc[x]['Home']
    visitor = matchups.iloc[x]['Visitor']

    for i in range(len(teams)):
        if teams.iloc[i]['ABBREVIATION'] == home:
            home_id = teams.iloc[i]['TEAM_ID']

        if teams.iloc[i]['ABBREVIATION'] == visitor:
            visitor_id = teams.iloc[i]['TEAM_ID']

#Filter games to matched home, visitor teams
    game = game[game['HOME_TEAM_ID'] == home_id]
    game = game[game['VISITOR_TEAM_ID'] == visitor_id]
    
#Drop dates, team_ids, game_ids, season - will not be useful
    game.drop(['GAME_ID','HOME_TEAM_ID', 
            'VISITOR_TEAM_ID', 'SEASON'], axis=1)
    game.dropna(inplace=True)
    
#Split Dataset for training, test
    X = game.drop("HOME_TEAM_WINS", axis=1)
    y = game["HOME_TEAM_WINS"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

#Calculate Historical Win%
    wins = 0
    for j in range(len(y)):
        wins = wins + round(y.iloc[j])
    record = wins / len(y)    

#Random Forest Classification
    rf = RandomForestClassifier(n_estimators=200)
    rf = rf.fit(X_train, y_train)
    
# Fitting our model with all of our features in X
    score = rf.score(X, y)
    predictor.append(rf.predict(X).mean()- 0.5*np.var(rf.predict(X))/record)

# Calculate Spread
    home_score = game['PTS_home'].mean()
    visitor_score = game['PTS_away'].mean()
    spread = home_score - visitor_score

# Append Scoring Lists
    spread_predict.append(spread)
    record_predict.append(record)
    score_predict.append(score)
    hscore.append(home_score)
    vscore.append(visitor_score)

ValueError: With n_samples=0, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

## Populate matchups dataframe, write to CSV

In [None]:
matchups['Home_Score'] = hscore
matchups['Visitor_Score'] = vscore
matchups['R-Coefficient'] = score_predict
matchups['Historical Home Win Record'] = record_predict

win_predictor = []
win_logic = []

for i in range (len(matchups)):
    if predictor[i] > 0.5:
        win_predictor.append(matchups.iloc[i]['Home'])
    else:
        win_predictor.append(matchups.iloc[i]['Visitor'])
            
#Calculate Logic - 1 if correct, 0 if incorrect
    if win_predictor[i] == matchups.iloc[i]['Winner']:
        win_logic.append(1)
    else:
        win_logic.append(0)

matchups['Prediction'] = win_predictor
matchups['Logic'] = win_logic
matchups.to_csv('matchup_output_early.csv')

#matchups prediction accuracy
print((sum(win_logic) / len(win_logic)), sum(score_predict)/len(score_predict))