In [190]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime, date
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestClassifier


## Read in Games, Teams, Matchups datasets

In [191]:
games = pd.read_csv('games.csv')
teams = pd.read_csv('teams.csv')
matchups = pd.read_csv('matchups_early.csv')
matchups.reset_index(drop=True).head(2)

Unnamed: 0,Visitor,Home,R-Coefficient,Home_Score,Visitor_Score,Prediction,Winner,Logic
0,ATL,ORL,,,,,ATL,
1,MIL,CHI,,,,,MIL,


## Merge game dataframes, drop rows with NAN values in OREB, PLUS_MINUS columns

In [192]:
games.drop(['TEAM_ID_home','TEAM_ID_away', 'GAME_STATUS_TEXT', 'GAME_DATE_EST'], axis=1, inplace=True)
games.drop_duplicates()
games.head(2)

Unnamed: 0,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,41300405,1610612759,1610612748,2013,104.0,0.474,0.783,0.462,25.0,40.0,87.0,0.4,0.741,0.28,14.0,41.0,1
1,41300404,1610612748,1610612759,2013,86.0,0.451,0.65,0.409,13.0,27.0,107.0,0.571,0.72,0.429,25.0,44.0,0


In [193]:
spread_predict = []
record_predict = []
score_predict = []
winner_predict = []
hscore = []
vscore = []
rf_predictor = []

for x in range(len(matchups)):
      
    home = matchups.iloc[x]['Home']
    visitor = matchups.iloc[x]['Visitor']
    game = games

    for i in range(len(teams)):
        if teams.iloc[i]['ABBREVIATION'] == home:
            home_id = teams.iloc[i]['TEAM_ID']

        if teams.iloc[i]['ABBREVIATION'] == visitor:
            visitor_id = teams.iloc[i]['TEAM_ID']

#Narrow games to matched home, visitor teams, create a mirror dataset reversing home / away teams (added data)
    
    game = game[game['HOME_TEAM_ID'] == home_id]
    game = game[game['VISITOR_TEAM_ID'] == visitor_id]
    
    
#Drop dates, team_ids, game_ids, season - will not be useful
    game.drop(['GAME_ID','HOME_TEAM_ID', 
            'VISITOR_TEAM_ID', 'SEASON'], axis=1)
    game.dropna(inplace=True)


#Split Dataset for training, test
    X = game.drop("HOME_TEAM_WINS", axis=1)
    y = game["HOME_TEAM_WINS"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

#Linear Regression 
    model = ElasticNet()

#Random Forest
    rf = RandomForestClassifier(n_estimators=200)
    rf = rf.fit(X_train, y_train)
    r = rf.predict(X).mean()
    rf_predictor.append(r)
    
# Fitting our model with all of our features in X
    model.fit(X, y)
    
    score = model.score(X, y)
    predict = pd.DataFrame({"Prediction": model.predict(X_test), "Actual": y_test}).reset_index(drop=True)
    contribution = pd.DataFrame({"Parameter": list(X.columns), "Weight": model.coef_ }).reset_index(drop=True)
    
# Calculate Predicted Win%
    wins = 0
    for j in range(len(predict)):
        wins = wins + round(predict.iloc[j]['Prediction'])
    record = wins / len(predict)

# Calculate Spread
    home_score = game['PTS_home'].mean()
    visitor_score = game['PTS_away'].mean()
    spread = home_score - visitor_score

# Write to DataFrame
    spread_predict.append(spread)
    record_predict.append(record)
    score_predict.append(score)
    hscore.append(home_score)
    vscore.append(visitor_score)

0.5564612731303595 0.5813953488372093
0.6892064181305717 0.6739130434782609
0.46827121949021155 0.875
0.735984433466782 0.35135135135135137
0.688869274766434 0.9333333333333333
0.7200753065578502 0.5405405405405406
0.6379742202198482 0.3
0.7067982564774951 0.43243243243243246
0.7039681339121305 0.6774193548387096
0.6089330672051907 0.5121951219512195
0.7702163085492096 0.6896551724137931
0.6061087537756662 0.55
0.6051810709660849 0.8648648648648649
0.6569581358904673 0.59375
0.6532933847070904 0.6428571428571429
0.6179915725991282 0.25
0.7127687604378442 0.6136363636363636
0.49181709995739226 0.8620689655172413
0.6447687470008318 0.6875
0.6740979927033437 0.2903225806451613
0.4531628150185075 0.7727272727272727
0.6367949971040734 0.4375
0.7453232171067576 0.4444444444444444
0.742781922775201 0.6842105263157895
0.7321757199968927 0.6875
0.5972458517267629 0.6875
0.6452018321496189 0.55
0.5966844563171962 0.7368421052631579
0.68742191492205 0.7333333333333333
0.838910004767177 0.5625
0.7

## Populate matchups dataframe, write to CSV

In [194]:
matchups['Home_Score'] = hscore
matchups['Visitor_Score'] = vscore
matchups['R-Coefficient'] = score_predict
matchups['Historical Home Win Record'] = record_predict

holder = []
logic = []

for i in range (len(matchups)):
    if matchups.iloc[i]['Historical Home Win Record'] > 0.5:
        holder.append(matchups.iloc[i]['Visitor'])
    else:
        holder.append(matchups.iloc[i]['Home'])
            
#Calculate Logic - 1 if correct, 0 if incorrect
    if holder[i] == matchups.iloc[i]['Winner']:
        logic.append(1)
    else:
        logic.append(0)

matchups['Prediction'] = holder
matchups['Logic'] = logic
matchups.to_csv('matchup_output_early.csv')
#matchups prediction accuracy
print((sum(logic) / len(logic)))

0.5633802816901409


## Mirror Modeling (Home / Visitor Teams reversed)

In [195]:
spread_predict = []
record_predict = []
score_predict = []
winner_predict = []
hscore = []
vscore = []

for x in range(len(matchups)):
      
    home = matchups.iloc[x]['Home']
    visitor = matchups.iloc[x]['Visitor']
    game = games

    for i in range(len(teams)):
        if teams.iloc[i]['ABBREVIATION'] == home:
            home_id = teams.iloc[i]['TEAM_ID']

        if teams.iloc[i]['ABBREVIATION'] == visitor:
            visitor_id = teams.iloc[i]['TEAM_ID']

#Narrow games to matched home, visitor teams, create a mirror dataset reversing home / away teams (added data)
    
    game = game[game['HOME_TEAM_ID'] == visitor_id]
    game = game[game['VISITOR_TEAM_ID'] == home_id]
    
    
#Drop dates, team_ids, game_ids, season - will not be useful
    game.drop(['GAME_ID','HOME_TEAM_ID', 
            'VISITOR_TEAM_ID', 'SEASON'], axis=1)
    game.dropna(inplace=True)


#Split Dataset for training, test
    X = game.drop("HOME_TEAM_WINS", axis=1)
    y = game["HOME_TEAM_WINS"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

#Linear Regression 
    model = ElasticNet()
    model_mirror = ElasticNet()
    
# Fitting our model with all of our features in X
    model.fit(X, y)
    
    score = model.score(X, y)
    predict = pd.DataFrame({"Prediction": model.predict(X_test), "Actual": y_test}).reset_index(drop=True)
    contribution = pd.DataFrame({"Parameter": list(X.columns), "Weight": model.coef_ }).reset_index(drop=True)

    
# Calculate Predicted Win%
    wins = 0
    for j in range(len(predict)):
        wins = wins + round(predict.iloc[j]['Prediction'])
    record = wins / len(predict)

# Calculate Spread
    home_score = game['PTS_home'].mean()
    visitor_score = game['PTS_away'].mean()
    spread = home_score - visitor_score

# Write to DataFrame
    spread_predict.append(spread)
    record_predict.append(record)
    score_predict.append(score)
    hscore.append(home_score)
    vscore.append(visitor_score)

## Mirror Populate matchups dataframe, write to CSV

In [196]:
matchups['Home_Score'] = hscore
matchups['Visitor_Score'] = vscore
matchups['R-Coefficient'] = score_predict
matchups['Historical Home Win Record'] = record_predict

holder = []
logic = []

for i in range (len(matchups)):
    if matchups.iloc[i]['Historical Home Win Record'] > 0.5:
        holder.append(matchups.iloc[i]['Visitor'])
    else:
        holder.append(matchups.iloc[i]['Home'])
    #Calculate Logic - 1 if correct, 0 if incorrect
    if holder[i] == matchups.iloc[i]['Winner']:
        logic.append(1)
    else:
        logic.append(0)

matchups['Prediction'] = holder
matchups['Logic'] = logic
matchups.to_csv('matchup_output_early_mirror.csv')
#matchups prediction accuracy
print(sum(logic) / len(logic))

0.5492957746478874
