In [1]:
#Importing Required Libraries:
import pandas as pd
import numpy as np

In [61]:
FiveThirtyEight = pd.read_csv("FiveThirtyEight.csv", index_col = 0)
ESPN = pd.read_csv("ESPN_PastData.csv", index_col = 0)

In [62]:
#Data Cleaning: removing unnecessary columns from the dataframes, removing unused old data, and removing bad rows (with no win probability):
#FiveThirtyEight['year'] = FiveThirtyEight['date'].to_numeric()

#Converting Datatypes of Columns:
FiveThirtyEight.astype({'season': 'int32', 'fivethirtyeight_home_wp': 'float'})
ESPN.astype({'season': 'int32', 'winpb_home': 'float', 'winpb_away': 'float'})
FiveThirtyEight['date'] = pd.to_datetime(FiveThirtyEight['date'])
ESPN['game_date'] = pd.to_datetime(ESPN['game_date'])

#Adding in Columns for Year, Month, Day:
ESPN['year'], ESPN['month'], ESPN['day'] = ESPN['game_date'].dt.year, ESPN['game_date'].dt.month, ESPN['game_date'].dt.day
FiveThirtyEight['year'], FiveThirtyEight['month'], FiveThirtyEight['day'] = FiveThirtyEight['date'].dt.year, FiveThirtyEight['date'].dt.month, FiveThirtyEight['date'].dt.day

#Filtering 538 to only include data of past 5 years:
options = [2018,2019,2020,2021,2022] 
FiveThirtyEight = FiveThirtyEight.loc[FiveThirtyEight['season'].isin(options)] 

In [63]:
#Renaming Columns:
FiveThirtyEight = FiveThirtyEight.rename(columns={"team1": "home_team", "team2": "away_team", 'fivethirtyeight_home_wp' : "538_home_wp"})
ESPN = ESPN.rename(columns={"home_team_abb": "home_team", "away_team_abb": "away_team", "winpb_home": "ESPN_home_wp", "winpb_away": "ESPN_away_wp"})

In [64]:
#Adding Column to 538 Data for away win probabilities:
FiveThirtyEight['538_away_wp'] = 1 - FiveThirtyEight['538_home_wp']

In [65]:
#Converting date column to be in same format as 538 data:
ESPN['date'] = pd.to_datetime(ESPN["game_date"].dt.strftime('%Y-%m-%d'))

In [66]:
#Removing unnecessary columns:
ESPN = ESPN[['date', 'season', 'home_team', 'away_team', 'ESPN_home_wp', 'ESPN_away_wp']]
FiveThirtyEight = FiveThirtyEight[['date', 'season', 'home_team', 'away_team', '538_home_wp', '538_away_wp']]

In [67]:
#Changing Team Names to Match Between Two Dataframes:
teams_dict = {'CHA' : 'CHO', 'PHX' : 'PHO', 'BKN' : 'BRK', 'GS' : 'GSW', 'UTAH' : 'UTA', 'NO' : 'NOP', 'WSH' : 'WAS', 'NY' : 'NYK', 'SA' : 'SAS'}
ESPN = ESPN.replace({'home_team' : teams_dict})
ESPN = ESPN.replace({'away_team': teams_dict})
FiveThirtyEight = FiveThirtyEight.replace({'home_team' : teams_dict})
FiveThirtyEight = FiveThirtyEight.replace({'away_team': teams_dict})

In [None]:
mask = (ESPN['date'] > '2022-11-30') & (ESPN['date'] <= end_date)
ESPN.loc[mask]

In [69]:
ESPN.sort_values(by='date').tail(50)

Unnamed: 0,date,season,home_team,away_team,ESPN_home_wp,ESPN_away_wp
9361,2023-04-05,2023,DET,BRK,-1.0,-1.0
9364,2023-04-05,2023,BOS,TOR,-1.0,-1.0
9363,2023-04-05,2023,ATL,WAS,-1.0,-1.0
9365,2023-04-05,2023,MIL,CHI,-1.0,-1.0
9362,2023-04-05,2023,IND,NYK,-1.0,-1.0
9360,2023-04-05,2023,PHO,SAS,-1.0,-1.0
9355,2023-04-05,2023,HOU,DEN,-1.0,-1.0
9358,2023-04-05,2023,UTA,LAL,-1.0,-1.0
9357,2023-04-05,2023,NOP,SAC,-1.0,-1.0
9356,2023-04-05,2023,MEM,POR,-1.0,-1.0


In [53]:
#Reading in Future ESPN Data:
ESPNFuture = pd.read_csv("ESPN_CurrentGamesWeek.csv", index_col = 0)
ESPNFuture['date'] = pd.to_datetime(ESPNFuture['date'])
ESPNFuture.astype({'season': 'int32', 'wp_home': 'float', 'wp_away': 'float'})
ESPNFuture = ESPNFuture.rename(columns={"home_team_abb": "home_team", "away_team_abb": "away_team", "wp_home": "ESPN_home_wp", "wp_away": "ESPN_away_wp"})
teams_dict = {'CHA' : 'CHO', 'PHX' : 'PHO', 'BKN' : 'BRK', 'GS' : 'GSW', 'UTAH' : 'UTA', 'NO' : 'NOP', 'WSH' : 'WAS', 'NY' : 'NYK', 'SA' : 'SAS'}
ESPNFuture = ESPNFuture.replace({'home_team' : teams_dict})
ESPNFuture = ESPNFuture.replace({'away_team': teams_dict})
ESPNFuture = ESPNFuture[['date', 'season', 'home_team', 'away_team', 'ESPN_home_wp', 'ESPN_away_wp']]

#Combining Future ESPN Data with ESPN Past Data:
ESPN = ESPN.append(ESPNFuture)

In [54]:
# Filtering out bad data from ESPN:
ESPN = ESPN[ESPN['ESPN_home_wp'] != -1] 

In [60]:
ESPN.sort_values(by='date').tail(90)

Unnamed: 0,date,season,home_team,away_team,ESPN_home_wp,ESPN_away_wp
31,2022-11-18,2023,UTA,PHO,0.598,0.402
30,2022-11-18,2023,NOP,BOS,0.440,0.560
29,2022-11-18,2023,DAL,DEN,0.731,0.269
28,2022-11-18,2023,MEM,OKC,0.796,0.204
27,2022-11-18,2023,HOU,IND,0.403,0.597
...,...,...,...,...,...,...
6836,2022-11-27,2023,HOU,OKC,0.486,0.514
6841,2022-11-27,2023,LAC,IND,0.467,0.533
6847,2022-11-28,2023,MIL,DAL,0.339,0.661
6849,2022-11-29,2023,WAS,MIN,0.429,0.571


In [59]:
ESPNFuture

Unnamed: 0,date,season,home_team,away_team,ESPN_home_wp,ESPN_away_wp
1,2022-11-14,2023,DET,TOR,0.432,0.568
2,2022-11-14,2023,ORL,CHO,0.403,0.597
3,2022-11-14,2023,BOS,OKC,0.894,0.106
4,2022-11-14,2023,MIA,PHO,0.598,0.402
5,2022-11-14,2023,HOU,LAC,0.385,0.615
6,2022-11-14,2023,MIL,ATL,0.444,0.556
7,2022-11-14,2023,GSW,SAS,0.715,0.285
8,2022-11-15,2023,NOP,MEM,0.537,0.463
9,2022-11-15,2023,DAL,LAC,0.755,0.245
10,2022-11-15,2023,UTA,NYK,0.729,0.271


In [11]:
#New Dataframe that appends two datasets:
combined = pd.merge(FiveThirtyEight, ESPN, on=['date', 'season', 'home_team', 'away_team'])
combined

Unnamed: 0,date,season,home_team,away_team,538_home_wp,538_away_wp,ESPN_home_wp,ESPN_away_wp
0,2017-10-18,2018,WAS,PHI,0.838481,0.161519,0.490,0.510
1,2017-10-18,2018,WAS,PHI,0.838481,0.161519,0.490,0.510
2,2017-10-18,2018,DET,CHO,0.617821,0.382179,0.510,0.490
3,2017-10-18,2018,DET,CHO,0.617821,0.382179,0.510,0.490
4,2017-10-18,2018,ORL,MIA,0.410901,0.589099,0.510,0.490
...,...,...,...,...,...,...,...,...
1514,2022-05-10,2022,MIA,PHI,0.637228,0.362772,0.360,0.640
1515,2022-05-11,2022,BOS,MIL,0.772656,0.227344,0.183,0.817
1516,2022-05-12,2022,PHI,MIA,0.598284,0.401716,0.608,0.392
1517,2022-05-13,2022,MIL,BOS,0.511938,0.488062,0.541,0.459


In [12]:
combined['model_home_wp'] = (combined['ESPN_home_wp'] + combined['538_home_wp'])/2
combined['model_away_wp'] = (combined['ESPN_away_wp'] + combined['538_away_wp'])/2
combined

Unnamed: 0,date,season,home_team,away_team,538_home_wp,538_away_wp,ESPN_home_wp,ESPN_away_wp,model_home_wp,model_away_wp
0,2017-10-18,2018,WAS,PHI,0.838481,0.161519,0.490,0.510,0.664241,0.335759
1,2017-10-18,2018,WAS,PHI,0.838481,0.161519,0.490,0.510,0.664241,0.335759
2,2017-10-18,2018,DET,CHO,0.617821,0.382179,0.510,0.490,0.563911,0.436089
3,2017-10-18,2018,DET,CHO,0.617821,0.382179,0.510,0.490,0.563911,0.436089
4,2017-10-18,2018,ORL,MIA,0.410901,0.589099,0.510,0.490,0.460451,0.539549
...,...,...,...,...,...,...,...,...,...,...
1514,2022-05-10,2022,MIA,PHI,0.637228,0.362772,0.360,0.640,0.498614,0.501386
1515,2022-05-11,2022,BOS,MIL,0.772656,0.227344,0.183,0.817,0.477828,0.522172
1516,2022-05-12,2022,PHI,MIA,0.598284,0.401716,0.608,0.392,0.603142,0.396858
1517,2022-05-13,2022,MIL,BOS,0.511938,0.488062,0.541,0.459,0.526469,0.473531


In [13]:
def Outcome(df):
    if (df['score1'] > df['score2']):
        return df['team1']
    elif (df['score2'] > df['score1']):
        return df['team2']
    elif (df['score2'] == df['score1']):
        return 'Tie'
    else:
        return "NA"

In [14]:
#Adding in Game Outcomes
FiveThirtyEightOriginalData = pd.read_csv("https://projects.fivethirtyeight.com/nba-model/nba_elo.csv")
FiveThirtyEightOriginalData['Outcome'] = FiveThirtyEightOriginalData.apply(Outcome, axis = 1)
FiveThirtyEightOriginalData = FiveThirtyEightOriginalData[['date', 'season', 'team1', 'team2', 'Outcome']]
FiveThirtyEightOriginalData = FiveThirtyEightOriginalData.rename(columns={"team1": "home_team", "team2": "away_team"})
FiveThirtyEightOriginalData['date'] = pd.to_datetime(FiveThirtyEightOriginalData['date'])
combined = pd.merge(combined, FiveThirtyEightOriginalData, on=['date', 'season', 'home_team', 'away_team'])

In [15]:
def ModelOutcome(df):
    if (df['model_home_wp'] > df['model_away_wp']):
        return df['home_team']
    elif (df['model_away_wp'] > df['model_home_wp']):
        return df['away_team']
    elif (df['model_away_wp'] == df['model_home_wp']):
        return 'Tie'
    else:
        return "NA"

In [16]:
combined['ModelPredOutcome'] = combined.apply(ModelOutcome, axis = 1)
combined

Unnamed: 0,date,season,home_team,away_team,538_home_wp,538_away_wp,ESPN_home_wp,ESPN_away_wp,model_home_wp,model_away_wp,Outcome,ModelPredOutcome
0,2017-10-18,2018,WAS,PHI,0.838481,0.161519,0.490,0.510,0.664241,0.335759,WAS,WAS
1,2017-10-18,2018,WAS,PHI,0.838481,0.161519,0.490,0.510,0.664241,0.335759,WAS,WAS
2,2017-10-18,2018,DET,CHO,0.617821,0.382179,0.510,0.490,0.563911,0.436089,DET,DET
3,2017-10-18,2018,DET,CHO,0.617821,0.382179,0.510,0.490,0.563911,0.436089,DET,DET
4,2017-10-18,2018,ORL,MIA,0.410901,0.589099,0.510,0.490,0.460451,0.539549,ORL,MIA
...,...,...,...,...,...,...,...,...,...,...,...,...
1514,2022-05-10,2022,MIA,PHI,0.637228,0.362772,0.360,0.640,0.498614,0.501386,MIA,PHI
1515,2022-05-11,2022,BOS,MIL,0.772656,0.227344,0.183,0.817,0.477828,0.522172,MIL,MIL
1516,2022-05-12,2022,PHI,MIA,0.598284,0.401716,0.608,0.392,0.603142,0.396858,MIA,PHI
1517,2022-05-13,2022,MIL,BOS,0.511938,0.488062,0.541,0.459,0.526469,0.473531,BOS,MIL


In [17]:
combined['Accurate?'] = combined['Outcome'] == combined['ModelPredOutcome']
combined

Unnamed: 0,date,season,home_team,away_team,538_home_wp,538_away_wp,ESPN_home_wp,ESPN_away_wp,model_home_wp,model_away_wp,Outcome,ModelPredOutcome,Accurate?
0,2017-10-18,2018,WAS,PHI,0.838481,0.161519,0.490,0.510,0.664241,0.335759,WAS,WAS,True
1,2017-10-18,2018,WAS,PHI,0.838481,0.161519,0.490,0.510,0.664241,0.335759,WAS,WAS,True
2,2017-10-18,2018,DET,CHO,0.617821,0.382179,0.510,0.490,0.563911,0.436089,DET,DET,True
3,2017-10-18,2018,DET,CHO,0.617821,0.382179,0.510,0.490,0.563911,0.436089,DET,DET,True
4,2017-10-18,2018,ORL,MIA,0.410901,0.589099,0.510,0.490,0.460451,0.539549,ORL,MIA,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1514,2022-05-10,2022,MIA,PHI,0.637228,0.362772,0.360,0.640,0.498614,0.501386,MIA,PHI,False
1515,2022-05-11,2022,BOS,MIL,0.772656,0.227344,0.183,0.817,0.477828,0.522172,MIL,MIL,True
1516,2022-05-12,2022,PHI,MIA,0.598284,0.401716,0.608,0.392,0.603142,0.396858,MIA,PHI,False
1517,2022-05-13,2022,MIL,BOS,0.511938,0.488062,0.541,0.459,0.526469,0.473531,BOS,MIL,False


In [15]:
list1 = FiveThirtyEight['home_team'].unique().tolist()

In [16]:
list2 = ESPN['home_team'].unique().tolist()

In [17]:
inESPNbutNot538 = list(set(list2) - set(list1))

In [18]:
inESPNbutNot538

['USA']

In [19]:
in538butNotESPN = list(set(list1) - set(list2))

In [20]:
in538butNotESPN

[]

In [13]:
combined['home_team'].unique().shape

NameError: name 'combined' is not defined

In [14]:
ESPN['home_team'].unique().shape

(31,)

In [12]:
FiveThirtyEight['home_team'].unique().shape

(30,)

In [79]:
#New Dataframe that appends two datasets:
combined = pd.merge(ESPN, FiveThirtyEight, on=['season', 'home_team', 'away_team'])
combined

Unnamed: 0,date_x,season,home_team,away_team,ESPN_home_wp,ESPN_away_wp,date_y,538_home_wp,538_away_wp
0,2022-01-01,2022,MIL,NOP,0.778,0.222,2022-01-01,0.812719,0.187281
1,2022-01-02,2022,DET,SAS,0.225,0.775,2022-01-01,0.314551,0.685449
2,2022-01-02,2022,WAS,CHI,0.580,0.420,2022-01-01,0.495383,0.504617
3,2022-01-02,2022,WAS,CHI,0.580,0.420,2022-03-29,0.493970,0.506030
4,2022-03-29,2022,WAS,CHI,0.582,0.418,2022-01-01,0.495383,0.504617
...,...,...,...,...,...,...,...,...,...
8284,2018-11-07,2019,CHA,ATL,0.864,0.136,2018-11-06,0.818829,0.181171
8285,2018-11-07,2019,CHA,ATL,0.864,0.136,2018-11-28,0.852149,0.147851
8286,2018-11-07,2019,DAL,WAS,0.605,0.395,2018-11-06,0.513461,0.486539
8287,2018-11-07,2019,PHX,BKN,0.479,0.521,2018-11-06,0.435605,0.564395


In [57]:
 %run modelDataFrameBuilder.py