In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pp

pd.options.display.max_rows = 100

In [2]:
data = pd.read_csv('all_teams.csv')
data = data[data['situation'] == 'all']

In [3]:
#just look at the regular season of one year
specific = data.loc[(data['playoffGame'] == 0)]
#select only the data we care about
specific = specific[['team','opposingTeam','gameId','gameDate','season','home_or_away','xGoalsFor','xGoalsAgainst','goalsFor','goalsAgainst','hitsFor','hitsAgainst']]
#determing win vs loss
specific['win'] = data['goalsFor'] > data['goalsAgainst']
#sort the df by team and then by date
specific = specific.sort_values(['team','gameDate']).reset_index(drop=True)

#how many previous games to calculate the avg from
avgSize = 5

specific[['season','gameId','team','gameDate','goalsFor','goalsAgainst','win']].head(20)

Unnamed: 0,season,gameId,team,gameDate,goalsFor,goalsAgainst,win
0,2008,2008020008,ANA,20081009,1.0,4.0,False
1,2008,2008020030,ANA,20081012,2.0,4.0,False
2,2008,2008020042,ANA,20081014,3.0,6.0,False
3,2008,2008020048,ANA,20081015,2.0,3.0,False
4,2008,2008020061,ANA,20081017,4.0,0.0,True
5,2008,2008020077,ANA,20081019,1.0,3.0,False
6,2008,2008020083,ANA,20081021,2.0,2.0,False
7,2008,2008020099,ANA,20081024,4.0,3.0,True
8,2008,2008020107,ANA,20081025,6.0,4.0,True
9,2008,2008020122,ANA,20081027,3.0,2.0,True


In [4]:
# create new avg metrics
metrics = ['daysSinceLast','points','winStreak','avgGF','avgGA','avgxGF','avgxGA','avgHF','avgHA']
avgMetrics = [('goalsFor','avgGF'),('goalsAgainst','avgGA'),('xGoalsFor','avgxGF'),('xGoalsAgainst','avgxGA'),('hitsFor','avgHF'),('hitsAgainst','avgHA')]

for metric in metrics: 
    specific[metric] = np.zeros(len(specific))

In [5]:
#create game numbers
specific.loc[0,'game'] = 1
for i in range(0,len(specific)-1):
    if (specific.loc[i,'gameDate'] > specific.loc[i+1,'gameDate'] or specific.loc[i,'season'] != specific.loc[i+1,'season']):
        #reset the start of each season game number to 1
        specific.loc[i+1,'game'] = 1
    else:
        #game number (for simplicity purposes)
        specific.loc[i+1,'game'] = 1+ specific.loc[i,'game']

In [6]:
# fill the new metrics based on previous game data
for i in range(0,len(specific)-1):
    if (specific.loc[i,'game'] > specific.loc[i+1,'game']):
        #reset each stat to 0 at the start of each team's data
        for metric in metrics:
            specific.loc[i+1,metric] = 0
    else:
        #days since last game
        lastGame = pd.to_datetime(specific.loc[i,'gameDate'], format='%Y%m%d')
        thisGame = pd.to_datetime(specific.loc[i+1,'gameDate'], format='%Y%m%d')
        specific.loc[i+1,'daysSinceLast'] = (thisGame - lastGame).days
        #points
        specific.loc[i+1,'points'] += (2*specific.loc[i,'win'] + specific.loc[i,'points'])
        #winstreak
        if specific.loc[i,'win']:
            specific.loc[i+1,'winStreak'] = specific.loc[i,'winStreak'] + 1
        else:
            specific.loc[i+1,'winStreak'] = 0
            
        #compute all the average metrics using the list
        #avgMetrics = ['avgGF','avgGA','avgxGF','avgxGA','avgHF','avgHA']
        for metric,avgMetric in avgMetrics:
            value = 0
            #only look at games after avgSize games (i.e. if avgSize = 10, then only look at games after 10)
            if specific.loc[i+1,'game'] > avgSize:
                start,end = i+1 - avgSize, i+1
                for gameIndex in range(start,end):
                    value += specific.loc[gameIndex,metric]
                specific.loc[i+1,avgMetric] = value / avgSize
            else:
                specific.loc[i+1,avgMetric] = np.nan

In [7]:
#fix an error that the first game data would be all zeros
#specific = specific.loc[specific['game'] != 1].reset_index(drop=True)
specific

Unnamed: 0,team,opposingTeam,gameId,gameDate,season,home_or_away,xGoalsFor,xGoalsAgainst,goalsFor,goalsAgainst,...,daysSinceLast,points,winStreak,avgGF,avgGA,avgxGF,avgxGA,avgHF,avgHA,game
0,ANA,S.J,2008020008,20081009,2008,AWAY,1.271,3.628,1.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,1.0
1,ANA,ARI,2008020030,20081012,2008,HOME,2.298,1.895,2.0,4.0,...,3.0,0.0,0.0,,,,,,,2.0
2,ANA,L.A,2008020042,20081014,2008,AWAY,1.573,3.709,3.0,6.0,...,2.0,0.0,0.0,,,,,,,3.0
3,ANA,EDM,2008020048,20081015,2008,HOME,2.219,2.177,2.0,3.0,...,1.0,0.0,0.0,,,,,,,4.0
4,ANA,S.J,2008020061,20081017,2008,HOME,2.150,2.670,4.0,0.0,...,2.0,0.0,0.0,,,,,,,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34371,WSH,VGK,2022020745,20230121,2022,AWAY,1.288,2.216,2.0,6.0,...,2.0,50.0,1.0,2.8,3.0,3.5660,2.8118,29.8,30.0,49.0
34372,WSH,COL,2022020764,20230124,2022,AWAY,3.247,2.006,2.0,3.0,...,3.0,50.0,0.0,2.6,3.2,3.3702,2.3110,32.2,29.8,50.0
34373,WSH,PIT,2022020773,20230126,2022,HOME,4.596,2.129,2.0,2.0,...,2.0,50.0,0.0,2.8,3.2,3.1564,2.1664,30.2,27.2,51.0
34374,WSH,TOR,2022020800,20230129,2022,AWAY,2.635,3.703,1.0,5.0,...,3.0,50.0,0.0,2.4,3.0,3.3898,2.0624,26.6,24.2,52.0


In [94]:
#

In [15]:
#avg point columns
specific['avgPPG'] = np.zeros(len(specific))

for i in range(0,len(specific)-1):
    if (specific.loc[i,'game'] > specific.loc[i+1,'game']):
        #reset avgPPG to 0 at the start of each team's data
        specific.loc[i+1,'avgPPG'] = 0
    else:
        totalPointsInSet = 0
        if specific.loc[i+1,'game'] > avgSize: 
            start,end = i+1 - avgSize,i+1
            for gameIndex in range(start,end):
                totalPointsInSet += 2*specific.loc[gameIndex,'win']
            specific.loc[i+1,'avgPPG'] = totalPointsInSet / avgSize
        else:
            specific.loc[i+1,avgMetric] = np.nan    
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  specific['avgPPG'] = np.zeros(len(specific))


KeyError: 1

In [14]:
differentials = [('avgGoalDif','avgGF','avgGA'),('avgXGoalDif','avgxGF','avgxGA')]
for dif,For,Against in differentials:
    specific[dif] = specific[For] - specific[Against]
#specific

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  specific[dif] = specific[For] - specific[Against]


In [28]:
matchups = specific[['gameId','gameDate','game','team','home_or_away','avgGoalDif','avgXGoalDif','avgHF','points','winStreak','avgPPG','daysSinceLast','goalsFor']]
matchupMetrics = ['home_or_away','avgGoalDif','avgXGoalDif','avgHF','points','winStreak','avgPPG','daysSinceLast','goalsFor']

# def matchup(gameId):
#     #return matchups[['gameDate','team','home_or_away','avgGoalDif','avgXGoalDif','avgHF','points','winStreak','avgPPG','daysSinceLast','goalsFor']][gameId]   
#     analysis = {}
#     for metric in matchupMetrics:
#         analysis[metric] = (matchups.loc[gameId,metric].tolist())
#     return analysis
oneTeam = specific[['gameId','gameDate','game','team','home_or_away','avgGoalDif','avgXGoalDif','avgHF','points','winStreak','avgPPG','daysSinceLast','goalsFor','goalsAgainst']].set_index(['gameId'])

Wall time: 62 ms


{'home_or_away': ['AWAY', 'HOME'],
 'avgGoalDif': [-2.3999999999999995, 0.20000000000000018],
 'avgXGoalDif': [-0.6018000000000003, -0.5234000000000001],
 'avgHF': [16.4, 15.2],
 'points': [2.0, 8.0],
 'winStreak': [0.0, 0.0],
 'avgPPG': [0.0, 0.4],
 'daysSinceLast': [2.0, 3.0],
 'goalsFor': [1.0, 6.0]}

In [31]:
oneTeam['Win'] = oneTeam['goalsFor'] > oneTeam['goalsAgainst']
del oneTeam['goalsFor'],oneTeam['goalsAgainst'],oneTeam['gameDate'],oneTeam['game'],oneTeam['points']

KeyError: 'goalsFor'

In [39]:
#matchups.to_csv('cleanedNHLData.csv')
oneTeam = oneTeam.dropna()
#del oneTeam['team']
oneTeam['home'] = oneTeam['home_or_away'] == 'HOME'
oneTeam.to_csv('oneTeamData.csv')

In [40]:
mlData = matchups.drop('gameDate',axis=1).drop('team',axis=1).drop('points',axis=1)

mlData = mlData.sort_values(['gameId','home_or_away'],ascending=False)
mlData = mlData.loc[mlData['game']!=1].dropna().drop('game',axis=1)
mlHome = mlData.loc[mlData['home_or_away'] == 'HOME'].drop('home_or_away',axis=1)
mlAway = mlData.loc[mlData['home_or_away'] == 'AWAY'].drop('home_or_away',axis=1)

mlData = (mlHome - mlAway).dropna()
#mlData['WinLoss'] = (mlData['goalsFor'] > 0)
mlData.tail(150)

Unnamed: 0,gameId,avgGoalDif,avgXGoalDif,avgHF,winStreak,avgPPG,daysSinceLast,goalsFor


In [16]:
#mlData.to_csv('mlData.csv')

In [194]:
matchups.loc[2011021070]

Unnamed: 0_level_0,gameDate,game,team,home_or_away,avgGoalDif,avgXGoalDif,avgHF,points,winStreak,avgPPG,daysSinceLast,goalsFor
gameId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2011021070,20120317,74.0,COL,AWAY,0.375,0.087125,21.25,58.0,0.0,0.75,2.0,3.0
2011021070,20120317,71.0,NYR,HOME,-0.875,-0.526375,28.5,82.0,0.0,0.75,2.0,1.0


In [195]:
mlData.loc[2011021070]

avgGoalDif      -1.2500
avgXGoalDif     -0.6135
avgHF            7.2500
winStreak        0.0000
avgPPG           0.0000
daysSinceLast    0.0000
goalsFor        -2.0000
Name: 2011021070, dtype: float64