In [5]:
import numpy as np
import pandas as pd 



In [6]:
# Helper functions
# This will balance the prior of preseason rank with the actual win percentage

lower_bound_week = 5
upper_bound_week = 9
def get_team_ranking(x, home_away):
    if x['schedule_week'] < lower_bound_week:
        return x[f'Predictive_{home_away}']
    elif lower_bound_week <= x['schedule_week'] <= upper_bound_week:
        return x[f'Predictive_{home_away}'] * (1-((x['schedule_week']-(lower_bound_week-1))/(upper_bound_week-lower_bound_week+1))) + x[f'rank_winpct_{home_away}'] * ((x['schedule_week']-(lower_bound_week-1))/(upper_bound_week-lower_bound_week+1))
    else:
        return x[f'rank_winpct_{home_away}']

### Obtain standings data for every week between 2012 and now

In [295]:
def clean_pfr_tables(input_df, afc_or_nfc):
    
    output_df = input_df.loc[~input_df.Tm.str.contains(afc_or_nfc)]
    output_df.Tm = output_df.Tm.str.replace('[^ 0-9a-zA-Z]+', '' ,regex=True)
    return output_df
    
def get_weekly_data(my_url):
    week_number = re.findall("(?:week=)(\d+)", my_url)[0]
    year_number = re.findall("(?:year=)(\d+)", my_url)[0]
    tables = pd.read_html(my_url)
    afc = tables[0]
    nfc = tables[1]
    
    cleaned_afc = clean_pfr_tables(afc, "AFC")
    cleaned_nfc = clean_pfr_tables(nfc, "NFC")
    
    combined_week = pd.concat([cleaned_afc, cleaned_nfc])
    
    # The standings are as of the END of the week, so we want to add one to the week to get it to line up with our schedule
    combined_week['week_number'] = int(week_number) + 1    
    combined_week['year_number'] = year_number
    
    combined_week = combined_week.reset_index(drop=True)
    
    return combined_week

weeks = list(range(1,19))
years = list(range(2012,2022))
df_list = []

#for year in years:
#    for week in weeks:
#        df_list.append(get_weekly_data(f'https://www.pro-football-reference.com/boxscores/standings.cgi?week={week}&year={year}&wk_league=NFL'))
        
#full_standings_data = pd.concat(df_list, axis=0, ignore_index = True)
#full_standings_data.to_csv("./full_standings_data.csv")

In [7]:
# Read already generated standings data

full_standings_data = pd.read_csv("./full_standings_data.csv").drop("Unnamed: 0", axis = 1)

full_standings_data[['Pts', 'PtsO', 'W', 'L', 'T', 'year_number', 'week_number']] = full_standings_data[['Pts', 'PtsO', 'W', 'L', 'T', 'year_number', 'week_number']].apply(pd.to_numeric, errors = 'coerce', axis=1)

full_standings_data['off_ppg'] = full_standings_data.Pts / (full_standings_data['W'] + full_standings_data['L'] + 
                                                                              full_standings_data['T'])

full_standings_data['def_ppg'] = full_standings_data.PtsO / (full_standings_data['W'] + full_standings_data['L'] + 
                                                                              full_standings_data['T'])

full_standings_data['rank_offense'] = full_standings_data.groupby(['year_number', 'week_number'])['off_ppg'].rank(method = 'first', ascending = False)
full_standings_data['rank_defense'] = full_standings_data.groupby(['year_number', 'week_number'])['def_ppg'].rank(method = 'first', ascending = True)

full_standings_data['rank_winpct'] = full_standings_data.groupby(['year_number', 'week_number'])['W-L%'].rank(method = 'first', ascending = False)


full_standings_data.head()

FileNotFoundError: [Errno 2] No such file or directory: './full_standings_data.csv'

Below is the code used to obtain preseason ratings data for every year between 2012 and now

In [297]:
def get_preseason_rankings(my_url, year):
    standings_data = pd.read_html(my_url)
    current_year = standings_data[0]
    current_year['season'] = year
    return current_year

#preseason_rankings_list = []
#years = list(range(2012,2022))

#for year in years:
#    preseason_rankings_list.append(get_preseason_rankings(my_url = f"https://www.teamrankings.com/nfl/rankings/teams/?date={year}-08-30", year = year))
    
#preseason_rankings_df = pd.concat(preseason_rankings_list, axis=0, ignore_index = True)
#preseason_rankings_df.to_csv("./preseason_rankings_df.csv")

In [298]:
# Read already generated preseason_ratings data

preseason_rankings_df = pd.read_csv("./preseason_rankings_df.csv")

In [5]:
teams = pd.read_csv('../input/nfl-scores-and-betting-data/nfl_teams.csv')
teams.head()

Unnamed: 0,team_name,team_name_short,team_id,team_id_pfr,team_conference,team_division,team_conference_pre2002,team_division_pre2002
0,Arizona Cardinals,Cardinals,ARI,CRD,NFC,NFC West,NFC,NFC West
1,Phoenix Cardinals,Cardinals,ARI,CRD,NFC,,NFC,NFC East
2,St. Louis Cardinals,Cardinals,ARI,ARI,NFC,,NFC,NFC East
3,Atlanta Falcons,Falcons,ATL,ATL,NFC,NFC South,NFC,NFC West
4,Baltimore Ravens,Ravens,BAL,RAV,AFC,AFC North,AFC,AFC Central


In [300]:
scores = pd.read_csv('../input/nfl-scores-and-betting-data/spreadspoke_scores.csv')
scores['schedule_week'] = pd.to_numeric(scores.schedule_week, errors = "coerce")
scores = scores.dropna(subset = ['schedule_week'])

# filter scores data
recent_scores = scores[scores["schedule_season"].isin(list(range(2012, 2022)))]

recent_scores.loc[:,'margin_of_victory'] = abs(recent_scores.loc[:,'score_home'] - recent_scores.loc[:,'score_away'])

recent_scores.loc[:, 'blowout'] = recent_scores.loc[:, 'margin_of_victory'] > 14

recent_scores.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail,margin_of_victory,blowout
10542,9/5/2012,2012,1.0,False,New York Giants,17.0,24.0,Dallas Cowboys,NYG,-4.0,46.0,MetLife Stadium,False,77.0,5.0,63.0,,7.0,False
10543,9/9/2012,2012,1.0,False,Arizona Cardinals,20.0,16.0,Seattle Seahawks,SEA,-3.0,41.0,University of Phoenix Stadium,False,72.0,0.0,,DOME,4.0,False
10544,9/9/2012,2012,1.0,False,Chicago Bears,41.0,21.0,Indianapolis Colts,CHI,-10.0,42.5,Soldier Field,False,70.0,11.0,66.0,,20.0,True
10545,9/9/2012,2012,1.0,False,Cleveland Browns,16.0,17.0,Philadelphia Eagles,PHI,-9.5,42.0,FirstEnergy Stadium,False,68.0,9.0,57.0,,1.0,False
10546,9/9/2012,2012,1.0,False,Denver Broncos,31.0,19.0,Pittsburgh Steelers,DEN,-2.0,45.5,Sports Authority Field at Mile High,False,85.0,9.0,12.0,,12.0,False


In [301]:


scores_and_standings = pd.merge(recent_scores, full_standings_data[['Tm', 'W', 'L', 'T', 'Pts', 'PtsO', 'week_number', 'year_number', 'off_ppg', 'def_ppg', 'rank_offense',
       'rank_defense', 'rank_winpct']].add_suffix("_home"), 
                                how = "left", left_on = ['schedule_season', 'schedule_week', 'team_home'], 
                                right_on=['year_number_home', 'week_number_home', 'Tm_home'])

scores_and_standings_all = pd.merge(scores_and_standings, full_standings_data[['Tm', 'W', 'L', 'T', 'Pts', 'PtsO', 'week_number', 'year_number', 'off_ppg', 'def_ppg', 'rank_offense',
       'rank_defense', 'rank_winpct']].add_suffix("_away"), 
                                how = "left", left_on = ['schedule_season', 'schedule_week', 'team_away'], 
                                right_on=['year_number_away', 'week_number_away', 'Tm_away'])

scores_and_standings_all.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,...,T_away,Pts_away,PtsO_away,week_number_away,year_number_away,off_ppg_away,def_ppg_away,rank_offense_away,rank_defense_away,rank_winpct_away
0,9/5/2012,2012,1.0,False,New York Giants,17.0,24.0,Dallas Cowboys,NYG,-4.0,...,,,,,,,,,,
1,9/9/2012,2012,1.0,False,Arizona Cardinals,20.0,16.0,Seattle Seahawks,SEA,-3.0,...,,,,,,,,,,
2,9/9/2012,2012,1.0,False,Chicago Bears,41.0,21.0,Indianapolis Colts,CHI,-10.0,...,,,,,,,,,,
3,9/9/2012,2012,1.0,False,Cleveland Browns,16.0,17.0,Philadelphia Eagles,PHI,-9.5,...,,,,,,,,,,
4,9/9/2012,2012,1.0,False,Denver Broncos,31.0,19.0,Pittsburgh Steelers,DEN,-2.0,...,,,,,,,,,,


In [302]:
scores_and_rankings = pd.merge(scores_and_standings_all, preseason_rankings_df.add_suffix("_home"), 
                                how = "left", left_on = ['schedule_season', 'team_home'], 
                                right_on=['season_home', 'Team_home'])

scores_and_rankings_all = pd.merge(scores_and_rankings, preseason_rankings_df.add_suffix("_away"), 
                                how = "left", left_on = ['schedule_season', 'team_away'], 
                                right_on=['season_away', 'Team_away'])

scores_and_rankings_all.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,...,season_home,Unnamed: 0_away,Team_away,Predictive_away,Home_away,Away_away,Last 5_away,In Div._away,SOS_away,season_away
0,9/5/2012,2012,1.0,False,New York Giants,17.0,24.0,Dallas Cowboys,NYG,-4.0,...,2012.0,19.0,Dallas Cowboys,20.0,20.0,20.0,19.0,19.0,20.0,2012.0
1,9/9/2012,2012,1.0,False,Arizona Cardinals,20.0,16.0,Seattle Seahawks,SEA,-3.0,...,2012.0,26.0,Seattle Seahawks,27.0,27.0,27.0,12.0,12.0,27.0,2012.0
2,9/9/2012,2012,1.0,False,Chicago Bears,41.0,21.0,Indianapolis Colts,CHI,-10.0,...,2012.0,20.0,Indianapolis Colts,21.0,21.0,21.0,13.0,13.0,21.0,2012.0
3,9/9/2012,2012,1.0,False,Cleveland Browns,16.0,17.0,Philadelphia Eagles,PHI,-9.5,...,2012.0,3.0,Philadelphia Eagles,4.0,4.0,4.0,8.0,8.0,4.0,2012.0
4,9/9/2012,2012,1.0,False,Denver Broncos,31.0,19.0,Pittsburgh Steelers,DEN,-2.0,...,2012.0,1.0,Pittsburgh Steelers,2.0,2.0,2.0,14.0,14.0,2.0,2012.0


In [4]:
# assumptions
bad_team_rank = 25
good_offense_rank = 8
good_defense_rank = 8
lopsided_threshold = 12

# feature engineering

# Is the team bad? 
scores_and_rankings_all.loc[:,'team_home_ranking'] = scores_and_rankings_all.apply(lambda x: get_team_ranking(x, 'home'), axis = 1)
scores_and_rankings_all.loc[:,'team_away_ranking'] = scores_and_rankings_all.apply(lambda x: get_team_ranking(x, 'away'), axis = 1)
scores_and_rankings_all.loc[:,'bad_team_involved'] = scores_and_rankings_all.apply(lambda x: True if x['team_away_ranking'] >= bad_team_rank or x['team_home_ranking'] >= bad_team_rank else False, axis = 1)

# Singular high-powered offense (if both are high, False. If one is high and defense is elite, False)
scores_and_rankings_all.loc[:, 'high_powered_offense'] = scores_and_rankings_all.apply(lambda x: True if (x['rank_offense_home'] <= good_offense_rank and x['rank_offense_away'] > good_offense_rank and x['rank_defense_away'] > good_defense_rank) or (x['rank_offense_away'] <= good_offense_rank and x['rank_offense_home'] > good_offense_rank and x['rank_defense_home'] > good_defense_rank) else False, axis = 1)

# Lopsided matchup?
scores_and_rankings_all.loc[:, 'lopsided_matchup'] = scores_and_rankings_all.apply(lambda x: True if abs(x['team_home_ranking'] - x['team_away_ranking']) >= lopsided_threshold else False, axis = 1)

# bet?
scores_and_rankings_all.loc[:, 'bet_the_game'] = scores_and_rankings_all.apply(lambda x: True if x['bad_team_involved'] == False and x['high_powered_offense'] == False and x['lopsided_matchup'] == False else False, axis = 1)

# Did we win the bet?
scores_and_rankings_all.loc[:, 'win_bet'] = scores_and_rankings_all.apply(lambda x: True if x['bet_the_game'] == True and x['blowout'] == False else False, axis = 1)
scores_and_rankings_all[scores_and_rankings_all['bet_the_game'] == True].win_bet.value_counts(normalize=True)

NameError: name 'scores_and_rankings_all' is not defined

In [3]:
scores_and_rankings_all[scores_and_rankings_all['schedule_season'] == 2021].head(10)

NameError: name 'scores_and_rankings_all' is not defined

In [306]:
scores_and_rankings_all.to_csv("./second_pass_output.csv")

In [2]:
# Exploratory Analysis

scores_and_rankings_all.select_dtypes

NameError: name 'scores_and_rankings_all' is not defined

In [1]:
# Trying unsupervised methods to group games and look at mix of blowouts/non-blowouts

from sklearn.cluster import KMeans

my_kmeans = KMeans()



In [None]:
# TODO
# How to determine if offense/defense is good on early weeks? Need preseason offense/defense ratings, 
### cannot use preseason
# Add player data
# Consider the threshold games...difference between 14 and 15 is win/loss

In [None]:
# Old code
# Is the team bad? Preseason ranking used before week 8, otherwise, use the win percentage rank

#scores_and_rankings_all['team_home_ranking_old'] = scores_and_rankings_all['Predictive_home'].where(
#    scores_and_rankings_all['schedule_week'] <= 7, scores_and_rankings_all['rank_winpct_home']
#)

#scores_and_rankings_all['team_away_ranking'] = scores_and_rankings_all['Predictive_away'].where(
#    scores_and_rankings_all['schedule_week'] <= 7, scores_and_rankings_all['rank_winpct_away']
#)