In [2]:
import sys
!{sys.executable} -m pip install sportsipy

Defaulting to user installation because normal site-packages is not writeable


In [3]:
from sportsipy.nfl.boxscore import Boxscores, Boxscore
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# sklearn utilities
from sklearn import datasets
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import preprocessing

# sklearn models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

In [5]:
def game_data(game_df,game_stats):
    try:
        away_team_df = game_df[['away_name', 'away_abbr', 'away_score']].rename(columns = {'away_name': 'team_name', 'away_abbr': 'team_abbr', 'away_score': 'score'})
        home_team_df = game_df[['home_name','home_abbr', 'home_score']].rename(columns = {'home_name': 'team_name', 'home_abbr': 'team_abbr', 'home_score': 'score'})
        try:
            if game_df.loc[0,'away_score'] > game_df.loc[0,'home_score']:
                away_team_df = pd.merge(away_team_df, pd.DataFrame({'game_won' : [1], 'game_lost' : [0]}),left_index = True, right_index = True)
                home_team_df = pd.merge(home_team_df, pd.DataFrame({'game_won' : [0], 'game_lost' : [1]}),left_index = True, right_index = True)
            elif game_df.loc[0,'away_score'] < game_df.loc[0,'home_score']:
                away_team_df = pd.merge(away_team_df, pd.DataFrame({'game_won' : [0], 'game_lost' : [1]}),left_index = True, right_index = True)
                home_team_df = pd.merge(home_team_df, pd.DataFrame({'game_won' : [1], 'game_lost' : [0]}),left_index = True, right_index = True)
            else: 
                away_team_df = pd.merge(away_team_df, pd.DataFrame({'game_won' : [0], 'game_lost' : [0]}),left_index = True, right_index = True)
                home_team_df = pd.merge(home_team_df, pd.DataFrame({'game_won' : [0], 'game_lost' : [0]}),left_index = True, right_index = True)
        except TypeError:
                away_team_df = pd.merge(away_team_df, pd.DataFrame({'game_won' : [np.nan], 'game_lost' : [np.nan]}),left_index = True, right_index = True)
                home_team_df = pd.merge(home_team_df, pd.DataFrame({'game_won' : [np.nan], 'game_lost' : [np.nan]}),left_index = True, right_index = True)        

        away_stats_df = game_stats.dataframe[['away_first_downs', 'away_fourth_down_attempts',
               'away_fourth_down_conversions', 'away_fumbles', 'away_fumbles_lost',
               'away_interceptions', 'away_net_pass_yards', 'away_pass_attempts',
               'away_pass_completions', 'away_pass_touchdowns', 'away_pass_yards',
               'away_penalties', 'away_points', 'away_rush_attempts',
               'away_rush_touchdowns', 'away_rush_yards', 'away_third_down_attempts',
               'away_third_down_conversions', 'away_time_of_possession',
               'away_times_sacked', 'away_total_yards', 'away_turnovers',
               'away_yards_from_penalties', 'away_yards_lost_from_sacks']].reset_index().drop(columns ='index').rename(columns = {
               'away_first_downs': 'first_downs', 'away_fourth_down_attempts':'fourth_down_attempts',
               'away_fourth_down_conversions':'fourth_down_conversions' , 'away_fumbles': 'fumbles', 'away_fumbles_lost': 'fumbles_lost',
               'away_interceptions': 'interceptions', 'away_net_pass_yards':'net_pass_yards' , 'away_pass_attempts': 'pass_attempts',
               'away_pass_completions':'pass_completions' , 'away_pass_touchdowns': 'pass_touchdowns', 'away_pass_yards': 'pass_yards',
               'away_penalties': 'penalties', 'away_points': 'points', 'away_rush_attempts': 'rush_attempts',
               'away_rush_touchdowns': 'rush_touchdowns', 'away_rush_yards': 'rush_yards', 'away_third_down_attempts': 'third_down_attempts',
               'away_third_down_conversions': 'third_down_conversions', 'away_time_of_possession': 'time_of_possession',
               'away_times_sacked': 'times_sacked', 'away_total_yards': 'total_yards', 'away_turnovers': 'turnovers',
               'away_yards_from_penalties':'yards_from_penalties', 'away_yards_lost_from_sacks': 'yards_lost_from_sacks'})

        home_stats_df = game_stats.dataframe[['home_first_downs', 'home_fourth_down_attempts',
               'home_fourth_down_conversions', 'home_fumbles', 'home_fumbles_lost',
               'home_interceptions', 'home_net_pass_yards', 'home_pass_attempts',
               'home_pass_completions', 'home_pass_touchdowns', 'home_pass_yards',
               'home_penalties', 'home_points', 'home_rush_attempts',
               'home_rush_touchdowns', 'home_rush_yards', 'home_third_down_attempts',
               'home_third_down_conversions', 'home_time_of_possession',
               'home_times_sacked', 'home_total_yards', 'home_turnovers',
               'home_yards_from_penalties', 'home_yards_lost_from_sacks']].reset_index().drop(columns = 'index').rename(columns = {
               'home_first_downs': 'first_downs', 'home_fourth_down_attempts':'fourth_down_attempts',
               'home_fourth_down_conversions':'fourth_down_conversions' , 'home_fumbles': 'fumbles', 'home_fumbles_lost': 'fumbles_lost',
               'home_interceptions': 'interceptions', 'home_net_pass_yards':'net_pass_yards' , 'home_pass_attempts': 'pass_attempts',
               'home_pass_completions':'pass_completions' , 'home_pass_touchdowns': 'pass_touchdowns', 'home_pass_yards': 'pass_yards',
               'home_penalties': 'penalties', 'home_points': 'points', 'home_rush_attempts': 'rush_attempts',
               'home_rush_touchdowns': 'rush_touchdowns', 'home_rush_yards': 'rush_yards', 'home_third_down_attempts': 'third_down_attempts',
               'home_third_down_conversions': 'third_down_conversions', 'home_time_of_possession': 'time_of_possession',
               'home_times_sacked': 'times_sacked', 'home_total_yards': 'total_yards', 'home_turnovers': 'turnovers',
               'home_yards_from_penalties':'yards_from_penalties', 'home_yards_lost_from_sacks': 'yards_lost_from_sacks'})

        away_team_df = pd.merge(away_team_df, away_stats_df,left_index = True, right_index = True)
        home_team_df = pd.merge(home_team_df, home_stats_df,left_index = True, right_index = True)
        try:
            away_team_df['time_of_possession'] = (int(away_team_df['time_of_possession'].loc[0][0:2]) * 60) + int(away_team_df['time_of_possession'].loc[0][3:5])
            home_team_df['time_of_possession'] = (int(home_team_df['time_of_possession'].loc[0][0:2]) * 60) + int(home_team_df['time_of_possession'].loc[0][3:5])
        except TypeError:
            away_team_df['time_of_possession'] = np.nan
            home_team_df['time_of_possession'] = np.nan
    except TypeError:
        away_team_df = pd.DataFrame()
        home_team_df = pd.DataFrame()
    return away_team_df, home_team_df

In [6]:
def game_data_up_to_week(weeks,year):
    weeks_games_df = pd.DataFrame()
    for w in range(len(weeks)):
        date_string = str(weeks[w]) + '-' + str(year)
        week_scores = Boxscores(weeks[w],year)
        week_games_df = pd.DataFrame()
        for g in range(len(week_scores.games[date_string])):
            game_str = week_scores.games[date_string][g]['boxscore']
            game_stats = Boxscore(game_str)
            game_df = pd.DataFrame(week_scores.games[date_string][g], index = [0])
            away_team_df, home_team_df = game_data(game_df,game_stats)
            away_team_df['week'] = weeks[w]
            home_team_df['week'] = weeks[w]
            week_games_df = pd.concat([week_games_df,away_team_df])
            week_games_df = pd.concat([week_games_df,home_team_df])
        weeks_games_df = pd.concat([weeks_games_df,week_games_df])
    return weeks_games_df

In [59]:
df_1995 = game_data_up_to_week(range(1, 16), 1995)

In [60]:
df_2000 = game_data_up_to_week(range(1, 16), 2000)

In [61]:
df_2005 = game_data_up_to_week(range(1, 16), 2005)

In [62]:
df_2010 = game_data_up_to_week(range(1, 16), 2010)

In [63]:
df_2015 = game_data_up_to_week(range(1, 16), 2015)

In [64]:
df_2019 = game_data_up_to_week(range(1,16), 2019)

In [65]:
# 1995 Season
y_1995 = df_1995["game_won"]
x_1995 = df_1995[list(df_1995.columns)[5:len(df_1995)]]

# 2000 Season
y_2000 = df_2000["game_won"]
x_2000 = df_2000[list(df_2000.columns)[5:len(df_2000)]]

# 2005 Season
y_2005 = df_2005["game_won"]
x_2005 = df_2005[list(df_2005.columns)[5:len(df_2005)]]

# 2010 Season
y_2010 = df_2010["game_won"]
x_2010 = df_2010[list(df_2010.columns)[5:len(df_2010)]]

# 2015 Season
y_2015 = df_2015["game_won"]
x_2015 = df_2015[list(df_2015.columns)[5:len(df_2015)]]

# 2019 Season
y_2019 = df_2019["game_won"]
x_2019 = df_2019[list(df_2019.columns)[5:len(df_2019)]]

In [66]:
# 1995 Season Data split
x_1995_train, x_1995_test, y_1995_train, y_1995_test = train_test_split(x_1995, y_1995)

# 2000 Season Data split
x_2000_train, x_2000_test, y_2000_train, y_2000_test = train_test_split(x_2000, y_2000)

# 2005 Season Data split
x_2005_train, x_2005_test, y_2005_train, y_2005_test = train_test_split(x_2005, y_2005)

# 2010 Season Data split
x_2010_train, x_2010_test, y_2010_train, y_2010_test = train_test_split(x_2010, y_2010)

# 2015 Season Data split
x_2015_train, x_2015_test, y_2015_train, y_2015_test = train_test_split(x_2015, y_2015)

# 2019 Season Data split
x_2019_train, x_2019_test, y_2019_train, y_2019_test = train_test_split(x_2019, y_2019)

In [79]:
# Set up model for 1995 season
clf_1995 = LogisticRegression(max_iter=5000)
clf_1995.fit(x_1995_train, y_1995_train)

# Set up model for 2000 season
clf_2000 = LogisticRegression(max_iter=5000)
clf_2000.fit(x_2000_train, y_2000_train)

# Set up model for 2005 season
clf_2005 = LogisticRegression(max_iter=5000)
clf_2005.fit(x_2005_train, y_2005_train)

# Set up model for 2010 season
clf_2010 = LogisticRegression(max_iter=5000)
clf_2010.fit(x_2010_train, y_2010_train)

# Set up model for 2015 season
clf_2015 = LogisticRegression(max_iter=5000)
clf_2015.fit(x_2015_train, y_2015_train)

# Set up model for 2019 season
clf_2019 = LogisticRegression(max_iter=5000)
clf_2019.fit(x_2019_train, y_2019_train)

LogisticRegression(max_iter=5000)

In [94]:
# Predict 1995 games based on 1995 model
clf_1995.predict(x_1995_test)
print("1995 score:", clf_1995.score(x_1995_test, y_1995_test))

# Predict 2000 games based on 2000 model
clf_2000.predict(x_2000_test)
print("2000 score:", clf_2000.score(x_2000_test, y_2000_test))

# Predict 2005 games based on 2005 model
clf_2005.predict(x_2005_test)
print("2005 score:", clf_2005.score(x_2005_test, y_2005_test))

# Predict 2010 games based on 2010 model
clf_2010.predict(x_2010_test)
print("2010 score:", clf_2010.score(x_2010_test, y_2010_test))

# Predict 2015 games based on 2015 model
clf_2015.predict(x_2015_test)
print("2015 score:", clf_2015.score(x_2015_test, y_2015_test))

# Predict 2019 games based on 2019 model
clf_2019.predict(x_2019_test)
print("2019 score:", clf_2019.score(x_2019_test, y_2019_test))

1995 score: 0.9142857142857143
2000 score: 0.8807339449541285
2005 score: 0.8214285714285714
2010 score: 0.8660714285714286
2015 score: 0.8392857142857143
2019 score: 0.8303571428571429
