In [1]:
import pandas as pd
import numpy as np
import requests
import os
import json

pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
#Import and convert json to pd.df
df = pd.read_json("espnscores.json")

#Print basic information
print(df.head())
df.describe(include="all")

   season  week awayteam  hometeam  awayscore  homescore  \
0    2005     1  Raiders  Patriots         20         30   
1    2005     1   Texans     Bills          7         22   
2    2005     1  Bengals    Browns         27         13   
3    2005     1     Jets    Chiefs          7         27   
4    2005     1  Broncos  Dolphins         10         34   

                                           boxscore       idgame  
0  https://espn.com/nfl/boxscore/_/gameId/250908017  [250908017]  
1  https://espn.com/nfl/boxscore/_/gameId/250911002  [250911002]  
2  https://espn.com/nfl/boxscore/_/gameId/250911005  [250911005]  
3  https://espn.com/nfl/boxscore/_/gameId/250911012  [250911012]  
4  https://espn.com/nfl/boxscore/_/gameId/250911015  [250911015]  


Unnamed: 0,season,week,awayteam,hometeam,awayscore,homescore,boxscore,idgame
count,5367.0,5367.0,5367,5367,5367.0,5367.0,5367,5367
unique,,,33,33,,,5367,5367
top,,,Vikings,Bengals,,,https://espn.com/nfl/boxscore/_/gameId/250908017,[250908017]
freq,,,169,169,,,1,1
mean,2011.034097,9.115893,,,21.051239,23.227688,,
std,6.069954,5.013696,,,10.127283,10.321967,,
min,2001.0,1.0,,,0.0,0.0,,
25%,2006.0,5.0,,,14.0,16.0,,
50%,2011.0,9.0,,,20.0,23.0,,
75%,2016.0,14.0,,,27.0,30.0,,


In [4]:
# Pre-processing of the NFL Json data 

# Actions:

df.insert(df.shape[1],"score_abs",df["homescore"] - df["awayscore"])

#   - Set winner based on scores, 1 for away team, 0 for away team, 999999999 for draw:

df.insert(df.shape[1],"winner_home", 0)
for i in range(len(df["winner_home"])):
    if df["score_abs"][i] > 0 :
        df["winner_home"][i] = 1
    elif df["score_abs"][i] < 0 :
        df["winner_home"][i] = 0
    else :
        df["winner_home"][i] = 999999999 #Value to filter and remove later on as values are not relevant vs amount of data


df.insert(df.shape[1],"winner_away", 0)
for i in range(len(df["winner_away"])):
    if df["score_abs"][i] < 0 :
        df["winner_away"][i] = 1
    elif df["score_abs"][i] > 0 :
        df["winner_away"][i] = 0
    else :
        df["winner_away"][i] = 999999999 #Value to filter and remove later on as values are not relevant vs amount of data



#   - Remove draw result lines as they have no real impact

print("Values before draw games (value 999999999) clean up: \n")
print(df.value_counts("winner_home"))
print(df.value_counts("winner_away"))

drop_lines = df[df["score_abs"] == 0].index
df = df.drop(drop_lines, axis=0)

print("\n Values before draw games (value 999999999) clean up: \n")
print(df.value_counts("winner_home"))
print(df.value_counts("winner_away"))

#   - Remove boxscore column, no practical information:

df.drop('boxscore', axis=1, inplace=True)


#   - Remove brackets from idgame, turn field into integer:

df["idgame"] = df["idgame"].apply(lambda x : str(x).strip("[").strip("]").strip("'")).astype(int)


#   - Redskins changed their name to Washington. Replacing the former with the latter

df["awayteam"].replace("Redskins", "Washington", inplace=True)
df["hometeam"].replace("Redskins", "Washington", inplace=True)

#   - Reset the index
df = df.reset_index(drop=True)


#   - Delete useless variables
del drop_lines

#   - Create working file

os.makedirs('working_files', exist_ok=True)  
df.to_csv('working_files/scores_prep.csv')  

df

Values before draw games (value 999999999) clean up: 

winner_home
1            3014
0            2341
999999999      12
dtype: int64
winner_away
0            3014
1            2341
999999999      12
dtype: int64

 Values before draw games (value 999999999) clean up: 

winner_home
1    3014
0    2341
dtype: int64
winner_away
0    3014
1    2341
dtype: int64


Unnamed: 0,season,week,awayteam,hometeam,awayscore,homescore,idgame,score_abs,winner_home,winner_away
0,2005,1,Raiders,Patriots,20,30,250908017,10,1,0
1,2005,1,Texans,Bills,7,22,250911002,15,1,0
2,2005,1,Bengals,Browns,27,13,250911005,-14,0,1
3,2005,1,Jets,Chiefs,7,27,250911012,20,1,0
4,2005,1,Broncos,Dolphins,10,34,250911015,24,1,0
...,...,...,...,...,...,...,...,...,...,...
5350,2021,18,49ers,Rams,27,24,401326599,-3,0,1
5351,2021,18,Patriots,Dolphins,24,33,401326592,9,1,0
5352,2021,18,Seahawks,Cardinals,38,30,401326597,-8,0,1
5353,2021,18,Panthers,Buccaneers,17,41,401326596,24,1,0


In [5]:
#CREATING TEAMS LIST

#Based on the games played, append both lists "away" and "home" teams even though they both should be the same

df_away = pd.DataFrame(df["awayteam"])
df_away.rename(columns = {"awayteam":'team'}, inplace = True)
df_home = pd.DataFrame(df["hometeam"])
df_home.rename(columns = {"hometeam":'team'}, inplace = True)
df_teams = df_away.append(df_home)

#Sort values
df_teams.sort_values(by="team", inplace = True)

#Drop duplicates
df_teams.drop_duplicates(subset ="team", keep = "first", inplace = True)

#Reset the index
df_teams = df_teams.reset_index(drop=True)

#Delete useless variables
del df_away
del df_home

#Create working file

os.makedirs('working_files', exist_ok=True)  
df_teams.to_csv('working_files/team_list_prep.csv') 

df_teams

  df_teams = df_away.append(df_home)


Unnamed: 0,team
0,49ers
1,Bears
2,Bengals
3,Bills
4,Broncos
5,Browns
6,Buccaneers
7,Cardinals
8,Chargers
9,Chiefs


In [6]:
df.head()

Unnamed: 0,season,week,awayteam,hometeam,awayscore,homescore,idgame,score_abs,winner_home,winner_away
0,2005,1,Raiders,Patriots,20,30,250908017,10,1,0
1,2005,1,Texans,Bills,7,22,250911002,15,1,0
2,2005,1,Bengals,Browns,27,13,250911005,-14,0,1
3,2005,1,Jets,Chiefs,7,27,250911012,20,1,0
4,2005,1,Broncos,Dolphins,10,34,250911015,24,1,0


In [7]:
X = df.loc[:,["season","week", "awayteam", "hometeam"]]
y = df.loc[:,"winner_home"]
print(X.head())
print(y.head())

   season  week awayteam  hometeam
0    2005     1  Raiders  Patriots
1    2005     1   Texans     Bills
2    2005     1  Bengals    Browns
3    2005     1     Jets    Chiefs
4    2005     1  Broncos  Dolphins
0    1
1    1
2    0
3    1
4    1
Name: winner_home, dtype: int64


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.2,
                                                    stratify = y , ## Statify splitting when you're training a classification model !
                                                    random_state = 42)

In [9]:
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score

In [10]:
one = OneHotEncoder()
sc = StandardScaler()

In [11]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

ValueError: could not convert string to float: 'Cardinals'

In [None]:
X.describe(include = "all")

Unnamed: 0,season,week,awayteam,hometeam
count,5355.0,5355.0,5355,5355
unique,,,32,32
top,,,49ers,Ravens
freq,,,169,169
mean,2011.02577,9.121195,,
std,6.069151,5.014666,,
min,2001.0,1.0,,
25%,2006.0,5.0,,
50%,2011.0,9.0,,
75%,2016.0,14.0,,


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5355 entries, 0 to 5354
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   season       5355 non-null   int64 
 1   week         5355 non-null   int64 
 2   awayteam     5355 non-null   object
 3   hometeam     5355 non-null   object
 4   awayscore    5355 non-null   int64 
 5   homescore    5355 non-null   int64 
 6   idgame       5355 non-null   int32 
 7   score_abs    5355 non-null   int64 
 8   winner_home  5355 non-null   int64 
 9   winner_away  5355 non-null   int64 
dtypes: int32(1), int64(7), object(2)
memory usage: 397.6+ KB


In [12]:
numeric_features = X.select_dtypes(["int64"]).columns 
type(numeric_features)

pandas.core.indexes.base.Index

In [13]:
# Create pipeline for numeric features
numeric_features = X.select_dtypes([np.number]).columns # numeric columns in X_train/X_test
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_features = X.select_dtypes("object").columns # categorical columns in X_train/X_test
categorical_transformer = Pipeline(
    steps=[
    ('encoder', OneHotEncoder(drop="first")) # first column will be dropped to avoid creating correlations between features
    ])

# Use ColumnTranformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocessings on train set
X_train = preprocessor.fit_transform(X_train)

# Preprocessings on test set
X_test = preprocessor.transform(X_test) # Don't fit again !!

In [14]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

In [15]:
classifier.score(X_test, y_test)

0.6031746031746031

In [16]:
scores = cross_val_score(classifier, X_train, y_train)
print("Cross-validated score : {}\nStd : {}".format(scores.mean(), scores.std()))

Cross-validated score : 0.58076205847392
Std : 0.010821985356253547


In [17]:
d = {'season': 2022, 'week': 1, 'awayteam': 'Raiders', 	'hometeam' : 'Jets'}

first_week =pd.DataFrame(data=d, index=[0])

In [18]:
first_week_trans = preprocessor.transform(first_week)
first_week_trans

<1x64 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [43]:
predict_test = classifier.predict(first_week_trans)
predict_test

array([1], dtype=int64)

In [19]:
predict_test = classifier.predict_proba(first_week_trans)
predict_test

array([[0.45066821, 0.54933179]])