In [66]:
import pandas as pd
import numpy as np
import requests
import os
import json

pd.options.mode.chained_assignment = None  # default='warn'

In [88]:
#Import and convert json to pd.df
df = pd.read_json("espnscores.json")

#Print basic information
print(df.head())
df.describe(include="all")

   season  week awayteam  hometeam  awayscore  homescore  \
0    2005     1  Raiders  Patriots         20         30   
1    2005     1   Texans     Bills          7         22   
2    2005     1  Bengals    Browns         27         13   
3    2005     1     Jets    Chiefs          7         27   
4    2005     1  Broncos  Dolphins         10         34   

                                           boxscore       idgame  
0  https://espn.com/nfl/boxscore/_/gameId/250908017  [250908017]  
1  https://espn.com/nfl/boxscore/_/gameId/250911002  [250911002]  
2  https://espn.com/nfl/boxscore/_/gameId/250911005  [250911005]  
3  https://espn.com/nfl/boxscore/_/gameId/250911012  [250911012]  
4  https://espn.com/nfl/boxscore/_/gameId/250911015  [250911015]  


Unnamed: 0,season,week,awayteam,hometeam,awayscore,homescore,boxscore,idgame
count,5367.0,5367.0,5367,5367,5367.0,5367.0,5367,5367
unique,,,33,33,,,5367,5367
top,,,Vikings,Bengals,,,https://espn.com/nfl/boxscore/_/gameId/250908017,[250908017]
freq,,,169,169,,,1,1
mean,2011.034097,9.115893,,,21.051239,23.227688,,
std,6.069954,5.013696,,,10.127283,10.321967,,
min,2001.0,1.0,,,0.0,0.0,,
25%,2006.0,5.0,,,14.0,16.0,,
50%,2011.0,9.0,,,20.0,23.0,,
75%,2016.0,14.0,,,27.0,30.0,,


In [90]:
# Pre-processing of the NFL Json data 

# Actions:

df.insert(df.shape[1],"score_abs",df["homescore"] - df["awayscore"])

#   - Set winner based on scores, 1 for away team, 0 for away team, 999999999 for draw:

df.insert(df.shape[1],"winner_home", 0)
for i in range(len(df["winner_home"])):
    if df["score_abs"][i] > 0 :
        df["winner_home"][i] = 1
    elif df["score_abs"][i] < 0 :
        df["winner_home"][i] = 0
    else :
        df["winner_home"][i] = 999999999 #Value to filter and remove later on as values are not relevant vs amount of data


df.insert(df.shape[1],"winner_away", 0)
for i in range(len(df["winner_away"])):
    if df["score_abs"][i] < 0 :
        df["winner_away"][i] = 1
    elif df["score_abs"][i] > 0 :
        df["winner_away"][i] = 0
    else :
        df["winner_away"][i] = 999999999 #Value to filter and remove later on as values are not relevant vs amount of data



#   - Remove draw result lines as they have no real impact

print("Values before draw games (value 999999999) clean up: \n")
print(df.value_counts("winner_home"))
print(df.value_counts("winner_away"))

drop_lines = df[df["score_abs"] == 0].index
df = df.drop(drop_lines, axis=0)

print("\n Values before draw games (value 999999999) clean up: \n")
print(df.value_counts("winner_home"))
print(df.value_counts("winner_away"))

#   - Remove boxscore column, no practical information:

df.drop('boxscore', axis=1, inplace=True)


#   - Remove brackets from idgame, turn field into integer:

df["idgame"] = df["idgame"].apply(lambda x : str(x).strip("[").strip("]").strip("'")).astype(int)


#   - Redskins changed their name to Washington. Replacing the former with the latter

df["awayteam"].replace("Redskins", "Washington", inplace=True)
df["hometeam"].replace("Redskins", "Washington", inplace=True)

#   - Reset the index
df = df.reset_index(drop=True)


#   - Delete useless variables
del drop_lines

#   - Create working file

os.makedirs('working_files', exist_ok=True)  
df.to_csv('working_files/scores_prep.csv')  

df

Values before draw games (value 999999999) clean up: 

winner_home
1            3014
0            2341
999999999      12
dtype: int64
winner_away
0            3014
1            2341
999999999      12
dtype: int64

 Values before draw games (value 999999999) clean up: 

winner_home
1    3014
0    2341
dtype: int64
winner_away
0    3014
1    2341
dtype: int64


Unnamed: 0,season,week,awayteam,hometeam,awayscore,homescore,idgame,score_abs,winner_home,winner_away
0,2005,1,Raiders,Patriots,20,30,250908017,10,1,0
1,2005,1,Texans,Bills,7,22,250911002,15,1,0
2,2005,1,Bengals,Browns,27,13,250911005,-14,0,1
3,2005,1,Jets,Chiefs,7,27,250911012,20,1,0
4,2005,1,Broncos,Dolphins,10,34,250911015,24,1,0
...,...,...,...,...,...,...,...,...,...,...
5350,2021,18,49ers,Rams,27,24,401326599,-3,0,1
5351,2021,18,Patriots,Dolphins,24,33,401326592,9,1,0
5352,2021,18,Seahawks,Cardinals,38,30,401326597,-8,0,1
5353,2021,18,Panthers,Buccaneers,17,41,401326596,24,1,0


In [91]:
#CREATING TEAMS LIST

#Based on the games played, append both lists "away" and "home" teams even though they both should be the same

df_away = pd.DataFrame(df["awayteam"])
df_away.rename(columns = {"awayteam":'team'}, inplace = True)
df_home = pd.DataFrame(df["hometeam"])
df_home.rename(columns = {"hometeam":'team'}, inplace = True)
df_teams = df_away.append(df_home)

#Sort values
df_teams.sort_values(by="team", inplace = True)

#Drop duplicates
df_teams.drop_duplicates(subset ="team", keep = "first", inplace = True)

#Reset the index
df_teams = df_teams.reset_index(drop=True)

#Delete useless variables
del df_away
del df_home

#Create working file

os.makedirs('working_files', exist_ok=True)  
df_teams.to_csv('working_files/team_list_prep.csv') 

df_teams

  df_teams = df_away.append(df_home)


Unnamed: 0,team
0,49ers
1,Bears
2,Bengals
3,Bills
4,Broncos
5,Browns
6,Buccaneers
7,Cardinals
8,Chargers
9,Chiefs
