In [1]:
# Import depedencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


In [2]:
# Read the CSV data and create a new dataframe

df = pd.read_csv("match_team_data.csv")

In [3]:
# Create a column to account for Wins, Losses, and Ties

df["home_win"] = np.where(df["home_score"] > df["away_score"], 1,0)

In [4]:
# Check for NaNs
df.isna().mean().sort_values(ascending=False).head(10)

team_2_post_season_result                1.000000
team_1_post_season_result                1.000000
winning_abbr                             0.079338
winning_name                             0.079338
losing_name                              0.079338
losing_abbr                              0.079338
away_score                               0.075625
home_score                               0.075625
team_2_pass_completions                  0.000000
team_2_offensive_simple_rating_system    0.000000
dtype: float64

In [5]:
df["winning_name"].isna()

0       False
1       False
2       False
3       False
4       False
        ...  
2957     True
2958     True
2959     True
2960     True
2961     True
Name: winning_name, Length: 2962, dtype: bool

In [6]:
# Remove NaNs
df = df.drop(columns=["team_2_post_season_result","team_1_post_season_result"])

In [7]:
df3 = df.dropna()

In [8]:
df3

Unnamed: 0,boxscore,away_name,away_abbr,away_score,home_name,home_abbr,home_score,winning_name,winning_abbr,losing_name,...,team_2_rush_yards_per_attempt,team_2_simple_rating_system,team_2_strength_of_schedule,team_2_turnovers,team_2_win_percentage,team_2_wins,team_2_yards,team_2_yards_from_penalties,team_2_yards_per_play,home_win
0,201209050nyg,Dallas Cowboys,dal,24.0,New York Giants,nyg,17.0,Dallas Cowboys,dal,New York Giants,...,4.7,-4.2,-2.2,2,0.667,2,972,188,5.3,0
1,201209090chi,Indianapolis Colts,clt,21.0,Chicago Bears,chi,41.0,Chicago Bears,chi,Indianapolis Colts,...,4.0,-4.8,2.2,6,0.500,1,994,153,4.8,1
2,201209090cle,Philadelphia Eagles,phi,17.0,Cleveland Browns,cle,16.0,Philadelphia Eagles,phi,Cleveland Browns,...,4.4,38.9,26.9,1,1.000,3,1341,173,6.4,0
3,201209090det,St. Louis Rams,ram,23.0,Detroit Lions,det,27.0,Detroit Lions,det,St. Louis Rams,...,3.4,-9.0,-6.0,7,0.667,2,919,126,5.3,1
4,201209090htx,Miami Dolphins,mia,10.0,Houston Texans,htx,30.0,Houston Texans,htx,Miami Dolphins,...,3.3,12.4,6.1,2,1.000,3,1066,91,6.4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2733,202209250crd,Los Angeles Rams,ram,20.0,Arizona Cardinals,crd,12.0,Los Angeles Rams,ram,Arizona Cardinals,...,3.4,-9.0,-6.0,7,0.667,2,919,126,5.3,0
2734,202209250sea,Atlanta Falcons,atl,27.0,Seattle Seahawks,sea,23.0,Atlanta Falcons,atl,Seattle Seahawks,...,4.9,-11.3,-10.9,7,0.333,1,1063,99,5.9,0
2735,202209250tam,Green Bay Packers,gnb,14.0,Tampa Bay Buccaneers,tam,12.0,Green Bay Packers,gnb,Tampa Bay Buccaneers,...,4.7,12.5,11.5,5,0.667,2,1067,100,5.7,0
2736,202209250den,San Francisco 49ers,sfo,10.0,Denver Broncos,den,11.0,Denver Broncos,den,San Francisco 49ers,...,4.5,-4.8,-8.2,5,0.333,1,971,165,5.1,1


In [9]:
# Check for categorical data

df3.dtypes.sort_values().tail(15)

team_1_points_contributed_by_offense    float64
away_name                                object
away_abbr                                object
home_name                                object
home_abbr                                object
team_2_abbreviation                      object
losing_name                              object
winning_abbr                             object
losing_abbr                              object
teams                                    object
team_1_abbreviation                      object
team_2_name                              object
team_1_name                              object
winning_name                             object
boxscore                                 object
dtype: object

In [10]:
# Remove categorical data

df3.drop(columns=["away_name","away_abbr","home_name","home_abbr","team_2_abbreviation","losing_name","winning_abbr","losing_abbr","teams","team_1_abbreviation","team_2_name","team_1_name","winning_name","boxscore"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.drop(columns=["away_name","away_abbr","home_name","home_abbr","team_2_abbreviation","losing_name","winning_abbr","losing_abbr","teams","team_1_abbreviation","team_2_name","team_1_name","winning_name","boxscore"], inplace=True)


In [11]:
# Remove categorical data

df3.drop(columns=["year","week","home_score","away_score"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.drop(columns=["year","week","home_score","away_score"], inplace=True)


In [12]:
# Create target and features

y = df3["home_win"]
X = df3.drop(columns="home_win")

In [13]:
# Apply Train_Test_Split

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)

In [14]:
# Set the scaler
scaler = StandardScaler()

In [15]:
# Fit and Transform the data
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [16]:
from xgboost import XGBClassifier

In [17]:
# Create the XGB model
model = XGBClassifier()

In [18]:
# Fit the data
model.fit(X_train,y_train)

In [19]:
# Make the predictions
y_pred = model.predict(X_test)

In [20]:
# print the results

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[141 157]
 [156 228]]
              precision    recall  f1-score   support

           0       0.47      0.47      0.47       298
           1       0.59      0.59      0.59       384

    accuracy                           0.54       682
   macro avg       0.53      0.53      0.53       682
weighted avg       0.54      0.54      0.54       682

0.5410557184750733


In [21]:
from collections import Counter 

In [22]:
import imblearn 

In [23]:
from imblearn.over_sampling import RandomOverSampler

In [25]:
Counter(y_train)

Counter({0: 909, 1: 1136})

In [24]:
ros = RandomOverSampler(random_state=1)

In [27]:
X_train_oversampled,y_train_oversampled = ros.fit_resample(X_train, y_train) 

In [28]:
model.fit(X_train_oversampled,y_train_oversampled)

In [29]:
y_pred = model.predict(X_test)

In [30]:
from imblearn.metrics import classification_report_imbalanced

In [31]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.47      0.51      0.55      0.49      0.53      0.28       298
          1       0.59      0.55      0.51      0.57      0.53      0.28       384

avg / total       0.54      0.53      0.53      0.54      0.53      0.28       682

