# Capstone Project 2

## Predict who will win an NFL bet

Data Sources:  
    Main/Historical:       https://datasetsearch.research.google.com/search?query=NFL%20scores%20and%20betting%20data&docid=42gvYWceSRPsddH5AAAAAA%3D%3D

    Current Week Odds:     https://www.vegasinsider.com/nfl/odds/las-vegas
    Current Year Results:  https://www.teamrankings.com/nfl-odds-week-#  (where # is week number)
    
    

In [1]:
# packages
import os
import pandas as pd
import numpy  as np
import csv
import datetime
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=DeprecationWarning)
    import sklearn

# required machine learning packages
from sklearn import model_selection
from sklearn.feature_selection import RFE
from sklearn.metrics import brier_score_loss, roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV as CCV

from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
#import xgboost as xgb

In [2]:
# Folder (relative to the notebook) that holds every CSV input.
datapath = "input/"

# Reference tables and historical game results.
teams = pd.read_csv(os.path.join(datapath, "nfl_teams.csv"))
scores = pd.read_csv(os.path.join(datapath, "nfl_scores.csv"))

# The stadium file is decoded as latin-1 rather than pandas' default;
# presumably it contains non-UTF-8 characters -- TODO confirm.
stadiums = pd.read_csv(
    os.path.join(datapath, "nfl_stadiums.csv"),
    encoding="latin-1",
)

# Games from the 2020 season, kept in a separate file.
scores2020 = pd.read_csv(os.path.join(datapath, "nfl_scores_2020.csv"))

In [3]:
# Schema check: column names, non-null counts and dtypes of the stadium table.
stadiums.info()
# Preview the first five rows (last expression, so it is rendered as the cell output).
stadiums.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   stadium_name                  100 non-null    object 
 1   stadium_location              100 non-null    object 
 2   stadium_open                  76 non-null     float64
 3   stadium_close                 41 non-null     float64
 4   stadium_type                  93 non-null     object 
 5   stadium_address               91 non-null     object 
 6   stadium_weather_station_code  90 non-null     object 
 7   stadium_weather_type          93 non-null     object 
 8   stadium_capacity              39 non-null     object 
 9   stadium_surface               53 non-null     object 
 10  STATION                       52 non-null     object 
 11  NAME                          52 non-null     object 
 12  LATITUDE                      52 non-null     float64
 13  LONGIT

Unnamed: 0,stadium_name,stadium_location,stadium_open,stadium_close,stadium_type,stadium_address,stadium_weather_station_code,stadium_weather_type,stadium_capacity,stadium_surface,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION
0,Alamo Dome,"San Antonio, TX",,,indoor,"100 Montana St, San Antonio, TX 78203",78203.0,dome,72000.0,FieldTurf,,,,,
1,Alltel Stadium,"Jacksonville, FL",,,,,,,,,,,,,
2,Alumni Stadium,"Chestnut Hill, MA",,,outdoor,"Perimeter Rd, Chestnut Hill, MA 02467",2467.0,cold,,Grass,,,,,
3,Anaheim Stadium,"Anaheim, CA",1980.0,1994.0,outdoor,"2000 E Gene Autry Way, Anaheim, CA 92806",92806.0,warm,,,,,,,
4,Arrowhead Stadium,"Kansas City, MO",1972.0,,outdoor,"1 Arrowhead Dr, Kansas City, MO 64129",64129.0,cold,76416.0,Grass,US1MOJC0028,"KANSAS CITY 5.1 SE, MO US",39.0692,-94.4871,264.9


In [4]:
# Schema check: column names, non-null counts and dtypes of the team lookup table.
teams.info()
# Preview the first five rows (last expression, so it is rendered as the cell output).
teams.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   team_name                43 non-null     object
 1   team_name_short          43 non-null     object
 2   team_id                  43 non-null     object
 3   team_id_pfr              43 non-null     object
 4   team_conference          43 non-null     object
 5   team_division            34 non-null     object
 6   team_conference_pre2002  43 non-null     object
 7   team_division_pre2002    41 non-null     object
dtypes: object(8)
memory usage: 2.8+ KB


Unnamed: 0,team_name,team_name_short,team_id,team_id_pfr,team_conference,team_division,team_conference_pre2002,team_division_pre2002
0,Arizona Cardinals,Cardinals,ARI,CRD,NFC,NFC West,NFC,NFC West
1,Phoenix Cardinals,Cardinals,ARI,CRD,NFC,,NFC,NFC East
2,St. Louis Cardinals,Cardinals,ARI,ARI,NFC,,NFC,NFC East
3,Atlanta Falcons,Falcons,ATL,ATL,NFC,NFC South,NFC,NFC West
4,Baltimore Ravens,Ravens,BAL,RAV,AFC,AFC North,AFC,AFC Central


In [5]:
# Schema check: column names, non-null counts and dtypes of the historical results.
# NOTE(review): the captured output lists `over_under_line` and `schedule_week`
# as object dtype -- they will likely need conversion before modelling.
scores.info()
# Preview the first five rows (last expression, so it is rendered as the cell output).
scores.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12678 entries, 0 to 12677
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   schedule_date        12678 non-null  object 
 1   schedule_season      12678 non-null  int64  
 2   schedule_week        12678 non-null  object 
 3   schedule_playoff     12678 non-null  bool   
 4   team_home            12678 non-null  object 
 5   score_home           12678 non-null  int64  
 6   score_away           12678 non-null  int64  
 7   team_away            12678 non-null  object 
 8   team_favorite_id     10199 non-null  object 
 9   spread_favorite      10199 non-null  float64
 10  over_under_line      10189 non-null  object 
 11  stadium              12678 non-null  object 
 12  stadium_neutral      12678 non-null  bool   
 13  weather_temperature  11936 non-null  float64
 14  weather_wind_mph     11936 non-null  float64
 15  weather_humidity     8388 non-null  

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail
0,9/2/1966,1966,1,False,Miami Dolphins,14,23,Oakland Raiders,,,,Orange Bowl,False,83.0,6.0,71,
1,9/3/1966,1966,1,False,Houston Oilers,45,7,Denver Broncos,,,,Rice Stadium,False,81.0,7.0,70,
2,9/4/1966,1966,1,False,San Diego Chargers,27,7,Buffalo Bills,,,,Balboa Stadium,False,70.0,7.0,82,
3,9/9/1966,1966,2,False,Miami Dolphins,14,19,New York Jets,,,,Orange Bowl,False,82.0,11.0,78,
4,9/10/1966,1966,1,False,Green Bay Packers,24,3,Baltimore Colts,,,,Lambeau Field,False,64.0,8.0,62,


In [6]:
# Schema check: column names, non-null counts and dtypes of the 2020 results.
# NOTE(review): per the captured output, this file differs from `scores`:
#   - team_home / team_away hold short names (e.g. "Kansas City"), whereas
#     `scores` holds full team names (e.g. "Miami Dolphins");
#   - schedule_week is int64 here but object in `scores`;
#   - the stadium and weather columns shown are entirely null.
# These will need reconciling before the two frames are combined.
scores2020.info()
# Preview the first five rows (last expression, so it is rendered as the cell output).
scores2020.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133 entries, 0 to 132
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   schedule_date        133 non-null    object 
 1   schedule_season      133 non-null    int64  
 2   schedule_week        133 non-null    int64  
 3   schedule_playoff     133 non-null    bool   
 4   team_home            133 non-null    object 
 5   score_home           133 non-null    int64  
 6   score_away           133 non-null    int64  
 7   team_away            133 non-null    object 
 8   team_favorite_id     133 non-null    object 
 9   spread_favorite      133 non-null    float64
 10  over_under_line      133 non-null    float64
 11  stadium              0 non-null      float64
 12  stadium_neutral      0 non-null      float64
 13  weather_temperature  0 non-null      float64
 14  weather_wind_mph     0 non-null      float64
 15  weather_humidity     0 non-null      flo

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail
0,9/10/2020,2020,1,False,Kansas City,34,20,Houston,KC,-9.5,53.5,,,,,,
1,9/13/2020,2020,1,False,Detroit,23,27,Chicago,DET,-2.5,42.5,,,,,,
2,9/13/2020,2020,1,False,Baltimore,38,6,Cleveland,BAL,-7.0,47.0,,,,,,
3,9/13/2020,2020,1,False,Washington,27,17,Philadelphia,PHI,5.5,41.5,,,,,,
4,9/13/2020,2020,1,False,Minnesota,34,43,Green Bay,MIN,-1.0,44.0,,,,,,
