In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import glob, os
import numpy as np
import matplotlib.pyplot as plt
import datetime  as dt
import seaborn as sns

## Data directory

In [3]:
# League identifier; interpolated into the input and output data paths.
sports = "NFL"

In [4]:
# Build both paths purely from os.path.join components instead of embedding
# '/' inside a joined fragment, so separators are consistent on every platform.
# NOTE: despite its name, data_directory is the path of the input CSV file.
data_directory = os.path.join('..', 'data', 'sports_data', sports, 'nfl10clean.csv')
# The trailing empty component keeps the path ending in a separator, so a file
# name can still be appended to it by plain string concatenation.
data_directory_saves = os.path.join('..', 'data', 'clean_data', 'sports_data', sports, '')

In [5]:
# Load the input CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=data_directory)

In [6]:
# Print the column names, non-null counts, dtypes and memory usage as loaded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4593 entries, 0 to 4592
Data columns (total 6 columns):
scheduled              4593 non-null object
home.alias             4593 non-null object
scoring.home_points    4592 non-null float64
away.alias             4593 non-null object
scoring.away_points    4592 non-null float64
WIN                    4593 non-null object
dtypes: float64(2), object(4)
memory usage: 215.4+ KB


In [7]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,scheduled,home.alias,scoring.home_points,away.alias,scoring.away_points,WIN
0,2000-09-03 18:03:36,NO,10.0,DET,14.0,DET
1,2000-09-03 18:04:35,NE,16.0,TB,21.0,TB
2,2000-09-03 21:15:54,MIA,23.0,SEA,0.0,MIA
3,2000-09-03 21:20:12,GB,16.0,NYJ,20.0,NYJ
4,2000-09-03 19:02:58,WAS,20.0,CAR,17.0,WAS


## Convert `scheduled` to datetime and use it as the sorted index

In [8]:
# Parse the 'scheduled' strings into timestamps, then index the frame by them
# in chronological order.
df = (
    df.assign(scheduled=pd.to_datetime(df['scheduled']))
      .set_index('scheduled')
      .sort_index(ascending=True)
)

In [9]:
# Confirm the index is now a DatetimeIndex and 'scheduled' is no longer a column.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4593 entries, 2000-09-03 00:00:00 to 2017-12-31 21:25:00
Data columns (total 5 columns):
home.alias             4593 non-null object
scoring.home_points    4592 non-null float64
away.alias             4593 non-null object
scoring.away_points    4592 non-null float64
WIN                    4593 non-null object
dtypes: float64(2), object(3)
memory usage: 215.3+ KB


In [10]:
# Preview the five earliest rows after sorting by the datetime index.
df.head(n=5)

Unnamed: 0_level_0,home.alias,scoring.home_points,away.alias,scoring.away_points,WIN
scheduled,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-09-03 00:00:00,PIT,0.0,BAL,16.0,BAL
2000-09-03 00:00:00,OAK,9.0,SD,6.0,OAK
2000-09-03 18:02:00,ATL,36.0,SF,28.0,ATL
2000-09-03 18:03:33,NYG,21.0,ARI,16.0,NYG
2000-09-03 18:03:36,NO,10.0,DET,14.0,DET


## Select HOU home games

In [11]:
# Boolean mask marking the rows where 'home.alias' is 'HOU'.
home_games = df['home.alias'].eq('HOU')
# Keep only the rows selected by the mask.
df = df.loc[home_games]

In [12]:
# Preview the first five rows remaining after the home-team filter.
df.head(n=5)

Unnamed: 0_level_0,home.alias,scoring.home_points,away.alias,scoring.away_points,WIN
scheduled,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2002-09-09 00:36:34,HOU,19.0,DAL,10.0,HOU
2002-09-22 17:02:37,HOU,3.0,IND,23.0,IND
2002-10-13 17:02:21,HOU,24.0,BUF,31.0,BUF
2002-11-03 18:04:07,HOU,3.0,CIN,38.0,CIN
2002-11-17 18:02:29,HOU,21.0,JAC,24.0,JAC


## Select games from 2010-2017

In [13]:
# Label-based slice on the sorted DatetimeIndex using partial date strings;
# both end years are inclusive, so every row dated 2010 through 2017 is kept.
df = df.loc['2010':'2017']

In [14]:
# Print the row count, index date range and per-column non-null counts of the slice.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 65 entries, 2010-01-03 18:02:38 to 2017-12-25 21:30:00
Data columns (total 5 columns):
home.alias             65 non-null object
scoring.home_points    65 non-null float64
away.alias             65 non-null object
scoring.away_points    65 non-null float64
WIN                    65 non-null object
dtypes: float64(2), object(3)
memory usage: 3.0+ KB


In [15]:
# Preview the first five rows of the date-restricted frame.
df.head(n=5)

Unnamed: 0_level_0,home.alias,scoring.home_points,away.alias,scoring.away_points,WIN
scheduled,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-03 18:02:38,HOU,34.0,NE,27.0,HOU
2010-09-12 17:02:50,HOU,34.0,IND,24.0,HOU
2010-09-26 17:03:12,HOU,13.0,DAL,27.0,DAL
2010-10-10 17:03:05,HOU,10.0,NYG,34.0,NYG
2010-10-17 17:03:31,HOU,35.0,KC,31.0,HOU


## Count null values per column

In [16]:
# Count missing values per column using pandas' vectorised reduction rather
# than iterating over each column element-by-element in Python.
df.isnull().sum()

home.alias             0
scoring.home_points    0
away.alias             0
scoring.away_points    0
WIN                    0
dtype: int64

## save data

In [17]:
# Create the output folder if it is missing; to_csv does not create parent
# directories and would otherwise fail when the folder does not exist yet.
os.makedirs(data_directory_saves, exist_ok=True)
# Write the filtered frame; the 'scheduled' index is written as the first column.
df.to_csv(os.path.join(data_directory_saves, 'NFL_2010_2017.csv'))