In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import glob, os
import numpy as np
import matplotlib.pyplot as plt
import datetime  as dt
import seaborn as sns

## Data Directory

In [3]:
sports = 'NCAA'

In [4]:
data_directory = os.path.join('..','data','sports_data/{}/uh08_17.csv'.format(sports))
data_directory_saves = os.path.join( '..','data','clean_data','sports_data/{}/'.format(sports))

In [5]:
df = pd.read_csv(data_directory)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 16 columns):
G             130 non-null int64
Date          130 non-null object
Time          65 non-null object
Day           130 non-null object
School        130 non-null object
Unnamed: 5    69 non-null object
Opponent      130 non-null object
Conf          130 non-null object
Unnamed: 8    130 non-null object
Pts           130 non-null int64
Opp           130 non-null int64
W             130 non-null int64
L             130 non-null int64
Streak        130 non-null object
TV            31 non-null object
Notes         6 non-null object
dtypes: int64(5), object(11)
memory usage: 16.3+ KB


In [7]:
df.head()

Unnamed: 0,G,Date,Time,Day,School,Unnamed: 5,Opponent,Conf,Unnamed: 8,Pts,Opp,W,L,Streak,TV,Notes
0,1,Sep 9 2017,10:30 PM,Sat,Houston,@,Arizona,Pac-12,W,19,16,1,0,W 1,,
1,2,Sep 16 2017,8:00 PM,Sat,Houston,,Rice,CUSA,W,38,3,2,0,W 2,,
2,3,Sep 23 2017,12:00 PM,Sat,Houston,,Texas Tech,Big 12,L,24,27,2,1,L 1,,
3,4,Sep 30 2017,12:00 PM,Sat,Houston,@,Temple,American,W,20,13,3,1,W 1,,
4,5,Oct 7 2017,7:00 PM,Sat,Houston,,SMU,American,W,35,22,4,1,W 2,,


## Drop columns

In [8]:
list(df.columns)

['G',
 'Date',
 'Time',
 'Day',
 'School',
 'Unnamed: 5',
 'Opponent',
 'Conf',
 'Unnamed: 8',
 'Pts',
 'Opp',
 'W',
 'L',
 'Streak',
 'TV',
 'Notes']

In [9]:
df.drop(['G','Time',
 'Day',
 'Conf',
 'W',
 'L',
 'Streak',
 'Notes',
 'TV'],axis=1,inplace=True)

In [10]:
df.head()

Unnamed: 0,Date,School,Unnamed: 5,Opponent,Unnamed: 8,Pts,Opp
0,Sep 9 2017,Houston,@,Arizona,W,19,16
1,Sep 16 2017,Houston,,Rice,W,38,3
2,Sep 23 2017,Houston,,Texas Tech,L,24,27
3,Sep 30 2017,Houston,@,Temple,W,20,13
4,Oct 7 2017,Houston,,SMU,W,35,22


## change date to datetime index

In [11]:
# Map the lowering function to all column names
df.columns = map(str.lower, df.columns)

In [12]:
df.date = pd.to_datetime(df['date'])
df = df.set_index('date').sort_index(ascending=True)

In [13]:
df.head()

Unnamed: 0_level_0,school,unnamed: 5,opponent,unnamed: 8,pts,opp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-08-30,Houston,,Southern,W,55,3
2008-09-06,Houston,@,Oklahoma State,L,37,56
2008-09-13,Houston,,Air Force,L,28,31
2008-09-20,Houston,@,Colorado State,L,25,28
2008-09-27,Houston,@,(23) East Carolina,W,41,24


## chane unnamed:5 to home_away

In [14]:
df.rename(columns={
    'unnamed: 5': 'home_away',
    'unnamed: 8': 'win_lost',
}, inplace=True)

In [15]:
df.home_away.fillna('home',inplace=True)

In [16]:
df.head()

Unnamed: 0_level_0,school,home_away,opponent,win_lost,pts,opp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-08-30,Houston,home,Southern,W,55,3
2008-09-06,Houston,@,Oklahoma State,L,37,56
2008-09-13,Houston,home,Air Force,L,28,31
2008-09-20,Houston,@,Colorado State,L,25,28
2008-09-27,Houston,@,(23) East Carolina,W,41,24


## select home games

In [17]:
home_games = df.home_away == 'home'
df = df[home_games]
df.drop(['home_away'],axis=1,inplace=True)

In [18]:
df.head()

Unnamed: 0_level_0,school,opponent,win_lost,pts,opp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008-08-30,Houston,Southern,W,55,3
2008-09-13,Houston,Air Force,L,28,31
2008-10-09,Houston,UAB,W,45,20
2008-11-08,Houston,Tulane,W,42,14
2008-11-15,Houston,(25) Tulsa,W,70,30


## clean columns

In [19]:
df.school.unique()

array(['Houston', '(17) Houston', '(15) Houston', '(24) Houston',
       '(25) Houston', '(21) Houston', '(18) Houston', '(11) Houston',
       '(7) Houston', '(16) Houston', '(6) Houston', '(13) Houston'],
      dtype=object)

In [20]:
df.opponent.unique()

array(['Southern', 'Air Force', 'UAB', 'Tulane', '(25) Tulsa', 'UTEP',
       'Northwestern State', 'Texas Tech', 'SMU', 'Southern Mississippi',
       'Memphis', 'Rice', 'Texas State', 'Mississippi State', 'UCF',
       'Tulsa', 'UCLA', 'Georgia State', 'East Carolina', 'Marshall',
       '(24) Southern Mississippi', 'Louisiana Tech', 'North Texas',
       'South Florida', 'UTSA', 'Grambling State', 'Nevada-Las Vegas',
       'Temple', 'Tennessee Tech', 'Vanderbilt', 'Cincinnati',
       '(25) Memphis', '(16) Navy', '(20) Temple', 'Lamar', 'Connecticut',
       '(3) Louisville', 'Navy'], dtype=object)

## remove extra characters line

In [21]:
df.opponent = df.opponent.str.replace(r"\(.*\)"," ")

In [22]:
df.opponent.unique()

array(['Southern', 'Air Force', 'UAB', 'Tulane', '  Tulsa', 'UTEP',
       'Northwestern State', 'Texas Tech', 'SMU', 'Southern Mississippi',
       'Memphis', 'Rice', 'Texas State', 'Mississippi State', 'UCF',
       'Tulsa', 'UCLA', 'Georgia State', 'East Carolina', 'Marshall',
       '  Southern Mississippi', 'Louisiana Tech', 'North Texas',
       'South Florida', 'UTSA', 'Grambling State', 'Nevada-Las Vegas',
       'Temple', 'Tennessee Tech', 'Vanderbilt', 'Cincinnati',
       '  Memphis', '  Navy', '  Temple', 'Lamar', 'Connecticut',
       '  Louisville', 'Navy'], dtype=object)

In [23]:
df.win_lost.unique()

array(['W', 'L'], dtype=object)

In [24]:
df.pts.unique()

array([55, 28, 45, 42, 70, 29, 38, 50, 73, 68, 54, 24, 33, 25, 56, 63, 37,
       13, 49, 44, 39,  7, 40, 62, 35, 47, 12, 31, 52, 59, 34, 30, 36])

In [25]:
df.opp.unique()

array([ 3, 31, 20, 14, 30, 37,  7, 28, 15, 43, 24, 23, 47, 40, 34,  0, 49,
       56, 21, 17, 35, 41, 13, 27, 10, 18, 22, 42])

## display null values

In [26]:
df.apply(lambda x: sum(x.isnull()))

school      0
opponent    0
win_lost    0
pts         0
opp         0
dtype: int64

## save dataset

In [27]:
df.to_csv(data_directory_saves+'Rice_2010_2017.csv')