## Source

#### sports-reference.com
https://www.sports-reference.com/cfb/schools/rice/2017-schedule.html

- games to csv files from 2008 - 2017
- `RU08_17.csv`

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
path = 'data/NCAA/RU08_17.csv'

In [3]:
# Load the raw schedule export into a DataFrame.
df = pd.read_csv(filepath_or_buffer=path)

In [4]:
df.head()

Unnamed: 0,G,Date,Time,Day,School,Unnamed: 5,Opponent,Conf,Unnamed: 8,Pts,Opp,W,L,Streak,Notes,Unnamed: 15
0,1,Aug 29 2008,,Fri,Rice,,SMU,CUSA,W,56,27,1,0,W 1,,
1,2,Sep 6 2008,,Sat,Rice,@,Memphis,CUSA,W,42,35,2,0,W 2,,
2,3,Sep 13 2008,,Sat,Rice,@,Vanderbilt,SEC,L,21,38,2,1,L 1,,
3,4,Sep 20 2008,,Sat,Rice,@,(7) Texas,Big 12,L,10,52,2,2,L 2,,
4,5,Sep 27 2008,,Sat,Rice,,North Texas,Sun Belt,W,77,20,3,2,W 1,,


## Slice cols

In [5]:
# Keep only the columns needed for the analysis.
# .copy() makes `rice` an independent frame, so the column assignments in the
# following cells do not operate on a slice of `df` (SettingWithCopyWarning).
rice = df[['Date','Day','School','Pts','Unnamed: 5','Opponent','Opp','Unnamed: 8']].copy()

In [6]:
rice.head()

Unnamed: 0,Date,Day,School,Pts,Unnamed: 5,Opponent,Opp,Unnamed: 8
0,Aug 29 2008,Fri,Rice,56,,SMU,27,W
1,Sep 6 2008,Sat,Rice,42,@,Memphis,35,W
2,Sep 13 2008,Sat,Rice,21,@,Vanderbilt,38,L
3,Sep 20 2008,Sat,Rice,10,@,(7) Texas,52,L
4,Sep 27 2008,Sat,Rice,77,,North Texas,20,W


## Check for missing vals

In [8]:
# Number of missing values in each column.
rice.isnull().sum()

Date           0
Day            0
School         0
Pts            0
Unnamed: 5    58
Opponent       0
Opp            0
Unnamed: 8     0
dtype: int64

## Cleanup

In [10]:
rice.School.unique()

array(['Rice'], dtype=object)

## rename columns
- `Unnamed: 8` : `win_lost`
- `Unnamed: 5` : `home_away`

In [11]:
# Give the two unlabeled source columns meaningful names.
# The result is reassigned rather than using inplace=True, so the rename never
# mutates a frame that was derived from a slice of `df`.
rice = rice.rename(columns={'Unnamed: 8': 'win_lost',
                            'Unnamed: 5': 'home_away'})

## Change values of `'@'` to `'away'`
- replace `'@'` with `'away'`, then fill the missing values with `'home'`

In [12]:
rice['home_away'].unique()

array([nan, '@', 'N'], dtype=object)

In [13]:
# Relabel the '@' marker as 'away'.
rice['home_away'] = rice['home_away'].replace({'@': 'away'})

In [14]:
# Rows with no marker (NaN) are labelled 'home'.
rice = rice.assign(home_away=rice['home_away'].fillna('home'))

In [15]:
rice['home_away'].value_counts()

away    61
home    58
N        6
Name: home_away, dtype: int64

## Show values of 'N'

- create boolean mask to find rows whose value is `'N'`

In [16]:
# Boolean mask selecting the rows still labelled 'N', then display them.
n_val = rice['home_away'].eq('N')
rice.loc[n_val]

Unnamed: 0,Date,Day,School,Pts,home_away,Opponent,Opp,win_lost
53,Sep 29 2012,Sat,Rice,14,N,Houston,35,L
61,Dec 29 2012,Sat,Rice,33,N,Air Force,14,W
64,Sep 21 2013,Sat,Rice,26,N,Houston,31,L
75,Dec 31 2013,Tue,Rice,7,N,Mississippi State,44,L
88,Dec 24 2014,Wed,Rice,30,N,Fresno State,6,W
113,Aug 26 2017,Sat,Rice,7,N,(14) Stanford,62,L


### They all seem to be away games

In [17]:
# Fold the remaining 'N' rows into the 'away' category.
# NOTE(review): 'N' presumably marks neutral-site games — confirm against the source.
rice['home_away'] = rice['home_away'].replace({'N': 'away'})

In [18]:
rice['home_away'].value_counts(dropna=False)

away    67
home    58
Name: home_away, dtype: int64

## Cleanup Opponent column

In [20]:
rice['Opponent'].unique()

array(['SMU', 'Memphis', 'Vanderbilt', '(7) Texas', 'North Texas',
       'Tulsa', 'Southern Mississippi', 'Tulane', 'UTEP', 'Army',
       'Marshall', 'Houston', 'Western Michigan', 'UAB', 'Texas Tech',
       '(16) Oklahoma State', 'Navy', 'East Carolina', 'UCF',
       '(25) Houston', '(5) Texas', 'Northwestern', 'Baylor', 'Texas',
       'Purdue', '(17) Baylor', '(18) Houston', 'UCLA', 'Kansas',
       'Louisiana Tech', 'UTSA', 'Air Force', '(7) Texas A&M',
       'Florida Atlantic', 'New Mexico State', 'Mississippi State',
       '(17) Notre Dame', 'Old Dominion', 'Hawaii',
       'Florida International', '(21) Marshall', 'Fresno State', 'Wagner',
       '(5) Baylor', 'Western Kentucky', 'Charlotte', '(21) Baylor',
       'Prairie View A&M', 'Stanford', '(14) Stanford', 'Pitt'],
      dtype=object)

## Remove parenthesized rankings from `rice['Opponent']`


In [21]:
# Strip the leading poll ranking from opponent names, e.g. '(7) Texas' -> 'Texas'.
# - regex=True is required: since pandas 2.0 `Series.str.replace` treats the
#   pattern as a literal string by default, which would leave the rankings in.
# - The pattern is anchored and digit-only, so a parenthesized suffix that is
#   part of a school name (e.g. 'Miami (FL)') is left alone.
rice['Opponent'] = (
    rice['Opponent']
    .str.replace(r"^\(\d+\)\s*", "", regex=True)
    .str.strip()
)

In [22]:
rice['Opponent'].unique()

array(['SMU', 'Memphis', 'Vanderbilt', 'Texas', 'North Texas', 'Tulsa',
       'Southern Mississippi', 'Tulane', 'UTEP', 'Army', 'Marshall',
       'Houston', 'Western Michigan', 'UAB', 'Texas Tech',
       'Oklahoma State', 'Navy', 'East Carolina', 'UCF', 'Northwestern',
       'Baylor', 'Purdue', 'UCLA', 'Kansas', 'Louisiana Tech', 'UTSA',
       'Air Force', 'Texas A&M', 'Florida Atlantic', 'New Mexico State',
       'Mississippi State', 'Notre Dame', 'Old Dominion', 'Hawaii',
       'Florida International', 'Fresno State', 'Wagner',
       'Western Kentucky', 'Charlotte', 'Prairie View A&M', 'Stanford',
       'Pitt'], dtype=object)

## convert Date to datetime

In [23]:
# Parse the textual dates into datetime values.
rice['Date'] = pd.to_datetime(rice['Date'])

# Use the date as the index, ordered chronologically.
rice = rice.set_index('Date')
rice = rice.sort_index()

In [24]:
rice.head()

Unnamed: 0_level_0,Day,School,Pts,home_away,Opponent,Opp,win_lost
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2008-08-29,Fri,Rice,56,home,SMU,27,W
2008-09-06,Sat,Rice,42,away,Memphis,35,W
2008-09-13,Sat,Rice,21,away,Vanderbilt,38,L
2008-09-20,Sat,Rice,10,away,Texas,52,L
2008-09-27,Sat,Rice,77,home,North Texas,20,W


## select only home games

In [25]:
# Keep only the home games, restricted to the columns needed downstream.
keep_cols = ['School','Pts','home_away','Opponent','Opp', 'win_lost']
is_home = rice['home_away'] == 'home'
rice = rice.loc[is_home, keep_cols]

In [26]:
rice.head()

Unnamed: 0_level_0,School,Pts,home_away,Opponent,Opp,win_lost
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-08-29,Rice,56,home,SMU,27,W
2008-09-27,Rice,77,home,North Texas,20,W
2008-10-18,Rice,45,home,Southern Mississippi,40,W
2008-11-08,Rice,38,home,Army,31,W
2008-11-22,Rice,35,home,Marshall,10,W


## only games from 2010 to 2017

In [27]:
# Keep the rows whose index year is 2010 or later.
from_2010 = rice.index.year >= 2010
rice = rice[from_2010]

In [28]:
# Write the cleaned home-game table to disk.
clean_path = 'data/clean/Rice_clean.csv'
rice.to_csv(clean_path)

In [29]:
ls data/clean

[0m[01;32mRice_clean.csv[0m*  [01;32mUH_clean.csv[0m*
