In [1]:
import pandas as pd
import numpy as np
import glob, os
import re

## Location

In [2]:
ls sports_data/NCAA/

[0m[01;32mRU08_17.csv[0m*  [01;32mtsu08_11.csv[0m*  [01;32mtsu12_17.csv[0m*  [01;32muh08_17.csv[0m*


## Load Data

In [3]:
path = 'sports_data/NCAA/uh08_17.csv'
df = pd.read_csv(path)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 16 columns):
G             130 non-null int64
Date          130 non-null object
Time          65 non-null object
Day           130 non-null object
School        130 non-null object
Unnamed: 5    69 non-null object
Opponent      130 non-null object
Conf          130 non-null object
Unnamed: 8    130 non-null object
Pts           130 non-null int64
Opp           130 non-null int64
W             130 non-null int64
L             130 non-null int64
Streak        130 non-null object
TV            31 non-null object
Notes         6 non-null object
dtypes: int64(5), object(11)
memory usage: 16.3+ KB


In [5]:
df.head()

Unnamed: 0,G,Date,Time,Day,School,Unnamed: 5,Opponent,Conf,Unnamed: 8,Pts,Opp,W,L,Streak,TV,Notes
0,1,Sep 9 2017,10:30 PM,Sat,Houston,@,Arizona,Pac-12,W,19,16,1,0,W 1,,
1,2,Sep 16 2017,8:00 PM,Sat,Houston,,Rice,CUSA,W,38,3,2,0,W 2,,
2,3,Sep 23 2017,12:00 PM,Sat,Houston,,Texas Tech,Big 12,L,24,27,2,1,L 1,,
3,4,Sep 30 2017,12:00 PM,Sat,Houston,@,Temple,American,W,20,13,3,1,W 1,,
4,5,Oct 7 2017,7:00 PM,Sat,Houston,,SMU,American,W,35,22,4,1,W 2,,


In [6]:
df['Notes'].unique()

array([nan, 'Hawaii Bowl', 'Las Vegas Bowl', 'American Championship Game',
       'Peach Bowl (Atlanta GA)', 'Paul Brown Stadium - Cincinnati OH',
       'Armed Forces Bowl (Fort Worth TX)'], dtype=object)

## Create slice with only needed columns

In [7]:
uh = df[['Date','Day','School','Pts','Unnamed: 5','Opponent','Opp','Unnamed: 8','Notes']]

In [8]:
uh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 9 columns):
Date          130 non-null object
Day           130 non-null object
School        130 non-null object
Pts           130 non-null int64
Unnamed: 5    69 non-null object
Opponent      130 non-null object
Opp           130 non-null int64
Unnamed: 8    130 non-null object
Notes         6 non-null object
dtypes: int64(2), object(7)
memory usage: 9.2+ KB


## Inspect data

In [10]:
uh.School.unique()  # need fix

array(['Houston', '(15) Houston', '(6) Houston', '(13) Houston',
       '(11) Houston', '(18) Houston', '(24) Houston', '(21) Houston',
       '(16) Houston', '(17) Houston', '(14) Houston', '(8) Houston',
       '(7) Houston', '(20) Houston', '(23) Houston', '(12) Houston',
       '(25) Houston'], dtype=object)

In [12]:
uh['Unnamed: 8'].unique()  # ok

array(['W', 'L'], dtype=object)

In [14]:
uh['Unnamed: 5'].unique()  # need fix

array(['@', nan, 'N'], dtype=object)

In [16]:
uh['Opponent'].unique()  # need fix

array(['Arizona', 'Rice', 'Texas Tech', 'Temple', 'SMU', 'Tulsa',
       '(25) Memphis', '(17) South Florida', 'East Carolina', 'Tulane',
       'Navy', 'Fresno State', '(3) Oklahoma', 'Lamar', 'Cincinnati',
       'Texas State', 'Connecticut', 'UCF', '(3) Louisville', 'Memphis',
       'San Diego State', 'Tennessee Tech', 'Louisville', 'Vanderbilt',
       '(16) Navy', '(20) Temple', '(9) Florida State', 'UTSA',
       'Grambling State', '(25) Brigham Young', 'Nevada-Las Vegas',
       'South Florida', 'Pitt', 'Southern', 'Brigham Young', 'Rutgers',
       '(19) UCF', '(19) Louisville', 'Louisiana Tech', '(22) UCLA',
       'North Texas', 'UAB', 'UTEP', 'Marshall', 'UCLA', 'Georgia State',
       '(24) Southern Mississippi', '(24) Penn State',
       'Mississippi State', 'Southern Mississippi', 'Northwestern State',
       '(5) Oklahoma State', 'Air Force', 'Oklahoma State',
       'Colorado State', '(23) East Carolina', '(25) Tulsa'], dtype=object)

## Change School values to just Houston

In [17]:
uh['School'].value_counts()

Houston         93
(6) Houston      5
(18) Houston     5
(13) Houston     4
(11) Houston     3
(21) Houston     3
(17) Houston     3
(24) Houston     2
(23) Houston     2
(14) Houston     2
(15) Houston     2
(8) Houston      1
(25) Houston     1
(16) Houston     1
(20) Houston     1
(7) Houston      1
(12) Houston     1
Name: School, dtype: int64

In [18]:
uh['School'] = 'Houston'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
uh['School'].value_counts()

Houston    130
Name: School, dtype: int64

## Rename columns
- Unnamed: 5 : win_lost

In [20]:
uh.rename(columns={'Unnamed: 8':'win_lost',
                  'Unnamed: 5':'home_away'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [21]:
uh.head()

Unnamed: 0,Date,Day,School,Pts,home_away,Opponent,Opp,win_lost,Notes
0,Sep 9 2017,Sat,Houston,19,@,Arizona,16,W,
1,Sep 16 2017,Sat,Houston,38,,Rice,3,W,
2,Sep 23 2017,Sat,Houston,24,,Texas Tech,27,L,
3,Sep 30 2017,Sat,Houston,20,@,Temple,13,W,
4,Oct 7 2017,Sat,Houston,35,,SMU,22,W,


## Change column
- change home_away values to  home or away

In [None]:
#uh['home_away'] = np.where(uh['home_away'] =='@', 'AWAY', 'HOME')

In [22]:
uh['home_away'].unique()

array(['@', nan, 'N'], dtype=object)

## Change values of '@` to 'Away
- create boolean mask to find rows whose value is '@`m

In [26]:
uh['home_away'] = uh['home_away'].replace('@','away')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [27]:
uh['home_away'] = uh['home_away'].fillna('home')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [29]:
uh['home_away'].value_counts()

home    61
away    53
N       16
Name: home_away, dtype: int64

## Show values of 'N'

- create boolean mask to find rows whose value is 'N

In [30]:
n_val = uh['home_away']  =='N'
uh[n_val]

Unnamed: 0,Date,Day,School,Pts,home_away,Opponent,Opp,win_lost,Notes
11,Dec 24 2017,Sun,Houston,27,N,Fresno State,33,L,Hawaii Bowl
12,Sep 3 2016,Sat,Houston,33,N,(3) Oklahoma,23,W,
24,Dec 17 2016,Sat,Houston,10,N,San Diego State,34,L,Las Vegas Bowl
38,Dec 31 2015,Thu,Houston,38,N,(9) Florida State,24,W,Peach Bowl (Atlanta GA)
50,Dec 6 2014,Sat,Houston,31,N,Cincinnati,38,L,Paul Brown Stadium - Cincinnati OH
51,Jan 2 2015,Fri,Houston,35,N,Pitt,34,W,Armed Forces Bowl (Fort Worth TX)
54,Sep 21 2013,Sat,Houston,31,N,Rice,26,W,
56,Oct 12 2013,Sat,Houston,25,N,Memphis,15,W,
57,Oct 19 2013,Sat,Houston,46,N,Brigham Young,47,L,
62,Nov 23 2013,Sat,Houston,17,N,Cincinnati,24,L,


### They seem to be all away games 

In [31]:
uh['home_away'] = uh['home_away'].replace('N','away')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [32]:
uh['home_away'].value_counts()

away    69
home    61
Name: home_away, dtype: int64

## Clean up some columns

In [33]:
uh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 9 columns):
Date         130 non-null object
Day          130 non-null object
School       130 non-null object
Pts          130 non-null int64
home_away    130 non-null object
Opponent     130 non-null object
Opp          130 non-null int64
win_lost     130 non-null object
Notes        6 non-null object
dtypes: int64(2), object(7)
memory usage: 9.2+ KB


In [37]:
uh['Opponent'].unique()

array(['Arizona', 'Rice', 'Texas Tech', 'Temple', 'SMU', 'Tulsa',
       '(25) Memphis', '(17) South Florida', 'East Carolina', 'Tulane',
       'Navy', 'Fresno State', '(3) Oklahoma', 'Lamar', 'Cincinnati',
       'Texas State', 'Connecticut', 'UCF', '(3) Louisville', 'Memphis',
       'San Diego State', 'Tennessee Tech', 'Louisville', 'Vanderbilt',
       '(16) Navy', '(20) Temple', '(9) Florida State', 'UTSA',
       'Grambling State', '(25) Brigham Young', 'Nevada-Las Vegas',
       'South Florida', 'Pitt', 'Southern', 'Brigham Young', 'Rutgers',
       '(19) UCF', '(19) Louisville', 'Louisiana Tech', '(22) UCLA',
       'North Texas', 'UAB', 'UTEP', 'Marshall', 'UCLA', 'Georgia State',
       '(24) Southern Mississippi', '(24) Penn State',
       'Mississippi State', 'Southern Mississippi', 'Northwestern State',
       '(5) Oklahoma State', 'Air Force', 'Oklahoma State',
       'Colorado State', '(23) East Carolina', '(25) Tulsa'], dtype=object)

## Remove parenthesis from `uh['Opponent']`

In [38]:
uh['Opponent']= uh['Opponent'].str.replace(r"\(.*\)"," ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [39]:
uh['Opponent'].unique()

array(['Arizona', 'Rice', 'Texas Tech', 'Temple', 'SMU', 'Tulsa',
       '  Memphis', '  South Florida', 'East Carolina', 'Tulane', 'Navy',
       'Fresno State', '  Oklahoma', 'Lamar', 'Cincinnati', 'Texas State',
       'Connecticut', 'UCF', '  Louisville', 'Memphis', 'San Diego State',
       'Tennessee Tech', 'Louisville', 'Vanderbilt', '  Navy', '  Temple',
       '  Florida State', 'UTSA', 'Grambling State', '  Brigham Young',
       'Nevada-Las Vegas', 'South Florida', 'Pitt', 'Southern',
       'Brigham Young', 'Rutgers', '  UCF', 'Louisiana Tech', '  UCLA',
       'North Texas', 'UAB', 'UTEP', 'Marshall', 'UCLA', 'Georgia State',
       '  Southern Mississippi', '  Penn State', 'Mississippi State',
       'Southern Mississippi', 'Northwestern State', '  Oklahoma State',
       'Air Force', 'Oklahoma State', 'Colorado State', '  East Carolina',
       '  Tulsa'], dtype=object)

## Remove extra spaces

In [40]:
uh['Opponent'] = uh['Opponent'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [41]:
uh['Opponent'].unique()

array(['Arizona', 'Rice', 'Texas Tech', 'Temple', 'SMU', 'Tulsa',
       'Memphis', 'South Florida', 'East Carolina', 'Tulane', 'Navy',
       'Fresno State', 'Oklahoma', 'Lamar', 'Cincinnati', 'Texas State',
       'Connecticut', 'UCF', 'Louisville', 'San Diego State',
       'Tennessee Tech', 'Vanderbilt', 'Florida State', 'UTSA',
       'Grambling State', 'Brigham Young', 'Nevada-Las Vegas', 'Pitt',
       'Southern', 'Rutgers', 'Louisiana Tech', 'UCLA', 'North Texas',
       'UAB', 'UTEP', 'Marshall', 'Georgia State', 'Southern Mississippi',
       'Penn State', 'Mississippi State', 'Northwestern State',
       'Oklahoma State', 'Air Force', 'Colorado State'], dtype=object)

## convert Date to datetime

In [42]:
uh['Date']= pd.to_datetime(uh['Date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [43]:
uh.head()

Unnamed: 0,Date,Day,School,Pts,home_away,Opponent,Opp,win_lost,Notes
0,2017-09-09,Sat,Houston,19,away,Arizona,16,W,
1,2017-09-16,Sat,Houston,38,home,Rice,3,W,
2,2017-09-23,Sat,Houston,24,home,Texas Tech,27,L,
3,2017-09-30,Sat,Houston,20,away,Temple,13,W,
4,2017-10-07,Sat,Houston,35,home,SMU,22,W,


## Reindex dataframe by date

- reindex the dataframe and sort the dates

In [44]:
uh = uh.set_index('Date').sort_index(ascending=True)

In [45]:
uh.head()

Unnamed: 0_level_0,Day,School,Pts,home_away,Opponent,Opp,win_lost,Notes
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2008-08-30,Sat,Houston,55,home,Southern,3,W,
2008-09-06,Sat,Houston,37,away,Oklahoma State,56,L,
2008-09-13,Sat,Houston,28,home,Air Force,31,L,
2008-09-20,Sat,Houston,25,away,Colorado State,28,L,
2008-09-27,Sat,Houston,41,away,East Carolina,24,W,


## Save clean data to  to csv

In [46]:
uh.to_csv('clean_data/uh_08_17_clean.csv')

In [47]:
ls clean_data/

[0m[01;32mastros_08_17_clean.csv[0m*  [01;32mrice_08_17_clean.csv[0m*  [01;32muh_08_17_clean.csv[0m*


In [48]:
##Finish