In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import glob, os
import numpy as np
import matplotlib.pyplot as plt
import datetime  as dt
import seaborn as sns

## Data Directory

In [3]:
sports = 'NCAA'

In [4]:
data_directory = os.path.join('..','data','sports_data/{}/RU08_17.csv'.format(sports))
data_directory_saves = os.path.join( '..','data','clean_data','sports_data/{}/'.format(sports))

In [5]:
df = pd.read_csv(data_directory)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 16 columns):
G              125 non-null int64
Date           125 non-null object
Time           63 non-null object
Day            125 non-null object
School         125 non-null object
Unnamed: 5     67 non-null object
Opponent       125 non-null object
Conf           125 non-null object
Unnamed: 8     125 non-null object
Pts            125 non-null int64
Opp            125 non-null int64
W              125 non-null int64
L              125 non-null int64
Streak         125 non-null object
Notes          26 non-null object
Unnamed: 15    1 non-null object
dtypes: int64(5), object(11)
memory usage: 15.7+ KB


In [7]:
df.head()

Unnamed: 0,G,Date,Time,Day,School,Unnamed: 5,Opponent,Conf,Unnamed: 8,Pts,Opp,W,L,Streak,Notes,Unnamed: 15
0,1,Aug 29 2008,,Fri,Rice,,SMU,CUSA,W,56,27,1,0,W 1,,
1,2,Sep 6 2008,,Sat,Rice,@,Memphis,CUSA,W,42,35,2,0,W 2,,
2,3,Sep 13 2008,,Sat,Rice,@,Vanderbilt,SEC,L,21,38,2,1,L 1,,
3,4,Sep 20 2008,,Sat,Rice,@,(7) Texas,Big 12,L,10,52,2,2,L 2,,
4,5,Sep 27 2008,,Sat,Rice,,North Texas,Sun Belt,W,77,20,3,2,W 1,,


## Drop columns

In [8]:
df

Unnamed: 0,G,Date,Time,Day,School,Unnamed: 5,Opponent,Conf,Unnamed: 8,Pts,Opp,W,L,Streak,Notes,Unnamed: 15
0,1,Aug 29 2008,,Fri,Rice,,SMU,CUSA,W,56,27,1,0,W 1,,
1,2,Sep 6 2008,,Sat,Rice,@,Memphis,CUSA,W,42,35,2,0,W 2,,
2,3,Sep 13 2008,,Sat,Rice,@,Vanderbilt,SEC,L,21,38,2,1,L 1,,
3,4,Sep 20 2008,,Sat,Rice,@,(7) Texas,Big 12,L,10,52,2,2,L 2,,
4,5,Sep 27 2008,,Sat,Rice,,North Texas,Sun Belt,W,77,20,3,2,W 1,,
5,6,Oct 4 2008,,Sat,Rice,@,Tulsa,CUSA,L,28,63,3,3,L 1,,
6,7,Oct 18 2008,,Sat,Rice,,Southern Mississippi,CUSA,W,45,40,4,3,W 1,,
7,8,Oct 25 2008,,Sat,Rice,@,Tulane,CUSA,W,42,17,5,3,W 2,,
8,9,Nov 1 2008,,Sat,Rice,@,UTEP,CUSA,W,49,44,6,3,W 3,,
9,10,Nov 8 2008,,Sat,Rice,,Army,Ind,W,38,31,7,3,W 4,,


In [9]:
df.drop(['G','Time',
 'Day',
 'Conf',
 'W',
 'L',
 'Streak',
 'Notes',
 'Unnamed: 15'],axis=1,inplace=True)

In [10]:
df.head()

Unnamed: 0,Date,School,Unnamed: 5,Opponent,Unnamed: 8,Pts,Opp
0,Aug 29 2008,Rice,,SMU,W,56,27
1,Sep 6 2008,Rice,@,Memphis,W,42,35
2,Sep 13 2008,Rice,@,Vanderbilt,L,21,38
3,Sep 20 2008,Rice,@,(7) Texas,L,10,52
4,Sep 27 2008,Rice,,North Texas,W,77,20


## change date to datetime index

In [11]:
# Map the lowering function to all column names
df.columns = map(str.lower, df.columns)

In [12]:
df.date = pd.to_datetime(df['date'])
df = df.set_index('date').sort_index(ascending=True)

In [13]:
df.head()

Unnamed: 0_level_0,school,unnamed: 5,opponent,unnamed: 8,pts,opp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-08-29,Rice,,SMU,W,56,27
2008-09-06,Rice,@,Memphis,W,42,35
2008-09-13,Rice,@,Vanderbilt,L,21,38
2008-09-20,Rice,@,(7) Texas,L,10,52
2008-09-27,Rice,,North Texas,W,77,20


## chane unnamed:5 to home_away

In [14]:
df.rename(columns={
    'unnamed: 5': 'home_away',
    'unnamed: 8': 'win_lost',
}, inplace=True)

In [15]:
df.home_away.fillna('home',inplace=True)

In [16]:
df.head()

Unnamed: 0_level_0,school,home_away,opponent,win_lost,pts,opp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-08-29,Rice,home,SMU,W,56,27
2008-09-06,Rice,@,Memphis,W,42,35
2008-09-13,Rice,@,Vanderbilt,L,21,38
2008-09-20,Rice,@,(7) Texas,L,10,52
2008-09-27,Rice,home,North Texas,W,77,20


## select home games

In [17]:
home_games = df.home_away == 'home'
df = df[home_games]
df.drop(['home_away'],axis=1,inplace=True)

In [18]:
df.head()

Unnamed: 0_level_0,school,opponent,win_lost,pts,opp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008-08-29,Rice,SMU,W,56,27
2008-09-27,Rice,North Texas,W,77,20
2008-10-18,Rice,Southern Mississippi,W,45,40
2008-11-08,Rice,Army,W,38,31
2008-11-22,Rice,Marshall,W,35,10


## clean columns

In [19]:
df.school.unique()

array(['Rice'], dtype=object)

In [20]:
df.opponent.unique()

array(['SMU', 'North Texas', 'Southern Mississippi', 'Army', 'Marshall',
       'Houston', 'Western Michigan', 'Vanderbilt', 'Tulsa', 'Navy',
       'UCF', 'Tulane', 'UTEP', '(5) Texas', 'Northwestern', 'Baylor',
       'East Carolina', 'UAB', 'Purdue', 'Memphis', 'UCLA', 'UTSA',
       'Kansas', 'Florida Atlantic', 'Louisiana Tech', 'Old Dominion',
       'Hawaii', 'Wagner', 'Western Kentucky', 'Charlotte', '(21) Baylor',
       'Prairie View A&M', 'Florida International'], dtype=object)

## remove extra characters line

In [21]:
df.opponent = df.opponent.str.replace(r"\(.*\)"," ")

In [22]:
df.opponent.unique()

array(['SMU', 'North Texas', 'Southern Mississippi', 'Army', 'Marshall',
       'Houston', 'Western Michigan', 'Vanderbilt', 'Tulsa', 'Navy',
       'UCF', 'Tulane', 'UTEP', '  Texas', 'Northwestern', 'Baylor',
       'East Carolina', 'UAB', 'Purdue', 'Memphis', 'UCLA', 'UTSA',
       'Kansas', 'Florida Atlantic', 'Louisiana Tech', 'Old Dominion',
       'Hawaii', 'Wagner', 'Western Kentucky', 'Charlotte', '  Baylor',
       'Prairie View A&M', 'Florida International'], dtype=object)

In [23]:
df.win_lost.unique()

array(['W', 'L'], dtype=object)

In [24]:
df.pts.unique()

array([56, 77, 45, 38, 35, 17, 10, 14,  7, 28, 30, 13, 31, 34, 62, 24, 20,
       41, 19, 51, 44, 36, 23, 18, 52, 42, 27, 65, 25, 12])

In [25]:
df.opp.unique()

array([27, 20, 40, 31, 10, 42, 14, 36, 63, 49, 29, 34, 30, 38, 23, 22,  6,
       37,  7, 54, 17, 13, 24, 45, 21, 16, 65, 44, 43])

## display null values

In [26]:
df.apply(lambda x: sum(x.isnull()))

school      0
opponent    0
win_lost    0
pts         0
opp         0
dtype: int64

## save dataset

In [27]:
df.to_csv(data_directory_saves+'Rice_2010_2017.csv')