# Clean and Organize data

In [1]:
import os

class bball_pipeline(object):
    """Small helper around the output location for the cleaned data.

    Only the folder-creation step is implemented; ``save_file`` is a stub.
    """

    def make_folder(self):
        """Create ``./clean_data`` (relative to the working directory) if absent.

        Does nothing when something already exists at that path.
        Returns None.
        """
        target = './clean_data'
        # Guard clause: nothing to do when the path is already present.
        if os.path.exists(target):
            return
        os.mkdir(target)

    def save_file(self):
        """Placeholder with no behaviour yet; returns None."""
        return None


In [2]:
# our imports for the model
import pandas as pd
import numpy as np

# Read in our collected Data

In [3]:
# Load the four collected CSVs in one pass; each path is paired, in order,
# with the name it is bound to on the left-hand side.
ncaa_short, team_data, season_data, season_stats2 = map(
    pd.read_csv,
    [
        'collected_data/ncaa_short.csv',      # ncaa tournament results
        'collected_data/team_list.csv',       # team ID, name, sports reference url name
        'collected_data/season_data.csv',     # season long stats for 2009-2018
        'collected_data/more_team_data.csv',  # more team data
    ],
)


In [9]:
# Preview only the first rows instead of rendering the whole frame
ncaa_short.head(10)

Unnamed: 0.1,Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1520,2009,134,1287,58,1106,43,N,0
1,1521,2009,136,1163,103,1151,47,N,0
2,1522,2009,136,1181,86,1127,62,N,0
3,1523,2009,136,1211,77,1103,64,N,0
4,1524,2009,136,1261,75,1139,71,N,0
5,1525,2009,136,1268,84,1143,71,N,0
6,1526,2009,136,1272,81,1169,70,N,0
7,1527,2009,136,1276,62,1155,59,N,0
8,1528,2009,136,1314,101,1347,58,N,0
9,1529,2009,136,1328,82,1288,54,N,0


# Create the 'teams' dataframe

This will be done in the following steps:
1. Create an empty dataframe with the desired column titles.
2. Iterate through the rows of the `ncaa_short` dataframe. Each row is a past NCAA tournament game.
3. For each game, filter the `teams` dataframe that we are building down to the rows for that game's year.
4. If either school is not yet in the filtered `teams` dataframe, add it and assign it a new `team_id`.


In [31]:
# Build `teams`: one row per (school, year) that appears in a tournament game,
# each given its own sequential team_id.

# counter: the next team_id to hand out
id_counter = 1000000

# (year, school_id) pairs that already have a team_id. A set is used because
# `x in some_dataframe` tests the column labels, not the cell values, so it
# cannot answer "is this school already listed for this year?".
seen = set()
# Rows are collected in a plain list and turned into a dataframe once at the
# end: DataFrame.append is deprecated/removed in newer pandas, and growing a
# frame row by row copies it on every step.
rows = []

# iterate through our data; columns are picked by name rather than by position
for year, id_1, id_2 in zip(
    ncaa_short["Season"], ncaa_short["WTeamID"], ncaa_short["LTeamID"]
):
    # winner first, then loser, so ids are issued in order of first appearance
    for school_id in (id_1, id_2):
        if (year, school_id) in seen:
            continue
        seen.add((year, school_id))
        rows.append({"team_id": id_counter, "school_id": school_id, "year": year})
        id_counter += 1  # update counter

# create the dataframe (passing `columns` keeps the layout even with no games)
teams = pd.DataFrame(rows, columns=["team_id", "school_id", "year"])


Let's first get a sense of what our data looks like

In [4]:
# List the column labels of season_stats2 to see which fields are available
season_stats2.columns

Index(['Rk', 'Season', 'Conf', 'W', 'L', 'W-L%', 'W.1', 'L.1', 'W-L%.1', 'SRS',
       'SOS', 'PTS', 'PTS.1', 'AP Pre', 'AP High', 'AP Final',
       'NCAA Tournament', 'Seed', 'Coach(es)', 'Unnamed: 19', 'team'],
      dtype='object')

Remove some columns that are not helpful

In [5]:
# Drop the columns we are not going to use; the result replaces season_stats2.
season_stats2 = season_stats2.drop(
    labels=[
        'Rk',
        'Unnamed: 19',
        'AP Pre',
        'AP High',
        'W.1',
        'L.1',
        'W-L%.1',
        'AP Final',
        'NCAA Tournament',
    ],
    axis='columns',
)

In [6]:
# Summary statistics for every column; include='all' also covers the non-numeric ones
season_stats2.describe(include = 'all')

Unnamed: 0,Season,Conf,W,L,W-L%,SRS,SOS,PTS,PTS.1,Seed,Coach(es),team
count,2600,2600,2600.0,2600.0,2600.0,2591.0,2591.0,2599.0,2599.0,736.0,2600,2600
unique,17,35,50.0,42.0,,,,,,,2488,217
top,2018-19,ACC,20.0,14.0,,,,,,,Sean Miller (27-8),north-carolina-greensboro
freq,217,165,162.0,221.0,,,,,,,3,12
mean,,,,,0.564665,3.337522,1.442721,71.233282,68.418853,8.728261,,
std,,,,,0.165013,9.661552,5.510206,5.602585,5.266216,4.662143,,
min,,,,,0.05,-30.4,-12.71,52.6,50.7,1.0,,
25%,,,,,0.45425,-3.68,-3.11,67.4,64.8,5.0,,
50%,,,,,0.576,3.31,1.04,71.3,68.3,9.0,,
75%,,,,,0.686,10.655,6.565,75.0,71.9,13.0,,


In [7]:
# Inspect the dtype of each column (the output below lists W and L as object)
season_stats2.dtypes

Season        object
Conf          object
W             object
L             object
W-L%         float64
SRS          float64
SOS          float64
PTS          float64
PTS.1        float64
Seed         float64
Coach(es)     object
team          object
dtype: object

In [8]:
# have to get rid of the trailing * (this means the data was changed as a
# result of a penalty).
# The vectorised .str accessor replaces the per-element lambda and passes
# missing values through instead of raising AttributeError on a float NaN.
season_stats2['W'] = season_stats2['W'].str.rstrip('*')
season_stats2['L'] = season_stats2['L'].str.rstrip('*')

In [9]:
# Cast the W and L columns to 32-bit integers; both share the same target dtype.
season_stats2 = season_stats2.astype(dict.fromkeys(['W', 'L'], 'int32'))

In [10]:
# Re-check the dtypes after the cast (the output below lists W and L as int32)
season_stats2.dtypes

Season        object
Conf          object
W              int32
L              int32
W-L%         float64
SRS          float64
SOS          float64
PTS          float64
PTS.1        float64
Seed         float64
Coach(es)     object
team          object
dtype: object