# Clean Data
Authors: Connor Finn, Riley Greene <br>
Date: March 2, 2020


At this point, we have collected data on all the teams which competed in our list of NCAA tournament basketball games.
Now, it is important that we reorganize this data into a useful framework for machine learning analysis.

## Schema
We would like to reorganize our data into a framework which resembles the schema shown below.

In [178]:
%matplotlib notebook
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Read the schema diagram from schema.png (relative to the working directory)
# and render it as a figure.
image = mpimg.imread("schema.png")
plt.imshow(image)
plt.show()

<IPython.core.display.Javascript object>

## Read in our data

In [179]:
import pandas as pd
# Load the three raw tables from the collected_data/ folder (paths are
# relative to the working directory).
season_data = pd.read_csv("collected_data/season_data.csv")
more_team_data = pd.read_csv("collected_data/more_team_data.csv")
ncaa_short = pd.read_csv("collected_data/ncaa_short.csv")

### We can first get a sense of what the data looks like. 
* Season_data is a data set of statistics for a given team
    + teams are distinguished by the given id and the year
    + the id is unique to school but not season


In [180]:
# Summary statistics (count, mean, std, quartiles) of season_data's numeric columns.
season_data.describe()

Unnamed: 0,G,MP,FG,FGA,FG%,2P,2PA,2P%,3P,3PA,...,TRB,AST,STL,BLK,TOV,PF,PTS,PTS/G,Team_ID,Date
count,674.0,656.0,674.0,674.0,674.0,674.0,674.0,674.0,674.0,674.0,...,674.0,674.0,674.0,674.0,674.0,674.0,674.0,674.0,674.0,674.0
mean,34.847181,7028.358232,903.54451,1974.548961,0.457424,663.152819,1306.97181,0.507782,240.391691,667.577151,...,1242.977745,495.264095,233.535608,137.468843,425.669139,617.350148,2576.28635,73.887537,1294.651335,2013.535608
std,1.841917,368.077935,91.877536,167.085554,0.021913,81.639144,151.012209,0.028389,49.973107,115.942385,...,126.566973,70.451832,47.083998,40.82081,52.668681,66.452783,242.878876,5.139888,105.025456,2.861759
min,30.0,6025.0,671.0,1425.0,0.38,448.0,878.0,0.426,124.0,394.0,...,796.0,304.0,118.0,46.0,258.0,457.0,1926.0,59.3,1103.0,2009.0
25%,34.0,6800.0,837.0,1865.25,0.443,606.25,1209.0,0.488,203.0,580.5,...,1153.5,446.25,201.0,108.0,390.0,571.25,2409.75,70.1,1211.0,2011.0
50%,35.0,7025.0,899.0,1967.5,0.457,657.0,1302.5,0.507,236.0,659.5,...,1243.0,489.5,231.0,134.0,423.0,612.5,2569.0,73.6,1294.5,2014.0
75%,36.0,7231.75,963.75,2077.75,0.472,716.75,1394.75,0.525,273.0,745.75,...,1328.75,540.75,260.75,162.0,458.75,661.0,2738.5,77.4,1391.5,2016.0
max,41.0,8350.0,1235.0,2633.0,0.522,1011.0,1893.0,0.592,464.0,1158.0,...,1748.0,771.0,430.0,344.0,583.0,828.0,3463.0,89.8,1463.0,2018.0


* Like season_data, more_team_data is a data set of statistics for a given team
    + teams are distinguished by the given id and the year
    + the id is unique to school but not season

In [181]:
# Summary statistics of more_team_data's numeric columns; the count row shows
# which columns are only partially populated.
more_team_data.describe()

Unnamed: 0,Rk,W-L%,W-L%.1,SRS,SOS,PTS,PTS.1,AP Pre,AP High,AP Final,Seed
count,2600.0,2600.0,2590.0,2591.0,2591.0,2599.0,2599.0,301.0,544.0,275.0,736.0
mean,6.493846,0.564278,0.551071,3.34303,1.459128,71.214275,68.439631,13.039867,11.597426,12.989091,8.728261
std,3.451522,0.164415,0.20616,9.648519,5.571433,5.586874,5.240085,7.244198,7.509949,7.224999,4.662143
min,1.0,0.065,0.0,-29.5,-12.71,52.6,51.2,1.0,1.0,1.0,1.0
25%,3.0,0.455,0.389,-3.695,-3.165,67.4,64.9,7.0,5.0,7.0,5.0
50%,6.0,0.576,0.556,3.31,1.05,71.3,68.3,13.0,11.0,13.0,9.0
75%,9.0,0.686,0.714,10.7,6.675,75.0,71.9,19.0,18.0,19.0,13.0
max,12.0,0.974,1.0,28.72,12.79,89.8,90.9,25.0,25.0,25.0,16.0


* ncaa_short is a dataframe of games which occurred in the NCAA tournament
    + games have a winning team ID, a losing team ID, and a Season
    + These IDs are unique to school, but not season.

In [182]:
# Summary statistics of ncaa_short's numeric columns.
ncaa_short.describe()

Unnamed: 0.1,Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,NumOT
count,664.0,664.0,664.0,664.0,664.0,664.0,664.0,664.0
mean,1851.5,2013.536145,138.993976,1291.263554,74.88253,1295.033133,63.480422,0.057229
std,191.824573,2.861597,4.225181,101.213936,10.516441,105.058838,10.066127,0.268579
min,1520.0,2009.0,134.0,1104.0,47.0,1103.0,34.0,0.0
25%,1685.75,2011.0,136.0,1211.0,68.0,1211.0,57.0,0.0
50%,1851.5,2014.0,137.0,1277.0,75.0,1295.0,63.0,0.0
75%,2017.25,2016.0,139.0,1387.0,81.25,1392.25,70.0,0.0
max,2183.0,2018.0,154.0,1463.0,105.0,1463.0,96.0,2.0


### Start cleaning the data.
* The first step is to merge more_team_data and season_data on the school and the year

In [183]:
# Prepare the dataframes for the merge: both frames need a 'Team' column and
# an integer 'Date' column so they can be joined on ['Team', 'Date'].
more_team_data = more_team_data.rename(columns={"Season": "Date", "team": "Team"})
# The season label is assumed to look like "2008-09" (the previous code sliced
# characters 0-1 and 5-6 of it). Use start year + 1 rather than gluing the
# century prefix onto the two-digit suffix, which turned "1999-00" into 1900
# instead of 2000.
more_team_data["Date"] = more_team_data["Date"].map(lambda season: int(season[:4]) + 1)
# season_data already stores the year; just force it to int so the join keys match.
season_data["Date"] = season_data["Date"].map(int)

In [184]:
# execute the merge: a left join keeps every season_data row and attaches the
# more_team_data columns that share the same ('Team', 'Date') pair.
# Columns present in both frames (other than the keys) get _x / _y suffixes.
season_stats = pd.merge(season_data, more_team_data, on=[ 'Team' , 'Date' ] , how = 'left')

In [185]:
# rename Team_ID to school_id: this id identifies the school only, so the name
# team_id is freed up for the per-row id generated in the next step.
season_stats = season_stats.rename( columns = {'Team_ID': 'school_id'})

#### Generate a new, team_id: unique to year and school

In [186]:
# Give each row of season_stats a consecutive id starting at 100001.
# NOTE: the ids follow the current row order, so they change if the
# rows are reordered before this cell runs.
initial_value = 100001
season_stats['team_id'] = range(initial_value, len(season_stats) +initial_value)


#### Create the Teams DataFrame

In [187]:
# Teams table: maps each generated team_id to its school_id and Date.
# .copy() makes it an independent frame rather than a slice of season_stats.
teams = season_stats[["team_id" , 'school_id' , "Date"]].copy()

In [188]:
# Preview the first five rows of teams.
teams.head()

Unnamed: 0,team_id,school_id,Date
0,100001,1287.0,2009
1,100002,1163.0,2009
2,100003,1181.0,2009
3,100004,1211.0,2009
4,100005,1261.0,2009


#### Create the Schools DataFrame
At this point, we decided that the school_id and the sports-reference name were sufficient.

In [189]:
# Schools table: one row per school. season_stats holds one row per
# (school, season), so the same (school_id, Team) pair repeats for every
# season a school appears in; drop those repeats and renumber the index.
schools = season_stats[['school_id', 'Team']].drop_duplicates().reset_index(drop=True)

In [190]:
# This will get it closer to the Schema: the 'Team' column is renamed to
# sports_reference_name, then the first rows are previewed.
schools = schools.rename(columns = {'Team': "sports_reference_name"})
schools.head()

Unnamed: 0,school_id,sports_reference_name
0,1287.0,morehead-state
1,1163.0,connecticut
2,1181.0,duke
3,1211.0,gonzaga
4,1261.0,louisiana-state


#### Create the ncaa_games DataFrame

In [191]:
# Get the winner TeamID: look up the winner's (WTeamID, Season) in teams to
# fetch its team_id. Left join, so a game with no match keeps a NaN team_id.
ncaa_games  = pd.merge(ncaa_short, teams, left_on=[ 'WTeamID' , 'Season' ] , right_on=[ "school_id", "Date" ] ,how = 'left')
# Drop the lookup keys and the columns not kept in the games table; teams' Date
# column is left in place here and removed after the loser merge.
ncaa_games = ncaa_games.drop(['WTeamID' , 'school_id' , "WLoc" , "NumOT"] , axis = 1)
# The winner is team 1 at this stage.
ncaa_games = ncaa_games.rename(columns = {'team_id': "team_1_id"})


In [192]:
# Get the Loser Team ID: same lookup as for the winner, keyed on (LTeamID, Season).
ncaa_games  = pd.merge(ncaa_games, teams, left_on=[ 'LTeamID' , 'Season' ] , right_on=[ "school_id", "Date" ] ,how = 'left')
# Date_x / Date_y are the two copies of teams' Date column brought in by the
# winner and loser merges; drop them together with the remaining raw columns.
ncaa_games = ncaa_games.drop(['LTeamID' , 'school_id' , "Date_x" , "Date_y" , "Season" , "Unnamed: 0" , "DayNum"] , axis = 1)
# The loser is team 2 at this stage; scores are renamed to match.
ncaa_games = ncaa_games.rename(columns = {'team_id': "team_2_id" , "WScore": "team_1_score" , "LScore": "team_2_score"})


In [193]:
# create the unique game_id: consecutive integers starting at 1, one per row.
# NOTE: this rebinds initial_value, which was 100001 for the team ids.
initial_value = 1
ncaa_games['game_id'] = range(initial_value, len(ncaa_games) +initial_value)
ncaa_games.head()

Unnamed: 0,team_1_score,team_2_score,team_1_id,team_2_id,game_id
0,58,43,100001,100034,1
1,103,47,100002,100035,2
2,86,62,100003,100036,3
3,77,64,100004,100037,4
4,75,71,100005,100038,5


#### Finalize the season_stats Dataframe

In [194]:
# List every column of the merged frame before choosing which ones to keep.
season_stats.columns

Index(['Team', 'G', 'MP', 'FG', 'FGA', 'FG%', '2P', '2PA', '2P%', '3P', '3PA',
       '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK',
       'TOV', 'PF', 'PTS_x', 'PTS/G', 'school_id', 'Date', 'Rk', 'Conf', 'W',
       'L', 'W-L%', 'W.1', 'L.1', 'W-L%.1', 'SRS', 'SOS', 'PTS_y', 'PTS.1',
       'AP Pre', 'AP High', 'AP Final', 'NCAA Tournament', 'Seed', 'Coach(es)',
       'Unnamed: 19', 'team_id'],
      dtype='object')

In [195]:
# Let's only keep the numerical columns we care about, plus team_id as the key.
cols_keep = ['team_id', 'FG' , 'FGA', 'FG%', '2P' , '2PA', '2P%', '3P' , '3PA', '3P%', 'FT' , 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB' , 'AST', 'STL', 'BLK', 'TOV', 'PF',  'PTS/G','W-L%', 'SRS', 'SOS','PTS.1','Seed' ]
# Selecting by list drops every other column (names, ids, rankings, coach, ...).
season_stats = season_stats[cols_keep]
# 'PTS.1' comes from more_team_data; give it a descriptive name.
season_stats = season_stats.rename(columns = {"PTS.1": "opp_PPG"})

season_stats.head()

Unnamed: 0,team_id,FG,FGA,FG%,2P,2PA,2P%,3P,3PA,3P%,...,STL,BLK,TOV,PF,PTS/G,W-L%,SRS,SOS,opp_PPG,Seed
0,100001,865,1978,0.437,677,1413,0.479,188,565,0.333,...,239,139,557,657,69.8,0.556,-3.14,-3.61,67.0,16.0
1,100002,996,2102,0.474,831,1618,0.514,165,484,0.341,...,209,280,458,471,78.3,0.861,22.17,8.57,64.3,1.0
2,100003,971,2186,0.444,704,1420,0.496,267,766,0.349,...,311,146,454,671,77.5,0.811,20.1,9.07,65.9,2.0
3,100004,967,1982,0.488,716,1339,0.535,251,643,0.39,...,246,174,386,551,78.9,0.824,17.95,3.55,63.0,4.0
4,100005,937,2093,0.448,724,1521,0.476,213,572,0.372,...,267,212,425,609,74.8,0.771,11.87,3.27,66.2,8.0


# At this point, we should have the schema

### Do we have missing Data?


In [196]:
def check_null(df):
    """
    Count the missing entries of a dataframe, column by column.

    Only columns that hold at least one null are reported. The result is a
    Series indexed by column name; it is empty when nothing is missing.
    """
    has_gap = df.isna().any()
    incomplete = df.columns[has_gap]
    return df[incomplete].isna().sum()
    

In [197]:
# Null count per column of ncaa_games; an empty Series means nothing is missing.
check_null(ncaa_games)

Series([], dtype: float64)

In [198]:
# Null count per column of season_stats; an empty Series means nothing is missing.
check_null(season_stats)

Series([], dtype: float64)

In [199]:
# Null count per column of teams; an empty Series means nothing is missing.
check_null(teams)

Series([], dtype: float64)

In [200]:
# Null count per column of schools; an empty Series means nothing is missing.
check_null(schools)

Series([], dtype: float64)

There is no missing data in our cleaned dataframes.

### Shuffle team1 and team2 in the ncaa_games df to eliminate selection bias

As of now, team_1 always wins. We don't want any inherent biases in our data set, so we will randomly swap team_1 and team_2 in each game.

In [201]:
%%capture
def shuffle(row):
    """
    Randomly decide whether to swap team 1 and team 2 for a single game.

    Originally the winner is always team 1. With probability 1/2 the ids and
    scores of the two teams are exchanged; otherwise the game is left as is.

    Parameters
    ----------
    row : a game row exposing "team_1_id", "team_1_score", "team_2_id" and
        "team_2_score" (e.g. a row Series handed over by DataFrame.apply).

    Returns
    -------
    The row to keep for this game. A row is returned in BOTH cases; the
    previous version returned None for the games it left untouched.
    """
    from random import choice

    if choice([0, 1]) == 0:
        # everything stays the same
        return row

    # switch the teams; work on a copy so the caller's row is not mutated
    swapped = row.copy()
    swapped["team_1_id"], swapped["team_2_id"] = row["team_2_id"], row["team_1_id"]
    swapped["team_1_score"], swapped["team_2_score"] = row["team_2_score"], row["team_1_score"]
    return swapped


# apply() builds a NEW dataframe from the returned rows and leaves ncaa_games
# untouched, so the result has to be assigned back or the shuffle is lost.
ncaa_games = ncaa_games.apply(shuffle, axis=1)

Save our data for future use

In [202]:
import os

# make folder; exist_ok=True makes this a no-op when the folder is already
# there and avoids the check-then-create race of a separate exists() test.
os.makedirs('./clean_data', exist_ok=True)

# write our df's; index=False keeps the positional index out of the files so
# no "Unnamed: 0" column appears when they are read back.
season_stats.to_csv('clean_data/season_stats.csv', index=False)
teams.to_csv('clean_data/teams.csv', index=False)
schools.to_csv('clean_data/schools.csv', index=False)
ncaa_games.to_csv('clean_data/ncaa_games.csv', index=False)