# Clean and Organize data

The data has been reorganized locally using SQL to get it into the newly updated format. All new files are located in the `prelim_cleaned_data` folder in the repository.  
There is still some work to be done here, however. For example, we do not want to keep every column that was collected, and some columns will need to be renamed to fit our schema.

In [237]:
import os

class bball_pipeline(object):
    """Filesystem helpers for the notebook's output folder.

    Only ``make_folder`` does any work at the moment; ``save_file`` is a
    stub that has not been implemented yet.
    """

    def make_folder(self):
        """Ensure the ``./clean_data`` directory exists.

        The path is relative to the current working directory. Calling this
        when the directory is already present is a no-op.

        Raises:
            FileExistsError: if ``./clean_data`` exists but is not a directory.
        """
        # exist_ok=True replaces the separate "exists?" check, so there is no
        # window in which the directory can appear between check and creation.
        os.makedirs('./clean_data', exist_ok=True)

    def save_file(self):
        """Placeholder for saving a cleaned file; currently does nothing."""
        pass


In [238]:
# our imports for the model
import pandas as pd
import numpy as np

## Import our Data

In [239]:
# All inputs are read from the prelim_cleaned_data folder.

# Team key table; its columns are renamed below to team_id / school_id / year.
teams = pd.read_csv("prelim_cleaned_data/distinctteamkey.csv")

# NCAA tournament game results (winner, loser and both scores).
ncaa_games = pd.read_csv("prelim_cleaned_data/ncaa_new.csv")

# School list; its columns are renamed below to
# school_id / school_name / sports_reference_name.
schools = pd.read_csv("prelim_cleaned_data/team_list_new.csv")

# Season-long team stats (originally noted as covering 2009-2018).
season_stats = pd.read_csv("prelim_cleaned_data/big_stats.csv")

## Rename our Columns According to the Schema

The DataFrames we are reconfiguring here are:
* teams
* schools
* ncaa_games

In [240]:
# Schema names for each table, listed in the same order as the existing columns.
team_key_columns = ["team_id", "school_id", "year"]
school_columns = ["school_id", "school_name", "sports_reference_name"]

# Labels are assigned by position, so each list must match its frame's
# column count exactly (pandas raises ValueError otherwise).
teams.columns = team_key_columns
schools.columns = school_columns

In [241]:
# Source columns to keep, in schema order, and the schema name for each one.
# DayNum is not part of the schema, so it is simply never selected.
game_source_cols = ['Identifier', 'Winner', 'Loser', 'WScore', 'LScore']
game_schema_cols = ["game_id", "team1_id", 'team2_id', 'team1_score', 'team2_score']

# reindex() would silently create all-NaN columns for any name that is not
# present (for example when this cell is re-run after the rename has already
# happened), which wipes the data without any error. Fail loudly instead.
missing_cols = [col for col in game_source_cols if col not in ncaa_games.columns]
if missing_cols:
    raise KeyError(
        "ncaa_games is missing expected source columns {}; "
        "reload it from the CSV before re-running this cell".format(missing_cols)
    )

# Select the kept columns in schema order, then rename them by label.
ncaa_games = ncaa_games[game_source_cols].rename(
    columns=dict(zip(game_source_cols, game_schema_cols))
)

## Reorganize the season_stats DataFrame

This takes three steps:

1. Remove columns that are not useful.
2. Rename columns according to the schema.
3. Reorder columns according to the schema.

In [242]:
# Preview the first rows of season_stats before any columns are dropped.
season_stats.head()

Unnamed: 0,Rk,Season,Conf,W,L,WLPercent,W2,L2,WL2Percent,SRS,...,TRB,AST,STL,BLK,TOV,PF,PTS.1,PTSperG,Team_ID,Dates
0,12,2008.0,MAC,23,13,1.0,10,6,1.0,3.0,...,,,,,,,,,,
1,11,2009.0,MAC,24,11,1.0,12,4,1.0,3.0,...,1199.0,446.0,270.0,80.0,490.0,745.0,2403.0,67.0,1103.0,2009.0
2,11,2009.0,MAC,24,11,1.0,12,4,1.0,3.0,...,1199.0,446.0,270.0,80.0,490.0,745.0,2403.0,67.0,1103.0,2009.0
3,10,2010.0,MAC,23,13,1.0,9,7,1.0,2.0,...,,,,,,,,,,
4,9,2011.0,MAC,22,12,1.0,13,3,1.0,6.0,...,1238.0,520.0,240.0,129.0,432.0,692.0,2524.0,70.0,1103.0,2011.0


In [243]:
# Columns of season_stats that are not carried forward.
# Note that 'Team_ID' is dropped here while the separate 'TeamID' column stays.
del_cols = [
    "Rk", "Season", "W", "L", "W2", "L2",
    "AP_Pre", "AP_High",
    "G", "MP", "FG", "TwoP", "ThreeP", "FT", "TRB",
    "PTSperG", "Dates", "Team_ID",
]

# axis=1 makes drop() treat the labels as column names; a label that is not
# present raises KeyError, so this cell cannot be run twice on the same frame.
season_stats = season_stats.drop(del_cols, axis=1)

In [244]:
# Show which columns are left in season_stats after dropping del_cols.
season_stats.columns

Index(['Conf', 'WLPercent', 'WL2Percent', 'SRS', 'SOS', 'PTS', 'PTS2',
       'AP_Final', 'NCAA_Tour', 'Seed', 'Coaching', 'Team', 'SrNames',
       'TeamID', 'schoolid', 'yearid', 'id', 'Team.1', 'FGA', 'FG_Percent',
       'TwoPA', 'TwoP_Percent', 'ThreePA', 'ThreeP_Percent', 'FTA',
       'FT_Percent', 'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS.1'],
      dtype='object')

In [245]:
# (rows, columns) of season_stats after the column drop.
season_stats.shape

(3531, 34)

In [246]:
# (rows, columns) of the teams key table.
teams.shape

(657, 3)

In [247]:
# Keep only the rows of season_stats that have an id.
# Series.notna() is used instead of ~np.isnan(...) because np.isnan raises
# TypeError when the column is not a float dtype (e.g. object), whereas
# notna() handles every dtype and gives the same mask for float columns.
val = season_stats["id"].notna()
season_stats = season_stats[val]

In [248]:
# Distinct id values remaining in season_stats (rows with a missing id were filtered out above).
un = season_stats['id'].unique()

In [249]:
# Largest id present in season_stats.
max(un)

1001275.0

In [250]:
# Largest team_id in the teams key table.
# NOTE(review): presumably compared with max(un) above to see whether the two id ranges line up — confirm.
max(teams["team_id"])

1000656

In [251]:
# (rows, columns) of the schools table.
schools.shape

(217, 3)

# Start working with old Data Here

In [253]:
# Load the season data from the collected_data folder.
# NOTE(review): "old" presumably means the data set that predates prelim_cleaned_data — confirm.
old_data = pd.read_csv("collected_data/season_data.csv")

In [261]:
# Preview the first rows of old_data.
old_data.head()

Unnamed: 0,Team,G,MP,FG,FGA,FG%,2P,2PA,2P%,3P,...,TRB,AST,STL,BLK,TOV,PF,PTS,PTS/G,Team_ID,Date
0,morehead-state,36.0,,865,1978,0.437,677,1413,0.479,188,...,1400,464,239,139,557,657,2511,69.8,1287.0,2009.0
1,connecticut,36.0,7375.0,996,2102,0.474,831,1618,0.514,165,...,1559,567,209,280,458,471,2819,78.3,1163.0,2009.0
2,duke,37.0,7425.0,971,2186,0.444,704,1420,0.496,267,...,1348,492,311,146,454,671,2867,77.5,1181.0,2009.0
3,gonzaga,34.0,6850.0,967,1982,0.488,716,1339,0.535,251,...,1276,512,246,174,386,551,2684,78.9,1211.0,2009.0
4,louisiana-state,35.0,7050.0,937,2093,0.448,724,1521,0.476,213,...,1365,529,267,212,425,609,2617,74.8,1261.0,2009.0


In [291]:
# Check whether each (Team_ID, Date) pair occurs at most once in old_data.
old_data.set_index(['Team_ID','Date']).index.is_unique

True

In [264]:
# Load the game results from the collected_data folder and list their columns.
old_game = pd.read_csv("collected_data/ncaa_short.csv")
old_game.columns

Index(['Unnamed: 0', 'Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID',
       'LScore', 'WLoc', 'NumOT'],
      dtype='object')

In [293]:
# Column pairs that identify the winning and the losing side of each game.
wcols = ["WTeamID", "Season"]
lcols = ["LTeamID", "Season"]

# Give both sides the same (Team_ID, Date) labels so they can be stacked.
w = old_game[wcols].rename(columns={"WTeamID": "Team_ID", "Season": "Date"})
l = old_game[lcols].rename(columns={"LTeamID": "Team_ID", "Season": "Date"})

# One row per team appearance: winners first, then losers (original row
# labels are kept, so index values repeat).
# NOTE(review): this rebinds `teams`, replacing the team key table that was
# loaded from distinctteamkey.csv earlier in the notebook.
teams = pd.concat([w, l])

In [296]:
# Count the rows for each (Team_ID, Date) pair in the stacked frame; the count column is labelled 0.
teams.groupby(['Team_ID','Date']).size().reset_index()

Unnamed: 0,Team_ID,Date,0
0,1103,2009,1
1,1103,2011,1
2,1103,2013,1
3,1104,2012,1
4,1104,2018,2
5,1106,2009,1
6,1106,2011,1
7,1107,2013,1
8,1107,2014,2
9,1107,2015,1
