# Start working with old Data Here

In [266]:
import pandas as pd
season_data = pd.read_csv("collected_data/season_data.csv")
more_team_data = pd.read_csv("collected_data/more_team_data.csv")
ncaa_short = pd.read_csv("collected_data/ncaa_short.csv")

We would like to join the two dataframes on 'Team' and 'Date' - this is sr-name and year of season

In [267]:
# Prepare the dataframes for the merge
more_team_data = more_team_data.rename( columns = { "Season": "Date" , "team": 'Team' })
more_team_data['Date'] = more_team_data['Date'].map(lambda x: int(x[:2] + x[5:7]) )
season_data["Date"] = season_data["Date"].map(lambda x: int(x))

Merge these dataframes on there sr names and the date.

In [268]:
season_stats = pd.merge(season_data, more_team_data, on=[ 'Team' , 'Date' ] , how = 'left')

In [269]:
season_stats = season_stats.rename( columns = {'Team_ID': 'school_id'})


Add a team_id column

In [270]:
initial_value = 100001
season_stats['team_id'] = range(initial_value, len(season_stats) +initial_value)


In [271]:
teams = season_stats[["team_id" , 'school_id' , "Date"]].copy()

In [272]:
schools = season_stats[['school_id' , 'Team'  ]]

In [273]:
schools = schools.rename(columns = {'Team': "sports_reference_name"})

In [274]:
# Get the winner TeamID
ncaa_games  = pd.merge(ncaa_short, teams, left_on=[ 'WTeamID' , 'Season' ] , right_on=[ "school_id", "Date" ] ,how = 'left')
ncaa_games = ncaa_games.drop(['WTeamID' , 'school_id' , "WLoc" , "NumOT"] , axis = 1)
ncaa_games = ncaa_games.rename(columns = {'team_id': "team_1_id"})


In [275]:
# Get the Loser Team ID
ncaa_games  = pd.merge(ncaa_games, teams, left_on=[ 'LTeamID' , 'Season' ] , right_on=[ "school_id", "Date" ] ,how = 'left')
ncaa_games = ncaa_games.drop(['LTeamID' , 'school_id' , "Date_x" , "Date_y" , "Season" , "Unnamed: 0" , "DayNum"] , axis = 1)
ncaa_games = ncaa_games.rename(columns = {'team_id': "team_2_id" , "WScore": "team_1_score" , "LScore": "team_2_score"})


In [276]:
initial_value = 1
ncaa_games['game_id'] = range(initial_value, len(ncaa_games) +initial_value)


In [277]:
season_stats.columns

Index(['Team', 'G', 'MP', 'FG', 'FGA', 'FG%', '2P', '2PA', '2P%', '3P', '3PA',
       '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK',
       'TOV', 'PF', 'PTS_x', 'PTS/G', 'school_id', 'Date', 'Rk', 'Conf', 'W',
       'L', 'W-L%', 'W.1', 'L.1', 'W-L%.1', 'SRS', 'SOS', 'PTS_y', 'PTS.1',
       'AP Pre', 'AP High', 'AP Final', 'NCAA Tournament', 'Seed', 'Coach(es)',
       'Unnamed: 19', 'team_id'],
      dtype='object')

In [278]:
# Lets only keep the numerical columns we care about
cols_keep = ['team_id', 'FG' , 'FGA', 'FG%', '2P' , '2PA', '2P%', '3P' , '3PA', '3P%', 'FT' , 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB' , 'AST', 'STL', 'BLK', 'TOV', 'PF',  'PTS/G','W-L%', 'SRS', 'SOS','PTS.1','Seed' ]
season_stats = season_stats[cols_keep]


season_stats = season_stats.rename(columns = {"PS.1": "opp_PPG"})


# At This point, We should Have The Schema

### Let's first see about null values


In [279]:
def check_null(df):
    """
    This function takes in a df as an argument, and returns the null values by column
    """
    null_columns=df.columns[df.isnull().any()]
    return df[null_columns].isnull().sum()
    

with the above function, we find that our data has no null values!

In [280]:
check_null(ncaa_games)

Series([], dtype: float64)

In [281]:
check_null(season_stats)

Series([], dtype: float64)

In [282]:
check_null(teams)

Series([], dtype: float64)

In [283]:
check_null(schools)

Series([], dtype: float64)

#### Shuffle team1 and team2 in the ncaa_games df -> eliminate selection bias

In [284]:
def shuffle(row):
    """
    This function will randomly shuffle which team 
    is team 1.  Origionally, the winner is always team 1
    """
    from random import choice
    val = choice([0 , 1])
    if val ==0:
        # everything stays the same
        pass
    else:
        # switch the teams
        ph1 = row["team_1_id"]
        ph2 = row["team_1_score"]
        row["team_1_score"] = row["team_2_score"] 
        row["team_1_id"] = row["team_2_id"]
        row["team_2_id"] = ph1
        row["team_2_score"] = ph2
        return row


In [286]:
ncaa_games.apply(shuffle , axis = 1 )

0                                                   None
1                                                   None
2      [4487424360, 4487424360, 4487424360, 448742436...
3                                                   None
4                                                   None
5      [4487424360, 4487424360, 4487424360, 448742436...
6                                                   None
7                                                   None
8                                                   None
9                                                   None
10     [4487424360, 4487424360, 4487424360, 448742436...
11                                                  None
12                                                  None
13                                                  None
14     [4487424360, 4487424360, 4487424360, 448742436...
15                                                  None
16     [4487424360, 4487424360, 4487424360, 448742436...
17                             

In [290]:
import os

# make folder
if not os.path.exists('./clean_data'):
    os.mkdir('./clean_data')

# write our df's
season_stats.to_csv('clean_data/season_stats.csv' , index = False)
teams.to_csv('clean_data/teams.csv' , index = False)
schools.to_csv('clean_data/schools.csv' , index = False)
ncaa_games.to_csv('clean_data/ncaa_games.csv' , index = False)
