# Goal:
This notebook is used to create a predictive model to see what college basketball player could be be drafted into the nba. The model will look at highschool rankings and college basketball stats to predict if this player could be an NBA player. We use this model to get a percentage that the player could get drafted and use that as a threshold for our second model. 

In [1]:
! pip install -e "git+https://github.com/perrygeo/jenks.git#egg=jenks"
import pandas as pd
import numpy as np
import getpass
import psycopg2
from sklearn.externals import joblib as jb
from jenks import jenks
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

Obtaining jenks from git+https://github.com/perrygeo/jenks.git#egg=jenks
  Updating ./src/jenks clone
  Running command git fetch -q --tags
  Running command git reset --hard -q 80b2557d635ba4a0640157d3616b50dbf39c2221
Installing collected packages: jenks
  Found existing installation: jenks 1.0
    Uninstalling jenks-1.0:
      Successfully uninstalled jenks-1.0
  Running setup.py develop for jenks
Successfully installed jenks




Connect to our database

In [2]:
mypasswd = getpass.getpass()
conn = psycopg2.connect(database = 'cs20_group4',
                              user = 'fhfrf', #replace with pawprint
                              host = 'pgsql.dsa.lan',
                              password = mypasswd)
cursor = conn.cursor()

········


Query our database for the NCAA data

In [3]:
historical_data = pd.read_sql_query("""
DROP TABLE IF EXISTS college;

select 
base.*,
most_minutes.assist_percentage,
most_minutes.turnover_percentage,
most_minutes.usage_percentage,
position.position,
team.team_abbreviation,
conference.conference
INTO TEMP TABLE college
from
--Base query where all simple aggregations are performed, usually sum or max. For some percentage fields,
    --the field are here recalculated according to the definitions in the SportsReference package documentation.
    (select 
    sum(assists) as assists,
    sum(blocks) as blocks,
    max(avg_conf_rank) as conference_rank,
    sum(defensive_rebounds) as defensive_rebounds,
    sum(defensive_win_shares) as defensive_win_shares,
    sum(field_goal_attempts) as field_goal_attempts,
    --field_goal_percentage,
    case when sum(field_goal_attempts) > 0 then
    sum(field_goals)/sum(field_goal_attempts)
    else null end as field_goal_percentage,
    sum(field_goals) as field_goals,
    --free_throw_attempt_rate,
    case when sum(field_goal_attempts) > 0 then
    sum(free_throw_attempts)/sum(field_goal_attempts) 
    else null end as free_throw_attempt_rate,
    sum(free_throw_attempts) as free_throw_attempts,
    --free_throw_percentage,
    case when sum(free_throw_attempts) > 0 then
    sum(free_throws)/sum(free_throw_attempts) 
    else null end as free_throw_percentage,
    sum(free_throws) as free_throws,
    sum(games_played) as games_played,
    sum(games_started) as games_started,
    max(height) as height,
    sum(minutes_played) as minutes_played,
    sum(offensive_rebounds) as offensive_rebounds,
    sum(offensive_win_shares) as offensive_win_shares,
    sum(personal_fouls) as personal_fouls,
    player_id,
    sum(points) as points,
    sum(steals) as steals,
    --three_point_attempt_rate,
    case when sum(field_goal_attempts) > 0 then
    sum(three_point_attempts)/sum(field_goal_attempts) 
    else null end as three_point_attempt_rate,
    sum(three_point_attempts) as three_point_attempts,
    --three_point_percentage,
    case when sum(three_point_attempts) > 0 then
    sum(three_pointers)/sum(three_point_attempts) 
    else null end as three_point_percentage,
    sum(three_pointers) as three_pointers,
    sum(total_rebounds) as total_rebounds,
    sum(turnovers) as turnovers,
    sum(two_point_attempts) as two_point_attempts,
    --two_point_percentage,
    case when sum(two_point_attempts) > 0 then
    sum(two_pointers)/sum(two_point_attempts) 
    else null end as two_point_percentage,
    sum(two_pointers) as two_pointers,
    max(weight) as weight,
    sum(win_shares) as win_shares,
    max(player_season_number) as seasons_played
    from 
    ncaa_player_career_stats2
    where
    season >= '2001'
    group by
    player_id) base
--This outer join adds columns for percentages that could not be recaclulated.
    --Instead, this query takes the value from each of these fields during the season in which a player logged the most minutes of playing time.
left outer join
    (select distinct
    sub1.player_id,
    avg(sub2.assist_percentage) as assist_percentage,
    avg(sub2.turnover_percentage) as turnover_percentage,
    avg(sub2.usage_percentage) as usage_percentage,
    avg(sub2.win_shares_per_40_minutes) as win_shares_per_40_minutes,
    avg(sub2.block_percentage) as block_percentage 
            --In tandem, these two queries filter the dataframe to just the seasons where each player played the most minutes.
    from
    (select
    player_id,
    max(minutes_played) as minutes_played
    from 
    ncaa_player_career_stats2
    where
    season >= '2001'
    group by player_id)sub1
    left outer join
    (select
    player_id,
    minutes_played,
    assist_percentage,
    turnover_percentage,
    usage_percentage,
    win_shares_per_40_minutes,
    block_percentage
    from 
    ncaa_player_career_stats2)sub2
    on sub1.player_id = sub2.player_id and sub1.minutes_played = sub2.minutes_played
    group by
    sub1.player_id) most_minutes
    on base.player_id = most_minutes.player_id
--Each of the following outer joins brings in a column that is non-numeric.  The mode is used for some of these columns, like school, so if a player transfered, we treat the school he played the most seasons for as his school.  In the case of a tie, a value is arbatrarily chosen.
left outer join
    (select player_id, mode() within group(order by conference) as conference
    from
    ncaa_player_career_stats2
    group by
    player_id)conference
    on conference.player_id = base.player_id
    left outer join
    (select player_id, mode() within group(order by team_abbreviation) as team_abbreviation
    from
    ncaa_player_career_stats2
    group by
    player_id)team
    on team.player_id = base.player_id
    left outer join
    (select player_id, mode() within group(order by position) as position
    from
    ncaa_player_career_stats2
    group by
    player_id)position
    on position.player_id = base.player_id;
    
SELECT
    p.ncaa_id
    , p.nba_id
    , p.player_name
    , p.height
    , p.weight
    , p.high_school_national_rank
    , p.high_school_recruit_score
    , p.draft_rank
    , p.school_or_team
    , p.highest_level
    , COALESCE(p.nba_draft_year, p.final_college_season, p.high_school_class) - COALESCE(p.high_school_class, p.first_college_season) + 18 as age
    , c.conference
    , c.assists
    , c.blocks
    , c.defensive_rebounds
    , c.field_goal_attempts
    , c.field_goals
    , c.field_goal_percentage
    , c.free_throw_attempt_rate
    , c.free_throw_attempts
    , c.free_throw_percentage
    , c.games_played
    , c.games_started
    , c.minutes_played
    , c.offensive_rebounds
    , c.personal_fouls
    , c.points
    , c.steals
    , c.three_point_attempt_rate
    , c.three_point_attempts
    , c.three_point_percentage
    , c.three_pointers
    , c.total_rebounds
    , c.turnovers
    , c.two_point_attempts
    , c.two_point_percentage
    , c.two_pointers
    , c.seasons_played
    , c.assist_percentage
    , c.turnover_percentage
    , c.usage_percentage
    , c.position
    , c.defensive_win_shares
    , c.offensive_win_shares
    , r.role as nba_career_quality
    , cl.offensive_role as nba_offensive_role
    , cl.defensive_role as nba_defensive_role
    , CASE
        WHEN cr.player_id IS NOT NULL
            THEN 1
            ELSE 0
        END as currently_in_school
FROM college c
JOIN player_master_table p ON c.player_id = p.ncaa_id
LEFT JOIN nba_player_roles r ON p.nba_id = r.player_id
LEFT JOIN nba_player_clusters cl ON p.nba_id = cl.player_id
LEFT JOIN ncaa_current_rosters cr ON p.ncaa_id = cr.player_id
WHERE c.minutes_played IS NOT NULL
    AND p.final_college_season >= 2009
""", con = conn)
historical_data = historical_data.drop(columns = ['currently_in_school'])
print(historical_data.shape)
historical_data.head()

(22234, 47)


Unnamed: 0,ncaa_id,nba_id,player_name,height,weight,high_school_national_rank,high_school_recruit_score,draft_rank,school_or_team,highest_level,age,conference,assists,blocks,defensive_rebounds,field_goal_attempts,field_goals,field_goal_percentage,free_throw_attempt_rate,free_throw_attempts,free_throw_percentage,games_played,games_started,minutes_played,offensive_rebounds,personal_fouls,points,steals,three_point_attempt_rate,three_point_attempts,three_point_percentage,three_pointers,total_rebounds,turnovers,two_point_attempts,two_point_percentage,two_pointers,seasons_played,assist_percentage,turnover_percentage,usage_percentage,position,defensive_win_shares,offensive_win_shares,nba_career_quality,nba_offensive_role,nba_defensive_role
0,aakim-saintil-1,,Aakim Saintil,72.0,165.0,,,,LIU-Brooklyn,NCAA,18.0,northeast,146.0,1.0,57.0,304.0,112.0,0.368421,0.546053,166.0,0.807229,31.0,25.0,906.0,12.0,104.0,392.0,46.0,0.427632,130.0,0.261538,34.0,69.0,90.0,174.0,0.448276,78.0,1.0,0.307,0.19,0.245,Guard,0.9,1.2,,,
1,aakim-saintill-1,,Aakim Saintil,72.0,165.0,284.0,0.8333,,South Alabama,NCAA,19.0,sun-belt,57.0,2.0,46.0,167.0,55.0,0.329341,0.526946,88.0,0.738636,31.0,10.0,705.0,16.0,102.0,194.0,32.0,0.419162,70.0,0.271429,19.0,62.0,45.0,97.0,0.371134,36.0,1.0,0.165,0.177,0.183,Guard,0.8,0.2,,,
2,aalim-moor-1,,Aalim Moor,75.0,194.0,535.0,0.8181,,San Jose State,NCAA,22.0,wac,29.0,3.0,22.0,73.0,32.0,0.438356,0.041096,3.0,0.333333,65.0,2.0,359.0,3.0,40.0,75.0,10.0,0.328767,24.0,0.416667,10.0,25.0,21.0,49.0,0.44898,22.0,4.0,0.216,0.259,0.138,Guard,0.1,0.1,,,
3,aamahd-walker-1,,Aamahd Walker,74.0,190.0,,,,UCSB,NCAA,19.0,big-west,2.0,1.0,8.0,21.0,8.0,0.380952,0.428571,9.0,0.555556,17.0,0.0,73.0,4.0,7.0,22.0,5.0,0.238095,5.0,0.2,1.0,12.0,5.0,16.0,0.4375,7.0,2.0,0.063,0.21,0.207,Guard,0.1,0.0,,,
4,aamahne-santos-1,,Aamahne Santos,70.0,165.0,,,,Jacksonville,NCAA,18.0,atlantic-sun,78.0,2.0,47.0,202.0,73.0,0.361386,0.143564,29.0,0.965517,32.0,31.0,915.0,10.0,43.0,217.0,20.0,0.628713,127.0,0.338583,43.0,57.0,49.0,75.0,0.4,30.0,1.0,0.139,0.185,0.138,Guard,0.6,0.5,,,


Drop duplicate player entries by selecting the entry that has the highest high school national rank. Players that transfer to other schools will typically have a NA for their high recruiting data columns.

In [4]:
historical_data = historical_data.sort_values(by = ['high_school_national_rank']).drop_duplicates(subset = 'ncaa_id').reset_index(drop = True)

Query our current active NCAA players

In [5]:
active_ncaa_master = pd.read_sql_query("""
    SELECT act.*, prop.rank, com.name, com.national_rank, com.recruit_score 
    FROM NCAA_active_players_career_stats as act
    LEFT JOIN nba_top100_prospect_rankings as prop ON act.player_name = prop.player
    LEFT JOIN hs_col_commit_247sports as com ON com.name = act.player_name
""", con = conn)
active_ncaa_master.shape

(11117, 58)

Cast the numerical data columns from object to float so they will appear in our selection when we need to filter for numerical columns only.

In [6]:
heights=[]
for index, row in active_ncaa_master.iterrows():
    try:
        foot_inch_height = row['height'].split('-')
        foot_inch_height = list(map(int, foot_inch_height))
        inches_height = foot_inch_height[0]*12 + foot_inch_height[1]
        heights.append(inches_height)
    except:
        heights.append(np.nan)
active_ncaa_master['height'] = heights
active_ncaa_master['weight'] = active_ncaa_master['weight'].astype(float)
active_ncaa_master['field_goal_percentage'] = active_ncaa_master['field_goal_percentage'].astype(float)
active_ncaa_master['three_point_percentage'] = active_ncaa_master['three_point_percentage'].astype(float)
active_ncaa_master['free_throw_percentage'] = active_ncaa_master['free_throw_percentage'].astype(float)
active_ncaa_master['free_throw_percentage'] = active_ncaa_master['free_throw_percentage'].astype(float)
active_ncaa_master['usage_percentage'] = active_ncaa_master['usage_percentage'].astype(float)
active_ncaa_master['assist_percentage'] = active_ncaa_master['assist_percentage'].astype(float)
active_ncaa_master['turnover_percentage'] = active_ncaa_master['turnover_percentage'].astype(float)
active_ncaa_master['two_point_percentage'] = active_ncaa_master['two_point_percentage'].astype(float)
active_ncaa_master['three_point_attempt_rate'] = active_ncaa_master['three_point_attempt_rate'].astype(float)
active_ncaa_master['free_throw_attempt_rate'] = active_ncaa_master['free_throw_attempt_rate'].astype(float)
active_ncaa_master.rename(columns = {'player_id': 'ncaa_id', 'national_rank': 'high_school_national_rank', 'rank': 'draft_rank', 'recruit_score': 'high_school_recruit_score'}, inplace = True)

Generate a `made_nba` column by creating a binary variable that is true if a player made it to the NBA and false if they did not.

In [7]:
historical_data['made_nba'] = (historical_data.highest_level == 'NBA').astype(int)

Combine our historical and active player dataframes together so we can bin all the data against each other. 

In [8]:
frames = [historical_data, active_ncaa_master]
common_cols = list(set.intersection(*(set(df.columns) for df in frames)))
active_player_ids = active_ncaa_master.ncaa_id

In [9]:
data = pd.concat([df[common_cols] for df in frames], ignore_index=True)

Select and join NBA data into our NCAA dataset. For the historical NCAA players (players who are not active in the NCAA), some will have NBA data if they were drafted into the NBA. 

In [10]:
historical_data_nba_quality = historical_data[['ncaa_id', 'nba_career_quality', 'made_nba']]

In [11]:
data = data.merge(historical_data_nba_quality, how = 'left', on = 'ncaa_id')

In [12]:
data = data.sort_values(by = ['high_school_national_rank']).drop_duplicates(subset = 'ncaa_id').reset_index(drop = True)

We have some data that we need to manually adjust to ensure correctness.

In [13]:
data.loc[data.player_name == 'Cam Reddish', 'draft_rank'] = 16.0
data.loc[data.player_name == 'Byron Mullens', 'draft_rank'] = 9.0
data.loc[data.player_name == 'Jeff Ayres', 'draft_rank'] = 39.0
data.loc[data.player_name == 'Joe Young', 'draft_rank'] = 44.0
data.loc[data.player_name == 'Maurice Harkless', 'draft_rank'] = 14.0

Change the rows with Null values into zeros, or in the cases with ranks, make them outside the higher rank range.

In [14]:
data.loc[data.field_goal_percentage.isnull(), 'field_goal_percentage'] = 0
data.loc[data.free_throw_attempt_rate.isnull(), 'free_throw_attempt_rate'] = 0
data.loc[data.free_throw_percentage.isnull(), 'free_throw_percentage'] = 0
data.loc[data.three_point_attempt_rate.isnull(), 'three_point_attempt_rate'] = 0
data.loc[data.three_point_percentage.isnull(), 'three_point_percentage'] = 0
data.loc[data.two_point_percentage.isnull(), 'two_point_percentage'] = 0
data.loc[data.assist_percentage.isnull(), 'assist_percentage'] = 0
data.loc[data.turnover_percentage.isnull(), 'turnover_percentage'] = 0
data.loc[data.usage_percentage.isnull(), 'usage_percentage'] = 0
data.loc[data.high_school_national_rank.isnull(), 'high_school_national_rank'] = 1001
data.loc[data.high_school_recruit_score.isnull(), 'high_school_recruit_score'] = 0
data.loc[data.draft_rank.isnull(), 'draft_rank'] = 101

Now that we have the data in a dataframe we can start creating a few more important columns. The features we put into the model are important to not over value small sample sizes, so we create columns with the counting stats in a per minute form. This will normalize some of the effect of the small sample sizes. Then we put all our features into a list to call easily later. 

In [15]:
data['height_to_weight'] = data.height / data.weight
data['weight_to_height'] = data.weight / data.height
data['recruit_draft_rank_mean'] = data[['high_school_national_rank', 'draft_rank']].mean(axis = 1)
data['draft_recruit_ratio'] = data.draft_rank.astype(int) / data.high_school_national_rank.astype(int)
data['assist_to_turnover_ratio'] = data.assists / data.turnovers
data['assists_per_minute'] = data.assists / data.minutes_played
data['blocks_per_minute'] = data['blocks'] / data.minutes_played
data['drb_per_minute'] = data.defensive_rebounds / data.minutes_played
data['fga_per_minute'] = data.field_goal_attempts / data.minutes_played
data['fgm_per_minute'] = data.field_goals / data.minutes_played
data['start_rate'] = data.games_started / data.games_played
data['minutes_per_game'] = data.minutes_played / data.games_played
data['orb_per_minute'] = data.offensive_rebounds / data.minutes_played
data['drb_orb_ratio'] = data.defensive_rebounds / data.offensive_rebounds
data['fouls_per_minute'] = data.personal_fouls / data.minutes_played
data['points_per_minute'] = data.points / data.minutes_played
data['steals_per_minute'] = data.steals / data.minutes_played
data['stocks'] = data.steals + data['blocks']
data['stocks_per_minute'] = data.stocks / data.minutes_played
data['three_attempts_per_minute'] = data.three_point_attempts / data.minutes_played
data['threes_made_per_minute'] = data.three_pointers / data.minutes_played
data['three_to_two_attempt_ratio'] = data.three_point_attempts / data.two_point_attempts
data['three_to_two_make_ratio'] = data.three_pointers / data.two_pointers
data['points_rebounds_ratio'] = data.points / data.total_rebounds
data['points_assists_ratio'] = data.points / data.assists
data['assists_rebounds_ratio'] = data.assists / data.total_rebounds
data['points_rebounds_assists'] = data[['points', 'total_rebounds', 'assists']].sum(axis = 1)
data['points_rebounds_assists_per_minute'] = data.points_rebounds_assists / data.minutes_played
data['win_share_diff_off'] = data.offensive_win_shares - data.defensive_win_shares
data['win_share_diff_def'] = data.defensive_win_shares - data.offensive_win_shares
data['off_ws_per_minute'] = data.offensive_win_shares / data.minutes_played
data['def_ws_per_minute'] = data.defensive_win_shares / data.minutes_played
data['total_win_shares'] = data.offensive_win_shares + data.defensive_win_shares
data['win_shares_per_minute'] = data.total_win_shares / data.minutes_played

We only want to feed in numerical columns into our binning so let's only select numerical columns.

In [16]:
features = data.select_dtypes(['int', 'float']).columns.to_list()
features

['field_goals',
 'three_point_attempt_rate',
 'defensive_win_shares',
 'blocks',
 'games_played',
 'turnovers',
 'turnover_percentage',
 'total_rebounds',
 'three_pointers',
 'games_started',
 'three_point_percentage',
 'offensive_win_shares',
 'weight',
 'free_throw_percentage',
 'two_pointers',
 'high_school_national_rank',
 'defensive_rebounds',
 'free_throw_attempts',
 'field_goal_percentage',
 'two_point_attempts',
 'field_goal_attempts',
 'two_point_percentage',
 'steals',
 'free_throw_attempt_rate',
 'points',
 'usage_percentage',
 'three_point_attempts',
 'offensive_rebounds',
 'high_school_recruit_score',
 'draft_rank',
 'assists',
 'personal_fouls',
 'assist_percentage',
 'minutes_played',
 'height',
 'made_nba',
 'height_to_weight',
 'weight_to_height',
 'recruit_draft_rank_mean',
 'draft_recruit_ratio',
 'assist_to_turnover_ratio',
 'assists_per_minute',
 'blocks_per_minute',
 'drb_per_minute',
 'fga_per_minute',
 'fgm_per_minute',
 'start_rate',
 'minutes_per_game',
 'orb_

Fill in 0s for null features. Examples of these features could be shooting percentages where a player will get a null value if they did not attempt a three pointer.

In [17]:
null_features = [x for x in data[features].columns.to_list() if data[x].isnull().sum() > 0]

In [18]:
for nf in null_features:
    data[nf].fillna(0, inplace = True)

In [19]:
inf_features = [x for x in data[features].columns.to_list() if (data[x] == np.inf).sum() > 0]

In [20]:
for i in inf_features:
    data.loc[data[i] == np.inf, i] = 1

The next step is to start binning our data. The goal for binning was to have some reason behind the size of our bins. This was accomplished by creating the functions below. The python package jenks bins the data in different sizes and then we found the variance fit for each of the sizes of bins and selected the bin that had a variance fit of 0.9. This was to not over fit our binning. We ran these functions across all of our features to select the correct bin size for all of them. 

After the binning was completed we created dummy columns for the bins to get every column to be a binary column. 

In [21]:
def goodness_of_variance_fit(array, classes):
    # get the break points
    classes = jenks(array, classes)

    # do the actual classification
    classified = np.searchsorted(classes, array, side = 'right')

    # max value of zones
    maxz = max(classified)

    # nested list of zone indices
    zone_indices = [[idx for idx, val in enumerate(classified) if zone + 1 == val] for zone in range(maxz)]

    # sum of squared deviations from array mean
    sdam = np.sum((array - array.mean()) ** 2)

    # sorted polygon stats
    array_sort = [np.array([array[index] for index in zone]) for zone in zone_indices]

    # sum of squared deviations of class means
    sdcm = sum([np.sum((classified - classified.mean()) ** 2) for classified in array_sort])

    # goodness of variance fit
    gvf = (sdam - sdcm) / sdam

    return gvf

In [22]:
def bin_selector(data, threshold = 0.9):
    bins = 1
    gvf = 0
    while gvf < threshold:
        gvf = goodness_of_variance_fit(data, bins + 1)
        bins += 1
    return bins - 1

In [23]:
def bin_assigner(data):
    bins = np.searchsorted(jenks(data, bin_selector(data)), data, side = 'right')
    return bins

Bin the data feature by feature so we can develop more deliberate patterns in the data to feed into our model. 

In [24]:
for f in features:
    print(f)
    if f == 'draft_rank':
        data['{}_bin'.format(f)] = None
        data.loc[data[f] <= 100, '{}_bin'.format(f)] = bin_assigner(np.array(data.loc[data[f] <= 100][f]))
        data.loc[data[f] > 100, '{}_bin'.format(f)] = data['{}_bin'.format(f)].max() + 1
    else:
        data['{}_bin'.format(f)] = bin_assigner(data[f])

field_goals
three_point_attempt_rate
defensive_win_shares
blocks
games_played
turnovers
turnover_percentage
total_rebounds
three_pointers
games_started
three_point_percentage
offensive_win_shares
weight
free_throw_percentage
two_pointers
high_school_national_rank
defensive_rebounds
free_throw_attempts
field_goal_percentage
two_point_attempts
field_goal_attempts
two_point_percentage
steals
free_throw_attempt_rate
points
usage_percentage
three_point_attempts
offensive_rebounds
high_school_recruit_score


  ret = ret.dtype.type(ret / rcount)


draft_rank
assists
personal_fouls
assist_percentage
minutes_played
height
made_nba
height_to_weight
weight_to_height
recruit_draft_rank_mean
draft_recruit_ratio
assist_to_turnover_ratio
assists_per_minute
blocks_per_minute
drb_per_minute
fga_per_minute
fgm_per_minute
start_rate
minutes_per_game
orb_per_minute
drb_orb_ratio
fouls_per_minute
points_per_minute
steals_per_minute
stocks
stocks_per_minute
three_attempts_per_minute
threes_made_per_minute
three_to_two_attempt_ratio
three_to_two_make_ratio
points_rebounds_ratio
points_assists_ratio
assists_rebounds_ratio
points_rebounds_assists
points_rebounds_assists_per_minute
win_share_diff_off
win_share_diff_def
off_ws_per_minute
def_ws_per_minute
total_win_shares
win_shares_per_minute


Grab binned columns so we can properly add categorical dummy variables to our data/

In [25]:
binned_cols = [x for x in data.columns.to_list() if '_bin' in x]

In [26]:
df = pd.get_dummies(data, columns = binned_cols)

In [27]:
new_features = [x for x in df.columns.to_list() if '_bin' in x] + features

Add in our new binned features to our dataframe

In [35]:
df = df[['ncaa_id', 'player_name', 'nba_career_quality', 'made_nba'] + new_features]

Split our dataframe back into historical and active NCAA player datasets based on ids found in the `active_player_ids` list.

In [36]:
active_df = df.loc[df['ncaa_id'].isin(list(active_player_ids))]
historical_df = df.loc[~df['ncaa_id'].isin(list(active_player_ids))]

In [37]:
active_df.to_csv('../Data/active_binned.csv', index = False)
historical_df.to_csv('../Data/historical_binned.csv', index = False)