## Is the high school recruiting rating a good indicator of a high-performing athlete?


In [1]:
# Uncomment to get the correct versions
# from IPython.display import clear_output
# !pip install -r requirements.txt
# clear_output() 


In [2]:
import numpy as np
import pandas as pd
import altair as alt
import cfbd
import warnings 

# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)
# NOTE(review): this suppresses ALL warnings for the rest of the notebook,
# including pandas warnings raised by the assignments in later cells.
warnings.filterwarnings('ignore')


In [3]:
# Load the three source tables in one pass: draft results, recruit ratings,
# and per-team season summaries (used further down for the team-success join).
draft, recruits, df_team_year = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/draft_2010_2024.csv',
        '../data/recruits_2008_2020.csv',
        '../data/team_season.csv',
    )
)

In [4]:
# Player names are not unique, so this name-only left join can fan out:
# about 2k of the 39k records end up duplicated because of shared names.
# Those duplicates are resolved in a later cell.
merged_df = recruits.merge(
    draft,
    how='left',
    left_on='name',
    right_on='Player',
)

In [5]:
# Columns kept after the join: the recruit-side attributes first,
# then the draft-side fields that came in from the draft table.
col = [
    'name', 'rating', 'ranking', 'recruit_type', 'year', 'position',
    'height', 'weight', 'latitude', 'longitude', 'state_province', 'stars',
    'committed_to', 'athlete_id',
    'Rnd', 'Pick', 'Player', 'draft_year', 'College/Univ',
]

merged_df = merged_df[col]
merged_df.head()

Unnamed: 0,name,rating,ranking,recruit_type,year,position,height,weight,latitude,longitude,state_province,stars,committed_to,athlete_id,Rnd,Pick,Player,draft_year,College/Univ
0,Da'Quan Bowers,0.9997,1.0,HighSchool,2008,SDE,76.0,265.0,33.297101,-81.03482,SC,5,Clemson,,2.0,51.0,Da'Quan Bowers,2011.0,Clemson
1,Terrelle Pryor,0.9996,2.0,HighSchool,2008,DUAL,78.0,233.0,40.328125,-79.61532,PA,5,Ohio State,,,,,,
2,Julio Jones,0.9992,3.0,HighSchool,2008,WR,76.0,220.0,30.406587,-87.683597,AL,5,Alabama,379062.0,1.0,6.0,Julio Jones,2011.0,Alabama
3,Darrell Scott,0.998,4.0,HighSchool,2008,RB,72.0,204.0,34.343509,-119.295604,CA,5,Colorado,379060.0,,,,,
4,Will Hill,0.9979,5.0,HighSchool,2008,S,73.0,207.0,40.728158,-74.077642,NJ,5,Florida,381281.0,,,,,


In [6]:
# (rows, columns) after the name-only left join and the column selection above.
merged_df.shape

(44595, 19)

In [7]:
# Helper used to score whether the school a recruit committed to looks like
# the school he was drafted from (1 = looks like the same school, 0 = not).
def has_common_sequence(str1, str2, min_seq_length=4):
    """
    Return 1 if `str1` and `str2` share a run of consecutive characters, else 0.

    Two strings match when some substring of `str1` that is `min_seq_length`
    characters long also appears in `str2` (case-sensitive). If either string
    is shorter than `min_seq_length`, the required run is capped at the shorter
    string's length, which means the shorter string must appear in full inside
    the other one. Without that cap, short school names such as "LSU" or "BYU"
    could never match, not even against themselves.

    Parameters
    ----------
    str1 : str or missing value
        First string (in this notebook: the recruit's `committed_to` school).
    str2 : str or missing value
        Second string (in this notebook: the draft table's `College/Univ`).
    min_seq_length : int, default 4
        Number of consecutive characters the two strings must have in common.

    Returns
    -------
    int
        1 when a common run is found; 0 otherwise, including when either
        argument is missing (None / NaN) or empty.

    Why this exists
    ---------------
    By joining from recruit data to draft data on the player's name, we create
    a many-to-many relationship. This occurs because some (but very few)
    recruits appear twice in the dataset (ex - Ron Smith), and football players
    sometimes have the same name (ex - David Long).

    The notebook uses this function's result to rank the duplicated rows so
    that a row whose committed school matches the drafted-from school is
    preferred, and then keeps only the top-ranked row per recruit. This relies
    on the assumption that no two players at one school had the same name.

    More information can be found here in our milestone I project on slide 5:
    https://docs.google.com/presentation/d/1_CfHYqeOniPscvbb8VfQqQUgyf4xCSeC1spL_9M0ejw/edit#slide=id.g2b8248144f9_0_127
    """
    # A missing school on either side can never count as a match.
    if pd.isna(str1) or pd.isna(str2):
        return 0

    # Empty strings carry no information; treat them as "no match".
    if not str1 or not str2:
        return 0

    # Cap the window so names shorter than min_seq_length can still match.
    window = min(min_seq_length, len(str1), len(str2))

    # Slide a fixed-size window over str1 and look each slice up in str2.
    for start in range(len(str1) - window + 1):
        if str1[start:start + window] in str2:
            return 1
    return 0

# Flag each row: 1 when the committed school and the drafted-from school
# share a character run, 0 otherwise.
def _school_match(row):
    return has_common_sequence(row['committed_to'], row['College/Univ'])

merged_df['CommonSequence'] = merged_df.apply(_school_match, axis=1)

# Rank the rows inside each (name, rating) group, with matching-school rows
# sorted ahead of non-matching ones, so rank 1 is the row to keep.
dedup_order = merged_df.sort_values(
    by=['name', 'committed_to', 'CommonSequence'],
    ascending=[True, True, False],
)
merged_df['RN'] = dedup_order.groupby(['name', 'rating']).cumcount() + 1

#col = ['name', 'rating', 'stars', 'committed_to', 'athlete_id', 'Rnd', 'Pick', 'Player', 'draft_year', 'College/Univ', 'CommonSequence', 'RN']
#merged_df = merged_df[col]

In [8]:
# Examples of de-duplication logic in action

# merged_df[merged_df['name'] == 'Ron Smith']
# merged_df[merged_df['name'] == 'David Long']

In [9]:
# Remove duplicates: keep only the top-ranked row of each group.
# .copy() makes the filtered result an independent frame, so the column
# assignment below is not an assignment onto a slice of the previous frame
# (the pattern pandas flags with SettingWithCopyWarning).
merged_df = merged_df[merged_df['RN'] == 1].copy()

# Target label: 1.0 when a draft round is present for the row, 0.0 otherwise.
merged_df['is_drafted'] = np.where(merged_df['Rnd'].isna(), 0.0, 1.0)

In [10]:
# School-level features: conference plus the campus coordinates.
team_cols = ['school', 'conference', 'latitude_school', 'longitude_school']

df_teams = pd.read_csv('../data/teams.csv')[team_cols]
df_teams.head()

Unnamed: 0,school,conference,latitude_school,longitude_school
0,Air Force,Mountain West,38.99697,-104.843616
1,Akron,Mid-American,41.072553,-81.508341
2,Alabama,SEC,33.208275,-87.550384
3,App State,Sun Belt,36.211427,-81.685428
4,Arizona,Big 12,32.228805,-110.948868


In [11]:
# Attach the school features to the recruit-draft rows.
# NOTE: this is deliberately an inner join, so only players who committed to
# a school are kept, which we believe is the correct assumption. A recruit
# whose `committed_to` value has no matching `school` row is dropped as well.
merged_df = merged_df.merge(
    df_teams,
    how='inner',
    left_on='committed_to',
    right_on='school',
)

In [12]:
# Drop unneeded columns: player identifiers, raw draft fields (already
# summarised by `is_drafted`), the de-duplication helpers, and the join key
# `school`, which duplicates `committed_to`.
drop_cols = ['name', 'recruit_type', 'athlete_id', 'Rnd', 'Pick', 'Player'
             , 'draft_year', 'College/Univ', 'CommonSequence', 'RN', 'school']

merged_df = merged_df.drop(columns = drop_cols)

# Fixed seed so the preview shows the same rows on every Restart & Run All.
merged_df.sample(5, random_state = 42)

Unnamed: 0,rating,ranking,year,position,height,weight,latitude,longitude,state_province,stars,committed_to,is_drafted,conference,latitude_school,longitude_school
3225,0.9764,45.0,2013,RB,71.0,196.0,26.461462,-80.07282,FL,4,Notre Dame,0.0,FBS Independents,41.698378,-86.233942
6055,0.8575,671.0,2013,TE,74.0,235.0,42.331551,-83.04664,MI,3,Michigan,0.0,Big Ten,42.265836,-83.748696
468,0.8933,322.0,2016,OLB,74.0,225.0,41.70033,-88.071771,IL,4,Ohio State,0.0,Big Ten,40.001645,-83.019727
12656,0.928,165.0,2010,SDE,78.0,250.0,40.233844,-111.658534,UT,4,BYU,1.0,Big 12,40.257535,-111.654525
19772,0.8021,1632.0,2009,DT,72.0,310.0,34.502303,-97.957813,OK,3,Tulsa,0.0,American Athletic,36.148918,-95.943785


In [13]:
# Report (rows, columns) after the team join and column drop.
print(f'merged_df: {merged_df.shape}')

merged_df: (33203, 15)


In [14]:
# Team-season table carrying each team's 2-year rolling success metrics.
team_year_cols = [
    'school',
    'season',
    'wins_rolling_2year',
    'games_played_rolling_2year',
    'post_season_wins_rolling_2year',
    'point_diff_rolling_2year',
    'win_pct_rolling_2year',
]

# One of the ~2,000 records has a null season (cause unknown); it is filled
# with 0 so the column can be cast to int.
df_team_year = df_team_year.assign(
    season=df_team_year['season'].fillna(0).astype(int)
)[team_year_cols]

# Spot-check a single school.
is_michigan = df_team_year['school'] == 'Michigan'
df_team_year[is_michigan]

Unnamed: 0,school,season,wins_rolling_2year,games_played_rolling_2year,post_season_wins_rolling_2year,point_diff_rolling_2year,win_pct_rolling_2year
906,Michigan,2010,,,,,0.538462
907,Michigan,2011,,,,,0.846154
908,Michigan,2012,18.0,26.0,1.0,175.0,0.615385
909,Michigan,2013,19.0,26.0,1.0,337.0,0.538462
910,Michigan,2014,15.0,26.0,0.0,200.0,0.416667
911,Michigan,2015,12.0,25.0,0.0,52.0,0.769231
912,Michigan,2016,15.0,25.0,1.0,177.0,0.769231
913,Michigan,2017,20.0,26.0,1.0,536.0,0.615385
914,Michigan,2018,18.0,26.0,0.0,424.0,0.769231
915,Michigan,2019,18.0,26.0,0.0,288.0,0.692308


In [15]:
# Attach each school's rolling metrics for the recruit's class year.
df_final = merged_df.merge(
    df_team_year,
    how='left',
    left_on=['committed_to', 'year'],
    right_on=['school', 'season'],
)

# Game data begins in 2010; with a 2-year lag, the earliest year that has
# rolling data is 2012, so earlier recruiting classes are excluded.
has_rolling_window = df_final['year'] >= 2012
df_final = df_final[has_rolling_window]
df_final.head()

Unnamed: 0,rating,ranking,year,position,height,weight,latitude,longitude,state_province,stars,committed_to,is_drafted,conference,latitude_school,longitude_school,school,season,wins_rolling_2year,games_played_rolling_2year,post_season_wins_rolling_2year,point_diff_rolling_2year,win_pct_rolling_2year
87,0.9644,71.0,2012,DT,76.0,290.0,35.334011,-81.865103,NC,4,Clemson,1.0,ACC,34.678774,-82.843243,Clemson,2012.0,16.0,27.0,0.0,128.0,0.846154
88,0.9632,79.0,2012,WR,71.0,175.0,35.2272,-80.843083,NC,4,Clemson,0.0,ACC,34.678774,-82.843243,Clemson,2012.0,16.0,27.0,0.0,128.0,0.846154
89,0.9567,92.0,2012,S,73.0,195.0,30.438083,-84.280933,FL,4,Clemson,0.0,ACC,34.678774,-82.843243,Clemson,2012.0,16.0,27.0,0.0,128.0,0.846154
90,0.9264,187.0,2012,DUAL,75.0,205.0,42.886717,-78.878392,NY,4,Clemson,1.0,ACC,34.678774,-82.843243,Clemson,2012.0,16.0,27.0,0.0,128.0,0.846154
91,0.9133,231.0,2012,ATH,73.0,180.0,35.842865,-90.703452,AR,4,Clemson,1.0,ACC,34.678774,-82.843243,Clemson,2012.0,16.0,27.0,0.0,128.0,0.846154


In [16]:
# Write df_final to disk; index = False leaves the DataFrame's row index out of the CSV.
df_final.to_csv('../data/M2_Final.csv', index = False)