## Is the high school recruiting rating a good indicator of a high-performing athlete?


In [22]:
# Uncomment to get the correct versions
from IPython.display import clear_output
!pip install -r ../requirements.txt
clear_output() 


In [23]:
import numpy as np
import pandas as pd
import altair as alt
import cfbd
import warnings 

# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None
# Silence all Python warnings for the rest of the session.
warnings.simplefilter('ignore')


In [24]:
# Load the three input tables: NFL draft results, recruiting records,
# and per-team season statistics.
draft, recruits, df_team_year = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/draft_2010_2024.csv',
        '../data/recruits_2008_2020.csv',
        '../data/team_season.csv',
    )
)

In [25]:
# Preview the first five recruiting records.
recruits.head(5)

Unnamed: 0,index,id,athlete_id,recruit_type,year,ranking,name,school,committed_to,position,height,weight,stars,rating,city,state_province,country,latitude,longitude
0,0,12630,,HighSchool,2008,1.0,Da'Quan Bowers,Bamberg Ehrhardt,Clemson,SDE,76.0,265.0,5,0.9997,Bamberg,SC,USA,33.297101,-81.03482
1,1,12631,,HighSchool,2008,2.0,Terrelle Pryor,Jeannette,Ohio State,DUAL,78.0,233.0,5,0.9996,Jeannette,PA,USA,40.328125,-79.61532
2,2,12632,379062.0,HighSchool,2008,3.0,Julio Jones,Foley,Alabama,WR,76.0,220.0,5,0.9992,Foley,AL,USA,30.406587,-87.683597
3,3,12634,379060.0,HighSchool,2008,4.0,Darrell Scott,St. Bonaventure,Colorado,RB,72.0,204.0,5,0.998,Ventura,CA,USA,34.343509,-119.295604
4,4,12633,381281.0,HighSchool,2008,5.0,Will Hill,St. Peters Prep,Florida,S,73.0,207.0,5,0.9979,Jersey City,NJ,USA,40.728158,-74.077642


In [26]:
# Preview the first five draft records.
draft.head(5)

Unnamed: 0,Rnd,Pick,Tm,Player,Pos,Age,To,AP1,PB,St,wAV,DrAV,G,Cmp,Att,Yds,TD,Int,Att.1,Yds.1,TD.1,Rec,Yds.2,TD.2,Solo,Int.1,Sk,College/Univ,Unnamed: 28_level_1,draft_year
0,1,1,STL,Sam Bradford,QB,22,2018,0,0,5,44,25,83,1855,2967,19449,103,61,146,340,2,1,5,0,,,,Oklahoma,College Stats,2010
1,1,2,DET,Ndamukong Suh,DT,23,2022,3,5,12,100,59,199,0,0,0,0,0,0,0,0,0,0,0,392.0,1.0,71.5,Nebraska,College Stats,2010
2,1,3,TAM,Gerald McCoy,DT,22,2021,1,6,10,69,65,140,0,0,0,0,0,0,0,0,0,0,0,235.0,,59.5,Oklahoma,College Stats,2010
3,1,4,WAS,Trent Williams,T,22,2024,3,11,13,99,51,181,0,0,0,0,0,0,0,0,0,0,0,1.0,,,Oklahoma,College Stats,2010
4,1,5,KAN,Eric Berry,DB,21,2018,3,5,5,50,50,89,0,0,0,0,0,0,0,0,0,0,0,377.0,14.0,5.5,Tennessee,College Stats,2010


In [27]:
# Attach draft results to every recruit by matching on the player's name only.
# Sometimes two players share a name, so one recruit can match several draft
# rows and the join creates duplicates (original estimate: about 2k of the 39k
# records are duplicated due to shared names). They are resolved further below.
merged_df = recruits.merge(draft, how='left', left_on='name', right_on='Player')

In [28]:
# Columns carried forward from the join; the list order is the resulting column order.
col = [
    # recruit identity and ranking
    'name', 'rating', 'ranking', 'recruit_type', 'year', 'position',
    # physical attributes, hometown, and star rating
    'height', 'weight', 'latitude', 'longitude', 'city', 'state_province', 'country', 'stars',
    # college commitment and draft outcome
    'committed_to', 'athlete_id', 'Rnd', 'Pick', 'Player', 'draft_year', 'College/Univ',
]

merged_df = merged_df[col]
merged_df.head()

Unnamed: 0,name,rating,ranking,recruit_type,year,position,height,weight,latitude,longitude,city,state_province,country,stars,committed_to,athlete_id,Rnd,Pick,Player,draft_year,College/Univ
0,Da'Quan Bowers,0.9997,1.0,HighSchool,2008,SDE,76.0,265.0,33.297101,-81.03482,Bamberg,SC,USA,5,Clemson,,2.0,51.0,Da'Quan Bowers,2011.0,Clemson
1,Terrelle Pryor,0.9996,2.0,HighSchool,2008,DUAL,78.0,233.0,40.328125,-79.61532,Jeannette,PA,USA,5,Ohio State,,,,,,
2,Julio Jones,0.9992,3.0,HighSchool,2008,WR,76.0,220.0,30.406587,-87.683597,Foley,AL,USA,5,Alabama,379062.0,1.0,6.0,Julio Jones,2011.0,Alabama
3,Darrell Scott,0.998,4.0,HighSchool,2008,RB,72.0,204.0,34.343509,-119.295604,Ventura,CA,USA,5,Colorado,379060.0,,,,,
4,Will Hill,0.9979,5.0,HighSchool,2008,S,73.0,207.0,40.728158,-74.077642,Jersey City,NJ,USA,5,Florida,381281.0,,,,,


In [29]:
# (rows, columns) of the joined table; rows include the duplicate name matches
# introduced by the name-only join above.
merged_df.shape

(44595, 21)

In [30]:
# Function to check if two school names share a run of consecutive characters
# (at least 4 by default).
# If the name of the school they committed to matches the name they were drafted from, then return a 1:
def has_common_sequence(str1, str2, min_seq_length=4):
    """
    Return 1 if ``str1`` and ``str2`` share a run of consecutive characters, else 0.

    A window of ``min_seq_length`` characters is slid over ``str1``; as soon as
    one window occurs anywhere in ``str2`` the function returns 1. The
    comparison is case-sensitive.

    When either string is shorter than ``min_seq_length`` the window shrinks to
    the length of the shorter string, so the shorter string must appear in full
    inside the longer one. Without this, short school names such as "LSU" or
    "TCU" could never match, not even against themselves.

    Parameters
    ----------
    str1, str2 : str or missing value
        The two names to compare. If either is missing (None / NaN) or empty,
        the result is 0.
    min_seq_length : int, default 4
        Number of consecutive characters that must be shared.

    Returns
    -------
    int
        1 when a common sequence is found, otherwise 0.

    Background
    ----------
    By joining from recruit data to draft data on the player's name, we create
    a many-to-many relationship. This occurs because some (but very few)
    recruits appear twice in the dataset (ex - Ron Smith), and football players
    sometimes have the same name (ex - David Long). The flag returned here is
    used to prefer the draft row whose college matches the school the recruit
    committed to when those duplicates are removed.

    More information can be found here in our milestone I project on slide 5:
    https://docs.google.com/presentation/d/1_CfHYqeOniPscvbb8VfQqQUgyf4xCSeC1spL_9M0ejw/edit#slide=id.g2b8248144f9_0_127
    """
    # Missing values (e.g. a recruit with no draft match) never count as a match.
    if pd.isna(str1) or pd.isna(str2):
        return 0

    # Empty strings share nothing; this also stops a zero-length window from
    # trivially matching.
    if not str1 or not str2:
        return 0

    # Shrink the window for names shorter than min_seq_length ("LSU", "USC", ...).
    window = min(min_seq_length, len(str1), len(str2))

    for start in range(len(str1) - window + 1):
        if str1[start:start + window] in str2:
            return 1
    return 0

# Apply the function to the DataFrame: CommonSequence is 1 when the school the
# recruit committed to and the college he was drafted from share a character
# run, else 0 (also 0 when there is no draft match, i.e. College/Univ is missing).
merged_df['CommonSequence'] = merged_df.apply(lambda row: has_common_sequence(row['committed_to'], row['College/Univ']), axis=1)

# RN numbers the rows inside each (name, rating) group starting at 1, after
# sorting by name, committed_to, and CommonSequence (descending). The numbering
# is computed on the sorted frame and assigned back to merged_df by index label.
# Only RN == 1 is kept later, so for a group with a single committed_to value
# the row whose college matches is preferred.
# NOTE(review): the sort uses committed_to but the groups use rating; if one
# (name, rating) group spans several committed_to values, the matching row is
# not guaranteed to be first — confirm this is intended.
merged_df['RN'] = merged_df.sort_values(['name', 'committed_to', 'CommonSequence'], ascending=[True, True,False]) \
                           .groupby(['name', 'rating']) \
                           .cumcount() + 1

#col = ['name', 'rating', 'stars', 'committed_to', 'athlete_id', 'Rnd', 'Pick', 'Player', 'draft_year', 'College/Univ', 'CommonSequence', 'RN']
#merged_df = merged_df[col]

In [31]:
# Examples of de-duplication logic in action

# merged_df[merged_df['name'] == 'Ron Smith']
# merged_df[merged_df['name'] == 'David Long']

In [32]:
# Remove duplicates: keep only the first-ranked row (RN == 1) of each group.
# .copy() makes the result an independent frame, so the column assignment below
# writes to merged_df itself rather than to a slice of the previous frame
# (the pattern pandas reports with SettingWithCopyWarning).
merged_df = merged_df.loc[merged_df['RN'] == 1].copy()

# A recruit counts as drafted when the left join found a draft row (Rnd is not null).
merged_df['is_drafted'] = np.where(merged_df['Rnd'].isna(), 0.0, 1.0)

In [33]:
# Include the supplemental draft.
# These players were not eligible for the regular draft but were still drafted
# through the supplemental draft, so they are marked as drafted by hand.
# Reference: https://en.wikipedia.org/wiki/List_of_NFL_supplemental_draft_picks
supplement_draft = ['Josh Brent', 'Harvey Unga', 'Josh Gordon','Terrelle Pryor', 'Isaiah Battle', 'Sam Beal', 'Adonis Alexander']

# Every recruit whose name is on the supplemental list.
is_supplemental_pick = merged_df['name'].isin(supplement_draft)

# There are two recruits named 'Jalen Thompson'; only the 2016 recruit is flagged.
is_jalen_thompson_2016 = (merged_df['name'] == 'Jalen Thompson') & (merged_df['year'] == 2016)

# Mark all of them as drafted in one assignment.
merged_df.loc[is_supplemental_pick | is_jalen_thompson_2016, 'is_drafted'] = 1.0

In [34]:
# Get features about the school: its conference and its latitude/longitude.
team_feature_cols = ['school', 'conference', 'latitude_school', 'longitude_school']
df_teams = pd.read_csv('../data/teams.csv')[team_feature_cols]
df_teams.head()

Unnamed: 0,school,conference,latitude_school,longitude_school
0,Air Force,Mountain West,38.99697,-104.843616
1,Akron,Mid-American,41.072553,-81.508341
2,Alabama,SEC,33.208275,-87.550384
3,App State,Sun Belt,36.211427,-81.685428
4,Arizona,Big 12,32.228805,-110.948868


In [35]:
# Add team features to our recruit-draft dataset.
# NOTE: this is an inner join, so only players whose committed_to value matches
# a school in df_teams are kept. We believe restricting the data to players who
# committed to a school is the correct assumption.
merged_df = merged_df.merge(df_teams, how='inner', left_on='committed_to', right_on='school')

In [36]:
# Drop unneeded columns: the draft-side join columns, the de-duplication
# helpers (CommonSequence, RN), and 'school', which duplicates committed_to
# after the inner join on those two columns.
drop_cols = ['recruit_type', 'Player', 'Rnd','College/Univ','school', 'Pick',  'draft_year',  'CommonSequence', 'RN']

merged_df = merged_df.drop(columns = drop_cols)

# Fixed random_state so the displayed sample is identical on every run
# (Restart Kernel -> Run All reproduces the same output).
merged_df.sample(5, random_state=42)

Unnamed: 0,name,rating,ranking,year,position,height,weight,latitude,longitude,city,state_province,country,stars,committed_to,athlete_id,is_drafted,conference,latitude_school,longitude_school
9849,Tyler Henington,0.8715,599.0,2012,DT,77.0,260.0,39.739236,-104.984862,Denver,CO,USA,3,Colorado,-1007815.0,0.0,Big 12,40.009475,-105.266905
15900,Ahki Muhammad,0.7984,1841.0,2014,CB,69.0,175.0,37.596323,-122.08163,Union City,CA,USA,3,Nevada,3124633.0,0.0,Mountain West,39.546946,-119.817543
15990,Darryl Randolph,0.7917,1960.0,2014,CB,71.0,178.0,30.421309,-87.216915,Pensacola,FL,USA,2,Middle Tennessee,3122888.0,0.0,Conference USA,35.851146,-86.368165
3910,Bryan Collins,0.7889,1784.0,2009,OT,75.0,255.0,32.929966,-97.227125,Keller,TX,USA,2,SMU,-1030806.0,0.0,ACC,32.837722,-96.782786
32146,Nathaniel James,0.8317,1699.0,2020,DT,72.0,260.0,39.762823,-86.399717,Avon,IN,USA,3,Washington State,4430012.0,0.0,Pac-12,46.731831,-117.160499


In [37]:
    # eda_df = 
# Spot-check the first rows: recruit name, athlete_id, the is_drafted flag,
# and the school the recruit committed to.
merged_df[['name','athlete_id', 'is_drafted', 'committed_to']].head() 

Unnamed: 0,name,athlete_id,is_drafted,committed_to
0,Da'Quan Bowers,,1.0,Clemson
1,Terrelle Pryor,,1.0,Ohio State
2,Julio Jones,379062.0,1.0,Alabama
3,Darrell Scott,379060.0,0.0,Colorado
4,Will Hill,381281.0,0.0,Florida


In [38]:
# Report (rows, columns) of the recruit/team table.
print(f'merged_df: {merged_df.shape}')

merged_df: (33203, 19)


In [39]:
# Bring in the dataset showing a team's success over a rolling 2-year window.
# One of the 2,000 records has a null season (cause unknown); it is filled
# with 0 so the column can be cast to int.
rolling_cols = [
    'school',
    'season',
    'wins_rolling_2year',
    'games_played_rolling_2year',
    'post_season_wins_rolling_2year',
    'point_diff_rolling_2year',
    'win_pct_rolling_2year',
]
df_team_year = (
    df_team_year
    .assign(season=lambda frame: frame['season'].fillna(0).astype(int))
    .loc[:, rolling_cols]
)

# Example: one school's seasons.
is_michigan = df_team_year['school'] == 'Michigan'
df_team_year[is_michigan]

Unnamed: 0,school,season,wins_rolling_2year,games_played_rolling_2year,post_season_wins_rolling_2year,point_diff_rolling_2year,win_pct_rolling_2year
906,Michigan,2010,,,,,
907,Michigan,2011,,,,,
908,Michigan,2012,18.0,26.0,1.0,175.0,0.692308
909,Michigan,2013,19.0,26.0,1.0,337.0,0.730769
910,Michigan,2014,15.0,26.0,0.0,200.0,0.576923
911,Michigan,2015,12.0,25.0,0.0,52.0,0.48
912,Michigan,2016,15.0,25.0,1.0,177.0,0.6
913,Michigan,2017,20.0,26.0,1.0,536.0,0.769231
914,Michigan,2018,18.0,26.0,0.0,424.0,0.692308
915,Michigan,2019,18.0,26.0,0.0,288.0,0.692308


In [40]:
# Second example school, for comparison.
is_washington_state = df_team_year['school'] == 'Washington State'
df_team_year[is_washington_state]

Unnamed: 0,school,season,wins_rolling_2year,games_played_rolling_2year,post_season_wins_rolling_2year,point_diff_rolling_2year,win_pct_rolling_2year
1890,Washington State,2010,,,,,
1891,Washington State,2011,,,,,
1892,Washington State,2012,6.0,24.0,0.0,-218.0,0.25
1893,Washington State,2013,7.0,24.0,0.0,-182.0,0.291667
1894,Washington State,2014,9.0,25.0,0.0,-179.0,0.36
1895,Washington State,2015,9.0,25.0,0.0,-101.0,0.36
1896,Washington State,2016,12.0,25.0,1.0,-32.0,0.48
1897,Washington State,2017,17.0,26.0,1.0,202.0,0.653846
1898,Washington State,2018,17.0,26.0,0.0,212.0,0.653846
1899,Washington State,2019,20.0,26.0,1.0,244.0,0.769231


In [41]:
# Hometown columns get explicit names so they are not confused with school columns.
hometown_names = {'city': 'hometown_city',
                  'country': 'hometown_country'}

# 1. Attach each school's rolling 2-year results for the recruit's class year.
# 2. Game data begins in 2010; with a 2-year lag, the earliest year with data
#    is 2012, so earlier recruiting classes are dropped.
# 3. Rename the hometown columns.
df_final = (
    merged_df
    .merge(df_team_year, how='left', left_on=['committed_to', 'year'], right_on=['school', 'season'])
    .loc[lambda frame: frame['year'] >= 2012]
    .rename(columns=hometown_names)
)
df_final.head()

Unnamed: 0,name,rating,ranking,year,position,height,weight,latitude,longitude,hometown_city,state_province,hometown_country,stars,committed_to,athlete_id,is_drafted,conference,latitude_school,longitude_school,school,season,wins_rolling_2year,games_played_rolling_2year,post_season_wins_rolling_2year,point_diff_rolling_2year,win_pct_rolling_2year
9256,Dorial Green-Beckham,0.9997,1.0,2012,WR,78.0,220.0,37.216678,-93.292037,Springfield,MO,USA,5,Missouri,531380.0,1.0,SEC,38.935849,-92.333201,Missouri,2012.0,18.0,26.0,1.0,300.0,0.692308
9257,Mario Edwards,0.9995,2.0,2012,DT,75.0,285.0,33.215039,-97.133052,Denton,TX,USA,5,Florida State,530290.0,1.0,ACC,30.438169,-84.304403,Florida State,2012.0,19.0,27.0,2.0,366.0,0.703704
9258,D.J. Humphries,0.9979,3.0,2012,OT,77.0,275.0,35.2272,-80.843083,Charlotte,NC,USA,5,Florida,-1009881.0,1.0,SEC,29.649936,-82.348579,Florida,2012.0,15.0,26.0,2.0,178.0,0.576923
9259,Shaq Thompson,0.9975,4.0,2012,S,74.0,210.0,38.581572,-121.4944,Sacramento,CA,USA,5,Washington,535329.0,1.0,Big Ten,47.650323,-122.301575,Washington,2012.0,14.0,26.0,1.0,-130.0,0.538462
9260,Noah Spence,0.9974,5.0,2012,WDE,75.0,235.0,40.266311,-76.886112,Harrisburg,PA,USA,5,Ohio State,-1027150.0,1.0,Big Ten,40.001645,-83.019727,Ohio State,2012.0,18.0,26.0,1.0,363.0,0.692308


In [42]:
# Write the final recruit/team dataset to disk; index = False keeps the
# DataFrame index out of the CSV.
df_final.to_csv('../data/M2_Final.csv', index = False)