## Is a high school recruiting rating a good indicator of a high-performing athlete?  


In [1]:
# Uncomment to get the correct versions
# from IPython.display import clear_output
# !pip install -r requirements.txt
# clear_output() 


In [2]:
import numpy as np
import pandas as pd
import altair as alt
import cfbd
import warnings 

# Notebook-wide settings.
# Render every column of a DataFrame instead of truncating wide frames.
pd.options.display.max_columns = None
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings(action='ignore')


In [3]:
# Load the two raw inputs (recruiting records and draft records) from the
# shared data directory.
recruits, draft = (
    pd.read_csv('../../data/recruits.csv'),
    pd.read_csv('../../data/draft.csv'),
)

In [4]:
# Attach draft records to every recruit by matching on the player's name.
# The left join keeps recruits that have no match in the draft data (their
# draft columns are left as NaN).
# Sometimes two players share the same name, which creates duplicate rows:
# about 2k of the 39k records are duplicated due to shared names.

merged_df = recruits.merge(draft, how='left', left_on='name', right_on='Player')

In [5]:
# Columns carried forward from the joined table.
col = [
    # recruit attributes
    'name', 'rating', 'ranking', 'recruit_type', 'year', 'position',
    'height', 'weight', 'latitude', 'longitude', 'state_province', 'stars',
    'committed_to', 'athlete_id',
    # draft fields (NaN when the recruit's name has no match in the draft data)
    'Rnd', 'Pick', 'Player', 'draft_year', 'College/Univ',
]

merged_df = merged_df[col]
merged_df.head()

Unnamed: 0,name,rating,ranking,recruit_type,year,position,height,weight,latitude,longitude,state_province,stars,committed_to,athlete_id,Rnd,Pick,Player,draft_year,College/Univ
0,Trenton Thompson,0.9992,1.0,HighSchool,2015,DT,74.0,313.0,31.578206,-84.155681,GA,5,Georgia,3915192.0,,,,,
1,Trent Thompson,0.9991,1.0,HighSchool,2015,DT,74.5,313.0,31.578206,-84.155681,GA,5,Georgia,,,,,,
2,Martez Ivey,0.999,2.0,HighSchool,2015,OT,77.5,275.0,28.677968,-81.511521,FL,5,Florida,-1009710.0,,,,,
3,Byron Cowart,0.9987,3.0,HighSchool,2015,SDE,76.0,250.0,27.998541,-82.274884,FL,5,Auburn,3916922.0,5.0,159.0,Byron Cowart,2019.0,Maryland
4,Iman Marshall,0.9985,4.0,HighSchool,2015,CB,73.0,190.0,33.769016,-118.191605,CA,5,USC,3912545.0,4.0,127.0,Iman Marshall,2019.0,USC


In [6]:
# (rows, columns) of the joined table after the column selection above.
merged_df.shape

(31886, 19)

In [7]:
# Check whether two strings share at least 4 consecutive characters.
# Returns 1 if the school a recruit committed to matches the college they were drafted from, otherwise 0.
def has_common_sequence(str1, str2, min_seq_length=4):
    """
    Return 1 if two school names share a run of consecutive characters, else 0.

    Context: by joining from recruit data to draft data on the player's name,
    we create a many-to-many relationship. This occurs because some (but very
    few) recruits appear twice in the dataset (ex - Ron Smith), and football
    players sometimes have the same name (ex - David Long). This function
    supplies the signal used to break those ties: it compares the school a
    recruit committed to with the college they were drafted from, so the
    caller can prefer the draft row whose college matches.

    More information can be found here in our milestone I project on slide 5:
    https://docs.google.com/presentation/d/1_CfHYqeOniPscvbb8VfQqQUgyf4xCSeC1spL_9M0ejw/edit#slide=id.g2b8248144f9_0_127

    Args:
        str1: First school name, or a missing value (NaN / None).
        str2: Second school name, or a missing value (NaN / None).
        min_seq_length: Length of the shared character run required for a
            match. Defaults to 4.

    Returns:
        1 if any `min_seq_length`-character slice of `str1` occurs in `str2`,
        or if either name is shorter than `min_seq_length` and the two names
        are exactly equal; otherwise 0. Returns 0 when either input is missing.
        The comparison is case-sensitive.
    """
    if pd.isna(str1) or pd.isna(str2):
        return 0

    # A name shorter than the window cannot contain a window-sized run, so
    # short school names such as "USC" or "LSU" would never match, not even
    # themselves. Fall back to exact equality for those.
    if min(len(str1), len(str2)) < min_seq_length:
        return int(str1 == str2)

    # Slide a window of min_seq_length characters across str1 and look for it
    # anywhere in str2.
    last_start = len(str1) - min_seq_length
    return int(any(
        str1[start:start + min_seq_length] in str2
        for start in range(last_start + 1)
    ))

# Flag every joined row whose committed-to school shares a character run with
# the college on the matched draft record.
merged_df['CommonSequence'] = merged_df[['committed_to', 'College/Univ']].apply(
    lambda pair: has_common_sequence(pair['committed_to'], pair['College/Univ']),
    axis=1,
)

# Number the rows inside each (name, rating) group, starting at 1. Rows are
# ordered by name, then committed_to, then CommonSequence descending, so a row
# whose college matches is numbered ahead of one that does not.
merged_df['RN'] = (
    merged_df
    .sort_values(by=['name', 'committed_to', 'CommonSequence'], ascending=[True, True, False])
    .groupby(['name', 'rating'])
    .cumcount()
    .add(1)
)

#col = ['name', 'rating', 'stars', 'committed_to', 'athlete_id', 'Rnd', 'Pick', 'Player', 'draft_year', 'College/Univ', 'CommonSequence', 'RN']
#merged_df = merged_df[col]

In [8]:
# Examples of de-duplication logic in action

# merged_df[merged_df['name'] == 'Ron Smith']
# merged_df[merged_df['name'] == 'David Long']

In [9]:
# Remove duplicates: keep only the first-ranked row (RN == 1) of each group.
# The explicit .copy() makes the filtered frame independent of the original,
# so the column assignment below is a plain write rather than a write to a
# slice (the pattern that triggers pandas' SettingWithCopyWarning).

merged_df = merged_df[merged_df['RN'] == 1].copy()

# Label: 1.0 when the row carries a draft round, 0.0 when it does not.
# NOTE(review): a kept row counts as drafted even when CommonSequence is 0,
# i.e. when the name matched but the college did not - confirm this is intended.
merged_df['is_drafted'] = np.where(merged_df['Rnd'].isna(), 0.0, 1.0)

In [10]:
# Get features about the school: its conference and the school's
# latitude/longitude columns.

df_teams = pd.read_csv('../../data/teams.csv')[
    ['school', 'conference', 'latitude_school', 'longitude_school']
]
df_teams.head()

Unnamed: 0,school,conference,latitude_school,longitude_school
0,Air Force,Mountain West,38.99697,-104.843616
1,Akron,Mid-American,41.072553,-81.508341
2,Alabama,SEC,33.208275,-87.550384
3,Appalachian State,Sun Belt,36.211427,-81.685428
4,Arizona,Pac-12,32.228805,-110.948868


In [11]:
# Add team features to our recruit-draft dataset.
# This is an inner join (pandas' default, spelled out here): rows whose
# committed_to value has no matching school in df_teams are dropped.

merged_df = merged_df.merge(df_teams, how='inner', left_on='committed_to', right_on='school')

In [12]:
# Drop unneeded columns:
drop_cols = [
    # recruit identifiers
    'name', 'recruit_type', 'athlete_id',
    # raw draft fields (is_drafted was derived from Rnd)
    'Rnd', 'Pick', 'Player', 'draft_year', 'College/Univ',
    # de-duplication helpers
    'CommonSequence', 'RN',
    # join key from df_teams (same value as committed_to after the join)
    'school',
]

merged_df = merged_df.drop(drop_cols, axis='columns')

# Preview five random rows (no seed, so the rows differ between runs).
merged_df.sample(n=5)

Unnamed: 0,rating,ranking,year,position,height,weight,latitude,longitude,state_province,stars,committed_to,is_drafted,conference,latitude_school,longitude_school
1304,0.8773,519.0,2018,DT,75.0,290.0,30.289083,-91.234274,LA,3,LSU,0.0,SEC,30.412035,-91.183816
7661,0.8816,468.0,2018,ATH,77.0,260.0,40.379791,-79.809492,PA,3,West Virginia,0.0,Big 12,39.650274,-79.955187
4894,0.8958,380.0,2023,IOL,75.0,280.0,37.271395,-107.881598,CO,4,Oklahoma,0.0,Big 12,35.205854,-97.442314
19217,0.8255,1679.0,2018,WDE,76.0,241.0,34.053691,-118.242767,CA,3,UNLV,0.0,Mountain West,36.167256,-115.148516
15375,0.7681,3113.0,2017,CB,68.5,176.0,34.8785,-76.901446,NC,2,Navy,0.0,American Athletic,38.984699,-76.507633


In [13]:
# Write the final table to disk (without the index) and report its shape.
merged_df.to_csv('../../data/M2_final.csv', index=False)
print(f'merged_df: {merged_df.shape}')

merged_df: (22729, 15)


In [14]:
# Spot-check five random rows of the table that was just saved
# (no seed, so the rows differ between runs).
merged_df.sample(n=5)

Unnamed: 0,rating,ranking,year,position,height,weight,latitude,longitude,state_province,stars,committed_to,is_drafted,conference,latitude_school,longitude_school
9749,0.8661,937.0,2023,ATH,73.0,185.0,41.833365,-87.867732,IL,3,Wisconsin,0.0,Big Ten,43.06994,-89.412694
16356,0.8281,1735.0,2021,OLB,72.0,201.0,30.0666,-90.4801,LA,3,Tulane,0.0,American Athletic,29.944616,-90.116692
13181,0.8304,1363.0,2016,OLB,75.0,195.0,39.360059,-84.309939,OH,3,Cincinnati,0.0,Big 12,39.13125,-84.516191
16114,0.859,638.0,2017,ATH,73.0,195.0,32.509311,-92.119301,LA,3,Louisiana Tech,0.0,Conference USA,32.532149,-92.656002
1784,0.7897,3297.0,2020,K,72.0,180.0,33.596568,-117.659405,CA,2,UCLA,0.0,Pac-12,34.161328,-118.167646
