In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/M2_final.csv')
df.head()

Unnamed: 0,rating,ranking,year,position,height,weight,latitude,longitude,state_province,stars,committed_to,is_drafted,conference,latitude_school,longitude_school
0,0.9992,1.0,2015,DT,74.0,313.0,31.578206,-84.155681,GA,5,Georgia,0.0,SEC,33.94982,-83.373381
1,0.9991,1.0,2015,DT,74.5,313.0,31.578206,-84.155681,GA,5,Georgia,0.0,SEC,33.94982,-83.373381
2,0.9879,27.0,2015,ATH,72.0,168.0,33.173177,-84.914936,GA,5,Georgia,1.0,SEC,33.94982,-83.373381
3,0.9769,48.0,2015,OLB,74.0,207.0,32.305158,-84.027407,GA,4,Georgia,1.0,SEC,33.94982,-83.373381
4,0.968,65.0,2015,SDE,76.0,265.0,33.85327,-84.220073,GA,4,Georgia,0.0,SEC,33.94982,-83.373381


In [3]:
df.shape

(22784, 15)

In [4]:
# Impute ranking with a regression using rating
# Impute hometown latitude / longitude with that state's average
# Impute height / weight with that position's mean
# Replace null positions and home state with 'unknown'

df.isna().sum()

rating                0
ranking              38
year                  0
position              1
height               28
weight               31
latitude            254
longitude           254
state_province      130
stars                 0
committed_to          0
is_drafted            0
conference            0
latitude_school       0
longitude_school      0
dtype: int64

In [5]:
df.shape

(22784, 15)

In [6]:
def lat_long_imputation(data, group_by_col, missing_data_col, backup_replace_col):
    """
    Impute missing values in `missing_data_col` using a cascade of fallbacks.

    For each row where `missing_data_col` is null, the value is filled with the
    first of the following that is available:

    1. The mean of `missing_data_col` over the rows that share the same
       `group_by_col` value (e.g. the average latitude of a recruit's state).
    2. That row's value in `backup_replace_col` (e.g. the school's latitude).
    3. The median of `missing_data_col` over the whole column.

    Rows whose `group_by_col` is null, or whose group has no known values,
    skip step 1 and fall through to step 2.

    args:
    data (dataframe): our dataframe. It is not modified; a new dataframe is returned.
    group_by_col (string): the name of the column that we will use to look up the average (it would be the state in the example above).
    missing_data_col (string): the column that has missing data we're trying to impute.
    backup_replace_col (string): the column whose value replaces entries that are still missing after the group lookup.

    returns:
    dataframe: the result of a left merge, with the same columns as `data` and
    the original row order. Because it comes from a merge on columns, the
    index of the returned frame is a fresh default index.
    """

    imputed_col = missing_data_col + '_imputed'

    # Create look-up table with the average of the column for each group.
    lookup_df = data.groupby(by = group_by_col).agg({missing_data_col: 'mean'}).reset_index()

    # Join to the lookup table; the group average arrives as '<col>_imputed'.
    data = pd.merge(left = data, right = lookup_df, how = 'left', on = group_by_col, suffixes = ('', '_imputed'))

    # Fill with the lookup table's value (the group average).
    data[missing_data_col] = data[missing_data_col].fillna(data[imputed_col])

    # If still null, fill in with the backup column.
    data[missing_data_col] = data[missing_data_col].fillna(data[backup_replace_col])

    # If STILL null, fill with the column median.
    data[missing_data_col] = data[missing_data_col].fillna(data[missing_data_col].median())

    # Drop the helper column used for imputation.
    data = data.drop(columns = imputed_col)

    return data

# Impute each hometown coordinate, using the matching school coordinate as the backup column.
for coord_col, school_coord_col in (('latitude', 'latitude_school'),
                                    ('longitude', 'longitude_school')):
    df = lat_long_imputation(df, 'state_province', coord_col, school_coord_col)

In [7]:
def height_weight_imputation(data, group_by_col, missing_data_col):
    """
    Impute missing values in `missing_data_col` from the average of its group.

    For example, if a recruit has missing height or weight, then we look up
    the average height/weight for that recruit's position.

    Values that are still missing afterwards (the group key is null, or the
    group has no known values) are filled with the median of the column.

    args:
    data (dataframe): our dataframe. It is not modified; a new dataframe is returned.
    group_by_col (string): the name of the column that we will use to look up the average (it would be the position in the example above).
    missing_data_col (string): the column that has missing data we're trying to impute.

    returns:
    dataframe: the result of a left merge, with the same columns as `data` and
    the original row order. Because it comes from a merge on columns, the
    index of the returned frame is a fresh default index.
    """

    imputed_col = missing_data_col + '_imputed'

    # Create look-up table with the average of the column for each group.
    lookup_df = data.groupby(by = group_by_col).agg({missing_data_col: 'mean'}).reset_index()

    # Join to the lookup table; the group average arrives as '<col>_imputed'.
    data = pd.merge(left = data, right = lookup_df, how = 'left', on = group_by_col, suffixes = ('', '_imputed'))

    # Fill with the imputed amount (the group average).
    data[missing_data_col] = data[missing_data_col].fillna(data[imputed_col])

    # If STILL null, fill with the column median.
    data[missing_data_col] = data[missing_data_col].fillna(data[missing_data_col].median())

    # Drop the helper column used for imputation.
    data = data.drop(columns = imputed_col)

    return data


# Impute both body measurements from the recruit's position.
for measurement_col in ('height', 'weight'):
    df = height_weight_imputation(df, 'position', measurement_col)

In [8]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

def lin_reg_imputation(data, x_col, pred_col):
    """
    Impute missing values of `pred_col` with a simple linear regression on `x_col`.

    The model is fit on every row where both `x_col` and `pred_col` are
    present, then used to predict `pred_col` for rows where it is missing.

    Rows where `x_col` is also missing cannot be predicted and are left null.
    If there is nothing to impute, or no complete rows to fit on, the
    dataframe is returned unchanged (so re-running the cell is safe).

    args:
    data (dataframe): our dataframe. NOTE: it is modified in place and also returned.
    x_col (string): the name of the single predictor column.
    pred_col (string): the column that has missing data we're trying to impute.

    returns:
    dataframe: the same `data` object, with `pred_col` filled where possible.
    """

    # Rows usable for training: both predictor and target are known.
    training_rows = data.dropna(subset=[pred_col, x_col])

    # Rows we can impute: target is missing but the predictor is known.
    can_impute = data[pred_col].isnull() & data[x_col].notnull()

    # Nothing to do (or nothing to learn from): avoid fitting/predicting on
    # empty input, which raises instead of being a no-op.
    if not can_impute.any() or training_rows.empty:
        return data

    # Train a linear regression model
    model = LinearRegression()
    model.fit(training_rows[[x_col]], training_rows[pred_col])

    # Predict missing values
    data.loc[can_impute, pred_col] = model.predict(data.loc[can_impute, [x_col]])

    return data

df = lin_reg_imputation(df, 'rating', 'ranking')



In [9]:
import pandas as pd
import numpy as np

# Function to calculate distance between two sets of coordinates taken from this stack overflow post:
# https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points

# Function to calculate distance between hometown and school (in miles)
def haversine(lat1, lon1, lat2, lon2):
    """
    Great-circle distance between two points using the haversine formula.

    All four inputs are coordinates in decimal degrees. Only numpy functions
    and arithmetic are used, so scalars and equally-shaped arrays / Series
    both work.

    args:
    lat1, lon1: latitude and longitude of the first point.
    lat2, lon2: latitude and longitude of the second point.

    returns:
    The distance in miles (same shape as the inputs).
    """
    # Radius of Earth in miles
    R = 3958.8

    # Convert degrees to radians
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    # Differences in coordinates
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine formula
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Mathematically 0 <= a <= 1, but rounding can push it slightly outside
    # for (near-)antipodal points, which would make sqrt(1 - a) NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    # Distance in miles (R is in miles)
    distance = R * c
    return distance

# Compute hometown-to-school distance for every row in one vectorized call:
# haversine only uses numpy ufuncs and arithmetic, so it accepts whole columns
# and avoids a slow Python-level row-by-row apply.
df['distance_miles'] = haversine(df['latitude'], df['longitude'], df['latitude_school'], df['longitude_school'])

In [10]:
# feature engineering...
# Map each recruiting position code to the side of the ball it plays on.
# 'OC' is offensive center, so it belongs with the other offensive linemen
# (OG / OT / IOL) rather than with the defense.
# NOTE: the 'nan' key only matches the literal string 'nan'; real missing
# positions map to NaN here.
position_off_def_dict = {'DT': 'defense', 'ATH': 'athlete', 'OLB': 'defense', 'SDE': 'defense'
                       , 'WDE': 'defense', 'S': 'defense', 'CB': 'defense'
                       , 'TE': 'offense', 'OT': 'offense', 'WR': 'offense'
                       , 'ILB': 'defense', 'OG': 'offense', 'PRO': 'offense'
                       , 'K': 'special', 'RB': 'offense', 'P': 'special', 'DUAL': 'offense', 'APB': 'offense'
                       , 'OC': 'offense', 'DL': 'defense', 'EDGE': 'defense', 'IOL': 'offense'
                       , 'LB': 'defense', 'QB': 'offense', 'FB': 'offense', 'LS': 'special', 'nan': 'unknown'}

df["side_of_ball"] = df["position"].map(position_off_def_dict)

# Map each recruiting position code to a coarser position group.
# Positions that are not in the dict (including missing ones) map to NaN.
position_group_dict = {'DT': 'd_line', 'ATH': 'athlete', 'OLB': 'linebacker', 'SDE': 'd_line'
                       , 'WDE': 'd_line', 'S': 'd_backfield', 'CB': 'd_backfield'
                       , 'TE': 'pass_catcher', 'OT': 'o_line', 'WR': 'pass_catcher'
                       , 'ILB': 'linebacker', 'OG': 'o_line', 'PRO': 'qb'
                       , 'K': 'special', 'RB': 'running_back', 'P': 'special', 'DUAL': 'qb', 'APB': 'running_back'
                       , 'OC': 'o_line', 'DL': 'd_line', 'EDGE': 'd_line', 'IOL': 'o_line'
                       , 'LB': 'linebacker', 'QB': 'qb', 'FB': 'running_back', 'LS': 'special'}

df["position_group"] = df["position"].map(position_group_dict)

In [11]:
df.sample(5)

Unnamed: 0,rating,ranking,year,position,height,weight,latitude,longitude,state_province,stars,committed_to,is_drafted,conference,latitude_school,longitude_school,distance_miles,side_of_ball,position_group
15660,0.8762,529.0,2020,WDE,76.0,230.0,38.894985,-77.036571,DC,3,Purdue,0.0,Big Ten,40.435225,-86.918684,535.965641,defense,d_line
11790,0.845,949.0,2016,TE,77.0,225.0,42.075732,-87.719377,IL,3,Northwestern,0.0,Big Ten,42.065399,-87.692475,1.553527,offense,pass_catcher
10777,0.7891,2429.0,2017,DT,77.0,330.0,38.833958,-104.825349,CO,2,Colorado,0.0,Big 12,40.009475,-105.266905,84.571168,defense,d_line
12759,0.8104,2190.0,2021,SDE,74.0,240.0,33.749099,-84.390185,GA,3,Ohio,0.0,Mid-American,39.321279,-82.103431,405.353222,defense,d_line
14635,0.816,1890.0,2018,OLB,71.0,205.0,33.85327,-84.220073,GA,3,Iowa State,0.0,Big 12,42.013998,-93.635772,761.409802,defense,linebacker


In [12]:
def fill_col_unknown(data, unknown_col_name):
    """
    Replace the null entries of one column with the string 'unknown'.

    args:
    data (dataframe): our dataframe. NOTE: the column is overwritten on this
        same object, so the caller's dataframe is modified in place.
    unknown_col_name (string): the column whose nulls are replaced.

    returns:
    dataframe: the same `data` object that was passed in.
    """
    filled_column = data[unknown_col_name].fillna('unknown')
    data[unknown_col_name] = filled_column
    return data

# Replace remaining nulls in the categorical columns with 'unknown'.
# The result is bound back to `df` (the frame every later cell reads) rather
# than to a separate, otherwise unused name.
for categorical_col in ('position', 'state_province', 'side_of_ball', 'position_group'):
    df = fill_col_unknown(df, categorical_col)

In [13]:
df.isna().sum()

rating              0
ranking             0
year                0
position            0
height              0
weight              0
latitude            0
longitude           0
state_province      0
stars               0
committed_to        0
is_drafted          0
conference          0
latitude_school     0
longitude_school    0
distance_miles      0
side_of_ball        0
position_group      0
dtype: int64

In [14]:
df.shape

(22784, 18)

In [15]:
df.sample(5)

Unnamed: 0,rating,ranking,year,position,height,weight,latitude,longitude,state_province,stars,committed_to,is_drafted,conference,latitude_school,longitude_school,distance_miles,side_of_ball,position_group
17377,0.779,2714.0,2017,OLB,73.0,205.0,39.758948,-84.191607,OH,2,Ball State,0.0,Mid-American,40.215942,-85.416115,72.104401,defense,linebacker
3163,0.8595,884.0,2019,TE,77.0,215.0,30.165471,-81.701635,FL,3,Baylor,0.0,Big 12,31.558201,-97.11567,918.49831,offense,pass_catcher
11684,0.8516,1086.0,2019,ATH,73.0,190.0,33.596568,-117.659405,CA,3,Oregon State,0.0,Pac-12,44.559456,-123.281434,814.761526,athlete,athlete
13840,0.8826,530.0,2022,DL,74.0,250.0,26.122308,-80.143379,FL,3,Indiana,0.0,Big Ten,39.180896,-86.525622,974.971734,defense,d_line
19284,0.8663,857.0,2022,QB,76.0,205.0,36.030113,-114.982619,NV,3,UNLV,0.0,Mountain West,36.167256,-115.148516,13.250253,offense,qb


In [16]:
df.to_csv('imputed_dataset.csv', index = False)