In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,accuracy_score,classification_report
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")

In [5]:
# Load the two raw CSV exports used throughout the notebook: the college
# statistics table and the NFL draft prospects table (read in that order).
college_statistics, draft_prospects = (
    pd.read_csv(csv_path)
    for csv_path in (
        "archive/college_statistics.csv",
        "archive/nfl_draft_prospects.csv",
    )
)



In [6]:
# Inspect the raw draft prospects table; as the cell's last expression it is
# rendered as the cell output.
draft_prospects

Unnamed: 0,draft_year,player_id,player_name,position,pos_abbr,school,school_name,school_abbr,link,pick,...,team,team_abbr,team_logo_espn,guid,weight,height,pos_rk,ovr_rk,grade,player_image
0,1967,23590,Bubba Smith,Defensive End,DE,Michigan State,Spartans,MSU,http://insider.espn.com/nfl/draft/player/_/id/...,1.0,...,Baltimore Colts,IND,https://a.espncdn.com/i/teamlogos/nfl/500/scor...,,,,,,,
1,1967,23591,Clinton Jones,Running Back,RB,Michigan State,Spartans,MSU,http://insider.espn.com/nfl/draft/player/_/id/...,2.0,...,Minnesota Vikings,MIN,https://a.espncdn.com/i/teamlogos/nfl/500/scor...,,,,,,,
2,1967,23592,Steve Spurrier,Quarterback,QB,Florida,Gators,FLA,http://insider.espn.com/nfl/draft/player/_/id/...,3.0,...,San Francisco 49ers,SF,https://a.espncdn.com/i/teamlogos/nfl/500/scor...,,,,,,,
3,1967,23593,Bob Griese,Quarterback,QB,Purdue,Boilermakers,PUR,http://insider.espn.com/nfl/draft/player/_/id/...,4.0,...,Miami Dolphins,MIA,https://a.espncdn.com/i/teamlogos/nfl/500/scor...,,,,,,,
4,1967,23594,George Webster,Linebacker,LB,Michigan State,Spartans,MSU,http://insider.espn.com/nfl/draft/player/_/id/...,5.0,...,Houston Oilers,TEN,https://a.espncdn.com/i/teamlogos/nfl/500/scor...,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13349,2021,105466,Justus Reed,Defensive End,DE,Virginia Tech,Hokies,VT,http://insider.espn.com/nfl/draft/player/_/id/...,,...,,,,841ed58127d19410cc2849ea8f3e1070,253.0,75.000,31.0,344.0,30.0,
13350,2021,105467,K.J. Costello,Quarterback,QB,Mississippi State,Bulldogs,MSST,http://insider.espn.com/nfl/draft/player/_/id/...,,...,,,,151d962cf4bf1862bcef120a11880e57,227.0,76.625,15.0,345.0,30.0,
13351,2021,105468,Donovan Stiner,Safety,S,Florida,Gators,FLA,http://insider.espn.com/nfl/draft/player/_/id/...,,...,,,,d12830501cce353220df8c6bd2b050fc,205.0,73.500,29.0,346.0,30.0,
13352,2021,105478,Mac McCain III,Cornerback,CB,North Carolina A&T,Aggies,NCAT,http://insider.espn.com/nfl/draft/player/_/id/...,,...,,,,8d1939ce8d84d85651dcff926bebc08b,186.0,71.000,44.0,348.0,30.0,https://a.espncdn.com/i/headshots/nfldraft/pla...


In [8]:
# Reshape the long statistics table into a wide one: every distinct value of
# `statistic` becomes its own column filled from `value`, and each unique
# combination of the identifying fields below becomes a single row.
# pivot_table aggregates duplicates with its default (mean).
player_season_keys = [
    'player_id', 'alt_player_id', 'player_name', 'pos_abbr', 'school', 'school_abbr',
    'school_primary_color', 'school_alt_color', 'season', 'active', 'all_star',
]

college_statistics_pivot = (
    college_statistics
    .pivot_table(values='value', index=player_season_keys, columns='statistic')
    .reset_index()  # turn the identifying fields back into ordinary columns
)


In [9]:
# Keep a single row per player: the one with the largest `season` value.
# idxmax returns, for each player_id, the row label of that maximum, and
# .loc then pulls exactly those rows out of the pivoted frame.
season_by_player = college_statistics_pivot.groupby('player_id')['season']
idx = season_by_player.idxmax()
college_statistics_filtered = college_statistics_pivot.loc[idx]

In [11]:
# Inspect the one-row-per-player statistics frame; as the cell's last
# expression it is rendered as the cell output.
college_statistics_filtered

statistic,player_id,alt_player_id,player_name,pos_abbr,school,school_abbr,school_primary_color,school_alt_color,season,active,...,Solo Tackles,Total Kicking Points,Total Points,Total Sacks,Total Tackles,Total Touchdowns,Total Two Point Conversions,Yards Per Pass Attempt,Yards Per Reception,Yards Per Rush Attempt
0,368,3924331,Justin Smith,DE,Missouri,MIZ,#000000,#000000,2016,True,...,,,,,,,,,7.0,
2,4019,4239824,Bryan Thomas,DE,UAB,UAB,#003b28,#ffc845,2018,True,...,50.0,,,,68.0,,,,,
5,4542,4240031,Derrick Brooks,LB,Florida State,FSU,#782F40,#ceb888,2020,True,...,4.0,,,,8.0,,,,,
8,4559,4240091,Joe Johnson,DE,Louisville,LOU,#ad000a,#cccccc,2020,True,...,,,,,,,,,10.0,
11,14420,3915189,Royce Smith,OG,Georgia,UGA,#CC0000,#000000,2017,True,...,85.0,,,,137.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3595,105462,4256041,Manny Jones,DE,Colorado State,CSU,#004537,#ffc425,2020,True,...,7.0,,,,13.0,,,,,
3599,105464,4258170,Tarik Black,WR,Texas,TEX,#EE7524,#f0f0f0,2020,True,...,,,,,,,,,24.0,
3603,105465,4046532,Quinn Nordin,PK,Michigan,MICH,#00274c,#00274c,2020,True,...,,18.0,,,,,,,,
3607,105466,3121655,Justus Reed,DE,Virginia Tech,VT,#74232D,#c2c1ba,2020,True,...,17.0,,,,29.0,,,,,


The college statistics only cover the following seasons:

- 2014
- 2015
- 2016
- 2017
- 2018
- 2019
- 2020

However, the draft picks go back to 1967, so we need to drop the draft years that have no matching college statistics for the merged data to be even somewhat useful.

In [37]:
# Restrict the prospects to the draft years 2014 (inclusive) up to, but not
# including, 2021.
threshold_value = 2014
column_to_check = 'draft_year'  # Adjust this column as needed

draft_years = draft_prospects[column_to_check]
in_year_window = (draft_years >= threshold_value) & (draft_years < 2021)
draft_prospects_filter = draft_prospects[in_year_window]

In [38]:
# Inspect the year-filtered prospects table; as the cell's last expression it
# is rendered as the cell output.
draft_prospects_filter

Unnamed: 0,draft_year,player_id,player_name,position,pos_abbr,school,school_name,school_abbr,link,pick,...,team,team_abbr,team_logo_espn,guid,weight,height,pos_rk,ovr_rk,grade,player_image
10381,2014,33597,Jadeveon Clowney,Defensive End,DE,South Carolina,Gamecocks,SC,http://insider.espn.com/nfl/draft/player/_/id/...,1.0,...,Houston Texans,HOU,https://a.espncdn.com/i/teamlogos/nfl/500/scor...,83ac33664399764944a12a77437351a6,266.0,77.000,1.0,1.0,98.0,
10382,2014,34748,Greg Robinson,Offensive Tackle,OT,Auburn,Tigers,AUB,http://insider.espn.com/nfl/draft/player/_/id/...,2.0,...,St. Louis Rams,STL,https://a.espncdn.com/i/teamlogos/nfl/500/scor...,02d871f9c875e0422dcc75236fa8e7cc,332.0,77.000,1.0,2.0,97.0,
10383,2014,34717,Blake Bortles,Quarterback,QB,UCF,Knights,UCF,http://insider.espn.com/nfl/draft/player/_/id/...,3.0,...,Jacksonville Jaguars,JAX,https://a.espncdn.com/i/teamlogos/nfl/500/scor...,71ac24a6d3424966b5c6f006808d0d9c,232.0,77.000,1.0,18.0,91.0,
10384,2014,33609,Sammy Watkins,Wide Receiver,WR,Clemson,Tigers,CLEM,http://insider.espn.com/nfl/draft/player/_/id/...,4.0,...,Buffalo Bills,BUF,https://a.espncdn.com/i/teamlogos/nfl/500/scor...,c0cc8f4d3a91a03a72bc66f9f0b268eb,211.0,73.000,1.0,4.0,96.0,
10385,2014,34108,Khalil Mack,Outside Linebacker,OLB,Buffalo,Bulls,BUFF,http://insider.espn.com/nfl/draft/player/_/id/...,5.0,...,Oakland Raiders,OAK,https://a.espncdn.com/i/teamlogos/nfl/500/scor...,ff7e62d0e88872c8ffe62d189c5ecdd2,251.0,75.000,1.0,3.0,96.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12983,2020,104149,David Dowell,Safety,S,Michigan State,Spartans,MSU,http://insider.espn.com/nfl/draft/player/_/id/...,,...,,,,ce03b6d3085022537b4de74d855505f9,206.0,72.500,25.0,301.0,31.0,https://a.espncdn.com/i/headshots/nfldraft/pla...
12984,2020,104197,Trevon Hill,Defensive End,DE,Miami,Hurricanes,MIA,http://insider.espn.com/nfl/draft/player/_/id/...,,...,,,,7b98ea40cdb58193165965cc0a864452,248.0,74.875,28.0,302.0,31.0,https://a.espncdn.com/i/headshots/nfldraft/pla...
12985,2020,104403,Darius Anderson,Running Back,RB,TCU,Horned Frogs,TCU,http://insider.espn.com/nfl/draft/player/_/id/...,,...,,,,43f7ee51c02aeb61a7f78fc99bce9807,208.0,70.500,30.0,303.0,31.0,https://a.espncdn.com/i/headshots/nfldraft/pla...
12986,2020,104661,Nevelle Clarke,Cornerback,CB,UCF,Knights,UCF,http://insider.espn.com/nfl/draft/player/_/id/...,,...,,,,273551d0dbb5f7d5a322b6a141a4fcd6,190.0,72.875,32.0,305.0,31.0,


In [39]:
# Left join on player_id: every row of the college statistics frame is kept,
# and the draft columns are NaN wherever no prospect row matches.
merged_df = college_statistics_filtered.merge(
    draft_prospects_filter,
    how='left',
    on='player_id',
)

We use a left join because we want to keep every player who has college statistics. If a player has an NA draft pick number, we treat them as not selected, and that's useful information for us! (Players with no matching row in the filtered draft table also end up with NA here.)

In [40]:
# Boolean target: True when the player has a draft pick number, False when
# `pick` is missing.
merged_df['picked'] = merged_df['pick'].notna()

In [41]:
# Count rows per position (despite its name, merged_df_1 is a Series of counts).
merged_df_1 = merged_df['pos_abbr_x'].value_counts()

# Positions with fewer than 10 rows are dropped from the analysis.
categories_to_drop = merged_df_1.loc[merged_df_1.lt(10)].index

is_rare_position = merged_df['pos_abbr_x'].isin(categories_to_drop)
df_filtered = merged_df[~is_rare_position]

In [42]:
# Keep only the numeric columns, then append the position label so that later
# cells can still split the rows by position.
numeric_df = (
    df_filtered
    .select_dtypes(include='number')
    .assign(pos_abbr_x=df_filtered['pos_abbr_x'])
)

In [43]:
# Regression setup: predict the draft round from the numeric columns.
#
# Players without a `round` value get this sentinel instead, i.e. they are
# treated as one round later than any real round (presumably the draft has
# 7 rounds in these years -- confirm against the data).
UNDRAFTED_ROUND = 8

X = numeric_df.drop(columns=['pick', 'round'])  # Features
# fillna returns a new Series, so df_filtered['round'] itself is left untouched
# and this cell gives the same result every time it is re-run.
y = df_filtered['round'].fillna(UNDRAFTED_ROUND)  # Target variable

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One model is fitted per position; `models` maps position -> fitted model.
positions = df_filtered['pos_abbr_x'].unique()
models = {}


# Feature columns used for each position's model. The lists are assembled
# from a few shared building blocks; the resulting column order per position
# is significant because it is the order the model sees the features in.
body_size = ['weight', 'height']
pass_rush = ['Forced Fumbles', 'Sacks', 'Solo Tackles', 'Total Sacks', 'Total Tackles']
receiving = ['Receiving Touchdowns', 'Receiving Yards', 'Receptions']
rushing = ['Rushing Attempts', 'Rushing Touchdowns', 'Rushing Yards']
interception = ['Interception Touchdowns', 'Interception Yards', 'Interceptions']
passing = [
    'Completion Percentage', 'Completions', 'Passer Rating', 'Passing Attempts',
    'Passing Touchdowns', 'Passing Yards', 'Yards Per Pass Attempt',
]
punting = [
    'Gross Average Punt Yards', 'Punt Return Fair Catches', 'Punt Return Touchdowns',
    'Punt Return Yards', 'Punt Returns', 'Punt Yards', 'Punts',
]
kicking = [
    'Extra Points Made', 'FGM 1-19 yards', 'FGM 20-29 yards', 'FGM 30-39 yards',
    'FGM 40-49 yards', 'FGM 50+ yards', 'Field Goal Attempts', 'Field Goal Made',
    'Field Goal Percentage', 'Field Goals', 'Kick Extra Points', 'Total Kicking Points',
    'Total Points', 'Yards Per Rush Attempt',
]

metrics_dict = {
    'DE': pass_rush + body_size + ['Assist Tackles'],
    'OG': ['Total Tackles'] + body_size + ['Assist Tackles', 'Solo Tackles'],
    'WR': receiving + body_size,
    'QB': passing + body_size,
    'C': ['Total Tackles'] + body_size + ['Assist Tackles'],
    'S': interception + ['Solo Tackles', 'Total Tackles'] + body_size,
    'TE': receiving + body_size,
    'CB': interception + ['Passes Defended', 'Solo Tackles', 'Total Tackles'] + body_size,
    'RB': list(rushing),  # the only position without weight/height
    'DT': pass_rush + body_size + ['Assist Tackles'],
    'OLB': pass_rush + body_size,
    'OT': ['Total Tackles', 'Assist Tackles'] + body_size + ['Solo Tackles'],
    'ILB': pass_rush + ['Assist Tackles'] + body_size,
    'FB': rushing + body_size,
    'P': punting + body_size,
    'PK': kicking + body_size,
    'LS': ['Total Tackles', 'Assist Tackles'] + body_size + ['Solo Tackles'],
}

a_list = []  # not used by this regression cell
m_list = []  # per-position test MSE, appended by the evaluation loop

# Fit one LinearRegression per position, using only that position's training
# rows and only the feature columns listed for it in metrics_dict.
for position in positions:
    train_mask = X_train['pos_abbr_x'] == position
    if not train_mask.any():
        # The random split can leave a position without any training rows;
        # fitting on an empty frame raises, so skip it explicitly.
        print(f"Skipping {position}: no training rows")
        continue

    selected_metrics = metrics_dict[position]

    # Missing statistics are treated as 0. fillna returns a new frame, so the
    # slice of X_train is never modified in place.
    # NOTE(review): 'weight' and 'height' appear to come from the draft table
    # (they carry no merge suffix), so after the left join they would be
    # NaN -> 0 for every player without a prospect row -- the same players who
    # have no `round`. That would leak the target; confirm and consider a
    # different imputation.
    X_train_pos = X_train.loc[train_mask, selected_metrics].fillna(0)
    y_train_pos = y_train[train_mask]

    model = LinearRegression()
    model.fit(X_train_pos, y_train_pos)

    # Save the model
    models[position] = model

# Step 4: Model Evaluation
# Score every fitted model on its own position's held-out rows, print the MSE
# and the five coefficients with the largest absolute value, and collect the
# per-position MSE in m_list.
for position, model in models.items():
    test_mask = X_test['pos_abbr_x'] == position
    if not test_mask.any():
        # mean_squared_error raises on an empty sample, so report and move on.
        print(f"Position: {position}")
        print("No test rows for this position; skipping evaluation.")
        print()
        continue

    selected_metrics = metrics_dict[position]
    # Same preprocessing as in training: selected columns only, NaN -> 0,
    # without mutating X_test.
    X_test_pos = X_test.loc[test_mask, selected_metrics].fillna(0)
    y_test_pos = y_test[test_mask]

    y_pred = model.predict(X_test_pos)

    mse = mean_squared_error(y_test_pos, y_pred)
    m_list.append(mse)

    print(f"Position: {position}")
    print(f"Mean Squared Error: {mse}")
    print("Top 5 Coefficients:")
    if isinstance(model, LinearRegression):
        # Features are not scaled, so coefficient magnitudes depend on each
        # feature's units and are only a rough indication of importance.
        sorted_coef = sorted(
            zip(X_test_pos.columns, model.coef_),
            key=lambda pair: abs(pair[1]),
            reverse=True,
        )

        for feature, coef in sorted_coef[:5]:
            print(f"{feature}: {coef}")
    print()

# Unweighted mean of the per-position errors (each position counts equally,
# regardless of how many test rows it has).
print(np.mean(m_list))


Position: DE
Mean Squared Error: 5.811735618288727
Top 5 Coefficients:
Forced Fumbles: -0.37636395921400473
Sacks: -0.06041191914891572
height: 0.03618384889167752
Solo Tackles: 0.03356094058529221
Assist Tackles: -0.03127983458894953

Position: OG
Mean Squared Error: 167.77584884194263
Top 5 Coefficients:
Solo Tackles: -0.5469080213670998
Assist Tackles: 0.44868555088216183
height: -0.23232896478570672
Total Tackles: -0.09822247048493825
weight: 0.047168159167069776

Position: WR
Mean Squared Error: 5.442043865002823
Top 5 Coefficients:
height: 0.1546828352376846
weight: -0.06562540661754336
Receiving Touchdowns: -0.05740441837472573
Receptions: 0.020903305223393346
Receiving Yards: -0.0018039653038272518

Position: QB
Mean Squared Error: 8.111898603458142
Top 5 Coefficients:
Yards Per Pass Attempt: 5.082294060913293
Passer Rating: -0.42695965776344624
Completion Percentage: 0.39032978930123086
Passing Touchdowns: 0.3527432938455145
Completions: 0.05572121897540161

Position: C
Mean S

In [57]:

# Classification setup: predict whether a player was picked at all.
X = numeric_df.drop(columns=['pick', 'round'])  # Features
# `picked` is derived from whether `pick` is missing, so it has no missing
# values of its own and needs no fill. Taking a copy keeps df_filtered safe
# from any later in-place change to y.
y = df_filtered['picked'].copy()  # Target variable

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One model is fitted per position; `models` maps position -> fitted model.
positions = df_filtered['pos_abbr_x'].unique()
models = {}


# Feature columns used for each position's model. The lists are assembled
# from a few shared building blocks; the resulting column order per position
# is significant because it is the order the model sees the features in.
body_size = ['weight', 'height']
pass_rush = ['Forced Fumbles', 'Sacks', 'Solo Tackles', 'Total Sacks', 'Total Tackles']
receiving = ['Receiving Touchdowns', 'Receiving Yards', 'Receptions']
rushing = ['Rushing Attempts', 'Rushing Touchdowns', 'Rushing Yards']
interception = ['Interception Touchdowns', 'Interception Yards', 'Interceptions']
passing = [
    'Completion Percentage', 'Completions', 'Passer Rating', 'Passing Attempts',
    'Passing Touchdowns', 'Passing Yards', 'Yards Per Pass Attempt',
]
punting = [
    'Gross Average Punt Yards', 'Punt Return Fair Catches', 'Punt Return Touchdowns',
    'Punt Return Yards', 'Punt Returns', 'Punt Yards', 'Punts',
]
kicking = [
    'Extra Points Made', 'FGM 1-19 yards', 'FGM 20-29 yards', 'FGM 30-39 yards',
    'FGM 40-49 yards', 'FGM 50+ yards', 'Field Goal Attempts', 'Field Goal Made',
    'Field Goal Percentage', 'Field Goals', 'Kick Extra Points', 'Total Kicking Points',
    'Total Points', 'Yards Per Rush Attempt',
]

metrics_dict = {
    'DE': pass_rush + body_size + ['Assist Tackles'],
    'OG': ['Total Tackles'] + body_size + ['Assist Tackles', 'Solo Tackles'],
    'WR': receiving + body_size,
    'QB': passing + body_size,
    'C': ['Total Tackles'] + body_size + ['Assist Tackles'],
    'S': interception + ['Solo Tackles', 'Total Tackles'] + body_size,
    'TE': receiving + body_size,
    'CB': interception + ['Passes Defended', 'Solo Tackles', 'Total Tackles'] + body_size,
    'RB': list(rushing),  # the only position without weight/height
    'DT': pass_rush + body_size + ['Assist Tackles'],
    'OLB': pass_rush + body_size,
    'OT': ['Total Tackles', 'Assist Tackles'] + body_size + ['Solo Tackles'],
    'ILB': pass_rush + ['Assist Tackles'] + body_size,
    'FB': rushing + body_size,
    'P': punting + body_size,
    'PK': kicking + body_size,
    'LS': ['Total Tackles', 'Assist Tackles'] + body_size + ['Solo Tackles'],
}

a_list = []  # per-position test accuracy, appended by the evaluation loop
m_list = []  # not used by this classification cell

# Fit one RandomForestClassifier per position, using only that position's
# training rows and only the feature columns listed for it in metrics_dict.
for position in positions:
    train_mask = X_train['pos_abbr_x'] == position
    if not train_mask.any():
        # The random split can leave a position without any training rows;
        # fitting on an empty frame raises, so skip it explicitly.
        print(f"Skipping {position}: no training rows")
        continue

    # Select metrics for the specific position
    selected_metrics = metrics_dict[position]

    # Missing statistics are treated as 0. fillna returns a new frame, so the
    # slice of X_train is never modified in place.
    # NOTE(review): 'weight' and 'height' appear to come from the draft table
    # (they carry no merge suffix), so after the left join they would be
    # NaN -> 0 for every player without a prospect row -- all of whom are
    # labelled not picked. That would leak the target; confirm and consider a
    # different imputation.
    X_train_pos = X_train.loc[train_mask, selected_metrics].fillna(0)
    y_train_pos = y_train[train_mask]

    # Build model. Each tree is grown on a bootstrap sample of at most 7 rows;
    # the cap is clamped because an integer max_samples larger than the number
    # of training rows is rejected by scikit-learn. A fixed random_state makes
    # the forest, and therefore the reported scores, reproducible.
    model = RandomForestClassifier(
        max_depth=20,
        bootstrap=True,
        max_samples=min(7, len(X_train_pos)),
        random_state=42,
    )
    model.fit(X_train_pos, y_train_pos)

    # Save the model
    models[position] = model

# Step 4: Model Evaluation
# Score every fitted classifier on its own position's held-out rows, print the
# accuracy and the per-class report, and collect the accuracies in a_list.
for position, model in models.items():
    test_mask = X_test['pos_abbr_x'] == position
    if not test_mask.any():
        # The metric functions raise on an empty sample, so report and move on.
        print(f"Position: {position}")
        print("No test rows for this position; skipping evaluation.")
        print()
        continue

    selected_metrics = metrics_dict[position]
    # Same preprocessing as in training: selected columns only, NaN -> 0,
    # without mutating X_test.
    X_test_pos = X_test.loc[test_mask, selected_metrics].fillna(0)
    y_test_pos = y_test[test_mask]

    y_pred = model.predict(X_test_pos)

    accuracy = accuracy_score(y_test_pos, y_pred)
    a_list.append(accuracy)
    report = classification_report(y_test_pos, y_pred)

    print(f"Position: {position}")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)

# Unweighted mean of the per-position accuracies (each position counts
# equally, regardless of how many test rows it has).
print(np.mean(a_list))

Position: DE
Accuracy: 0.8
Classification Report:
              precision    recall  f1-score   support

       False       1.00      0.60      0.75        10
        True       0.71      1.00      0.83        10

    accuracy                           0.80        20
   macro avg       0.86      0.80      0.79        20
weighted avg       0.86      0.80      0.79        20

Position: OG
Accuracy: 0.875
Classification Report:
              precision    recall  f1-score   support

       False       1.00      0.75      0.86         4
        True       0.80      1.00      0.89         4

    accuracy                           0.88         8
   macro avg       0.90      0.88      0.87         8
weighted avg       0.90      0.88      0.87         8

Position: WR
Accuracy: 0.6
Classification Report:
              precision    recall  f1-score   support

       False       0.57      0.63      0.60        19
        True       0.63      0.57      0.60        21

    accuracy                  