In [164]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [165]:
df = pd.read_csv("archive/merged_data.csv")

In [166]:
df.head()

Unnamed: 0.1,Unnamed: 0,player_id,alt_player_id,player_name_x,pos_abbr_x,school_x,school_abbr_x,school_primary_color,school_alt_color,season,...,team,team_abbr,team_logo_espn,guid,weight,height,pos_rk,ovr_rk,grade,player_image
0,0,368,3924331,Justin Smith,DE,Missouri,MIZ,#000000,#000000,2016,...,,,,,,,,,,
1,1,4019,4239824,Bryan Thomas,DE,UAB,UAB,#003b28,#ffc845,2018,...,,,,,,,,,,
2,2,4542,4240031,Derrick Brooks,LB,Florida State,FSU,#782F40,#ceb888,2020,...,,,,,,,,,,
3,3,4559,4240091,Joe Johnson,DE,Louisville,LOU,#ad000a,#cccccc,2020,...,,,,,,,,,,
4,4,14420,3915189,Royce Smith,OG,Georgia,UGA,#CC0000,#000000,2017,...,,,,,,,,,,


In [167]:
columns_to_drop = [
    'player_id', 'player_name_x', 'alt_player_id', 'school_abbr_x', 
    'school_primary_color', 'school_alt_color', 'pos_abbr_y', 'school_y', 
    'school_abbr_y', 'pick', 'overall', 'team_logo_espn', 'player_image', 
    'player_name_y', 'traded', 'trade_note', 'link', 'team_abbr', 'guid'
]

# Create dummy variables for categorical features
df = pd.get_dummies(df, columns=['school_x', 'pos_abbr_x', 'school_name', 'position', 'team'])

# Create a binary column for drafted/not drafted
df['drafted'] = ~df['round'].isna().astype(int)

# Now define the target variable
y = df['drafted']

# Drop the unnecessary columns, including those you created dummies from if they're no longer needed
df = df.drop(columns_to_drop + ['round'], axis=1, errors='ignore')

# Fill NaN values with 0
df = df.fillna(0)

# Your features are the remaining columns
X = df.drop(['drafted'], axis=1)


In [168]:
X.head()

Unnamed: 0.1,Unnamed: 0,season,active,all_star,Assist Tackles,Completion Percentage,Completions,Extra Points Made,FGM 1-19 yards,FGM 20-29 yards,...,team_New York Jets,team_Oakland Raiders,team_Philadelphia Eagles,team_Pittsburgh Steelers,team_San Francisco 49ers,team_Seattle Seahawks,team_Tampa Bay Buccaneers,team_Tennessee Titans,team_Washington,team_Washington Redskins
0,0,2016,True,False,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1,1,2018,True,False,18.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
2,2,2020,True,False,4.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
3,3,2020,True,False,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,4,2017,True,False,52.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False


In [169]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [170]:
# basic SVM 
lin_model = SVC(kernel='linear', random_state=42)
lin_model.fit(X_train, y_train)

In [171]:
y_pred = lin_model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[186  24]
 [ 36 205]]
              precision    recall  f1-score   support

          -2       0.84      0.89      0.86       210
          -1       0.90      0.85      0.87       241

    accuracy                           0.87       451
   macro avg       0.87      0.87      0.87       451
weighted avg       0.87      0.87      0.87       451


In [172]:
poly_svm = SVC(kernel='poly', random_state=42)
poly_svm.fit(X_train, y_train)

In [173]:
y_pred = poly_svm.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[112  98]
 [  3 238]]
              precision    recall  f1-score   support

          -2       0.97      0.53      0.69       210
          -1       0.71      0.99      0.82       241

    accuracy                           0.78       451
   macro avg       0.84      0.76      0.76       451
weighted avg       0.83      0.78      0.76       451


In [176]:
rbf_svm = SVC(kernel="rbf", random_state=42)
rbf_svm.fit(X_train, y_train)

In [177]:
y_pred = rbf_svm.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[108 102]
 [  0 241]]
              precision    recall  f1-score   support

          -2       1.00      0.51      0.68       210
          -1       0.70      1.00      0.83       241

    accuracy                           0.77       451
   macro avg       0.85      0.76      0.75       451
weighted avg       0.84      0.77      0.76       451
