In [70]:
import pandas as pd
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
import matplotlib as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [71]:
df = pd.read_csv("archive/merged_data.csv")

In [72]:
df.head()

Unnamed: 0.1,Unnamed: 0,player_id,alt_player_id,player_name_x,pos_abbr_x,school_x,school_abbr_x,school_primary_color,school_alt_color,season,...,team,team_abbr,team_logo_espn,guid,weight,height,pos_rk,ovr_rk,grade,player_image
0,0,368,3924331,Justin Smith,DE,Missouri,MIZ,#000000,#000000,2016,...,,,,,,,,,,
1,1,4019,4239824,Bryan Thomas,DE,UAB,UAB,#003b28,#ffc845,2018,...,,,,,,,,,,
2,2,4542,4240031,Derrick Brooks,LB,Florida State,FSU,#782F40,#ceb888,2020,...,,,,,,,,,,
3,3,4559,4240091,Joe Johnson,DE,Louisville,LOU,#ad000a,#cccccc,2020,...,,,,,,,,,,
4,4,14420,3915189,Royce Smith,OG,Georgia,UGA,#CC0000,#000000,2017,...,,,,,,,,,,


In [73]:
columns_to_drop = [
    'Unnamed: 0', 'player_id', 'player_name_x', 'alt_player_id', 'school_abbr_x', 
    'school_primary_color', 'school_alt_color', 'pos_abbr_y', 'school_y', 
    'school_abbr_y', 'pick', 'overall', 'team_logo_espn', 'player_image', 
    'player_name_y', 'traded', 'trade_note', 'link', 'team_abbr', 'guid', 'draft_year'
]

df = df.drop(columns_to_drop, axis=1, errors='ignore')

df['round'] = df['round'].fillna(8)

# Now safely convert 'round' to integer
df['round'] = df['round'].astype(int)

# Create dummy variables for categorical features
df = pd.get_dummies(df, columns=['school_x', 'pos_abbr_x', 'school_name', 'position', 'team'])

# Now define the target variable
y = df['round']

# Fill NaN values in the entire dataframe with 0
df.fillna(0, inplace=True)

# Drop the 'round' column from the features now that it's no longer needed
df.drop(['round'], axis=1, inplace=True, errors='ignore')

# Your features are the remaining columns
X = df


In [74]:
y

0       8
1       8
2       8
3       8
4       8
       ..
1121    8
1122    8
1123    8
1124    8
1125    8
Name: round, Length: 1126, dtype: int64

In [75]:
X

Unnamed: 0,season,active,all_star,Assist Tackles,Completion Percentage,Completions,Extra Points Made,FGM 1-19 yards,FGM 20-29 yards,FGM 30-39 yards,...,team_New York Jets,team_Oakland Raiders,team_Philadelphia Eagles,team_Pittsburgh Steelers,team_San Francisco 49ers,team_Seattle Seahawks,team_Tampa Bay Buccaneers,team_Tennessee Titans,team_Washington,team_Washington Redskins
0,2016,True,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1,2018,True,False,18.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
2,2020,True,False,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
3,2020,True,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,2017,True,False,52.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1121,2020,True,False,6.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1122,2020,True,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1123,2020,True,False,0.0,0.0,0.0,12.0,0.0,1.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1124,2020,True,False,12.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False


In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [77]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [78]:
svr_model = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2, kernel='rbf'))
svr_model.fit(X_train, y_train)

In [81]:
y_pred = svr_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')

Mean Squared Error: 3.494039806693775
Mean Absolute Error: 1.4543350766692102


In [90]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],  # Example range
    'gamma': ['scale', 'auto', 0.01, 0.001],  # Example range
    'kernel': ['rbf', 'poly'],
    'degree': [2, 3, 4]  # Only relevant for 'poly' kernel
}

svr = SVR()

# Create the GridSearchCV object with a regression scoring metric
grid_search = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_squared_error')  # Using negative MSE

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# The best hyperparameters found and the best score
best_parameters = grid_search.best_params_
best_score = -grid_search.best_score_  # Note this is negative MSE

print(f"Best parameters: {best_parameters}")
print(f"Best cross-validated score (MSE): {best_score}")


Best parameters: {'C': 100, 'degree': 2, 'gamma': 0.001, 'kernel': 'rbf'}
Best cross-validated score (negative MSE): 1.7749444246585024


In [91]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Define the parameter grid, note how we need to prepend 'svr__' to parameter names
param_grid = {
    'svr__C': [0.1, 1, 10, 100],  # Example range
    'svr__gamma': ['scale', 'auto', 0.01, 0.001],  # Example range
    'svr__kernel': ['rbf', 'poly'],
    'svr__degree': [2, 3, 4]  # Only relevant for 'poly' kernel
}

# Create a pipeline with a scaler and an SVR
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

# Create the GridSearchCV object with a regression scoring metric
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# The best hyperparameters found and the best score
best_parameters = grid_search.best_params_
best_score = -grid_search.best_score_  # Convert negative MSE to positive for easier interpretation

print(f"Best parameters: {best_parameters}")
print(f"Best cross-validated score (MSE): {best_score}")


Best parameters: {'svr__C': 100, 'svr__degree': 2, 'svr__gamma': 0.001, 'svr__kernel': 'rbf'}
Best cross-validated score (MSE): 1.9020742884206026
