In [1]:
import numpy as np
import pandas as pd
import altair as alt
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)


root_directory = '../../../data/train_test_split/'

X_train = pd.read_csv(root_directory+'x_train_cluster.csv')
y_train = pd.read_csv(root_directory+'y_train.csv')['is_drafted']

X_test = pd.read_csv(root_directory+'x_test_cluster.csv')
y_test = pd.read_csv(root_directory+'y_test.csv')['is_drafted']

In [2]:
# Group conferences together to avoid too many features
conference_group_dict = {'Big Ten': 'power_5', 'SEC': 'power_5', 'Big 12': 'power_5',
                         'Pac-12': 'power_5', 'ACC': 'power_5',
                         
                         'FBS Independents' : 'independent',
                         
                         'Mid-American' : 'group_5', 'Mountrain West' : 'group_5', 
                         'Sun Belt': 'group_5', 'Conference USA': 'group_5', 
                         'American Athletic': 'group_5'}

X_test["conference_group"] = X_test["conference"].map(conference_group_dict)
X_train["conference_group"] = X_train["conference"].map(conference_group_dict)

In [3]:
drop_cols = ['name', 'hometown_city', 'state_province', 'committed_to', 'conference',
             'athlete_id', 'year', 'position', 'hometown_country',
             'Unnamed: 0.1', 'DBSCAN_Cluster_PCA']

X_train = X_train.drop(columns = drop_cols, axis = 1)
X_test = X_test.drop(columns = drop_cols, axis = 1)

In [4]:
cols = list(X_train.columns)

# Identify categorical columns manually
categorical_cols = ['stars', 'side_of_ball', 'conference_group',
                    'position_group', 'SVD_KMeans_Cluster', 'KMeans_Cluster']

# all non-categorical columns are numerical
numerical_cols = set(cols) - set(categorical_cols)
numerical_cols = list(numerical_cols)

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Define the preprocessing for categorical columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Define the preprocessing for numerical columns
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Combine preprocessing steps for categorical and numerical columns
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_cols),
    ('num', numerical_transformer, numerical_cols)
])

# Establish pipeline defintion
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the pipeline to the training data
pipeline.fit(X_train)

# Transform the training data
X_train_transformed = pipeline.transform(X_train)

In [6]:
# Parameter Tuning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score

clf = RandomForestClassifier()

# Define the parameter grid for RandomizedSearchCV
# Expanded parameter grid for RandomizedSearchCV
param_dist = {
    'n_estimators': [50, 100, 200, 300, 400, 500, 800, 1000],  # How many trees
    'max_depth': [None, 10, 20, 30, 40, 50, 60, 70, 80, 100],  # How many splits in the trees are allowed
    'min_samples_split': [2, 5, 10, 15, 20],  # More options for minimum samples split
    'min_samples_leaf': [1, 2, 4, 8, 10],  # How many samples are allowed to comprise a leaf
    'bootstrap': [True, False],  # Whether bootstrap samples are used
    'max_features': ['auto', 'sqrt', 'log2', None],  # Different ways to limit the number of features considered at each split
    'criterion': ['gini', 'entropy'],  # Different criteria for splitting nodes
}


# Randomized Search cross validation

random_search = RandomizedSearchCV(
    clf, 
    param_distributions=param_dist, 
    n_iter=10,  # Number of parameter settings sampled
    scoring="f1_micro",  # Optimize for f1 score (weight
    cv=3,  # 5-fold cross-validation
    verbose=2, 
    random_state=0, 
    n_jobs=-1  # Use all available cores
)

random_search.fit(X_train_transformed, y_train)


# # Print the best parameters and the best score
print("Best parameters found: ", random_search.best_params_)
print("Best F1 score: ", random_search.best_score_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END bootstrap=True, criterion=gini, max_depth=30, max_features=sqrt, min_samples_leaf=8, min_samples_split=5, n_estimators=400; total time=  15.5s
[CV] END bootstrap=True, criterion=gini, max_depth=30, max_features=sqrt, min_samples_leaf=8, min_samples_split=5, n_estimators=400; total time=  15.6s
[CV] END bootstrap=True, criterion=gini, max_depth=30, max_features=sqrt, min_samples_leaf=8, min_samples_split=5, n_estimators=400; total time=  15.6s
[CV] END bootstrap=False, criterion=gini, max_depth=40, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   2.5s
[CV] END bootstrap=False, criterion=gini, max_depth=40, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   2.5s
[CV] END bootstrap=False, criterion=gini, max_depth=40, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   2.7s
[CV] END bootstrap=True, cr

  warn(


[CV] END bootstrap=True, criterion=entropy, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=  23.3s


  warn(


[CV] END bootstrap=False, criterion=gini, max_depth=60, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=  13.8s


  warn(


[CV] END bootstrap=True, criterion=entropy, max_depth=30, max_features=sqrt, min_samples_leaf=10, min_samples_split=20, n_estimators=1000; total time=  40.4s
[CV] END bootstrap=True, criterion=entropy, max_depth=30, max_features=sqrt, min_samples_leaf=10, min_samples_split=20, n_estimators=1000; total time=  40.8s
[CV] END bootstrap=True, criterion=entropy, max_depth=30, max_features=sqrt, min_samples_leaf=10, min_samples_split=20, n_estimators=1000; total time=  41.0s
[CV] END bootstrap=True, criterion=entropy, max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=15, n_estimators=100; total time=   1.9s
[CV] END bootstrap=True, criterion=entropy, max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=15, n_estimators=100; total time=   1.9s
[CV] END bootstrap=True, criterion=entropy, max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=15, n_estimators=100; total time=   1.8s
[CV] END bootstrap=False, criterion=gini, max_depth=60, ma

In [7]:
from sklearn.ensemble import RandomForestClassifier


clf = RandomForestClassifier(
    n_estimators = 100,
    min_samples_split = 15,
    min_samples_leaf = 1,
    max_features = 'log2',
    max_depth = 10,
    criterion = 'entropy',
    bootstrap = True, 
    random_state = 0,
    class_weight = 'balanced' # Weighted inversely proportional to how frequently the target appears. 
)


clf.fit(X_train_transformed, y_train)

In [8]:
X_test_transformed = pipeline.transform(X_test)

In [9]:
from sklearn.metrics import accuracy_score

y_pred = clf.predict(X_test_transformed)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.8112


In [10]:
f1_score(y_test, y_pred, average = 'micro')

0.8111946532999165

In [11]:
y_pred.mean()

0.1796157059314954

In [12]:
print('f1 on test: ' + str(f1_score(y_test, y_pred, average = 'micro')))

y_pred = clf.predict(X_train_transformed)
print('f1 on train: ' + str(f1_score(y_train, y_pred, average = 'micro')))

f1 on test: 0.8111946532999165
f1 on train: 0.8430345903191667


In [13]:
y_test.mean()

0.08237259816207185