In [1]:
import numpy as np
import pandas as pd
import altair as alt
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)


root_directory = '../../../data/train_test_split/'

X_train = pd.read_csv(root_directory+'x_train_cluster.csv')
y_train = pd.read_csv(root_directory+'y_train.csv')['is_drafted']

X_test = pd.read_csv(root_directory+'x_test_cluster.csv')
y_test = pd.read_csv(root_directory+'y_test.csv')['is_drafted']

In [2]:
X_test['from_USA_flag'] = np.where(X_test['hometown_country'] == 'USA', 1, 0)
X_train['from_USA_flag'] = np.where(X_train['hometown_country'] == 'USA', 1, 0)

In [3]:
# Group conferences together to avoid too many features
conference_group_dict = {'Big Ten': 'power_5', 'SEC': 'power_5', 'Big 12': 'power_5',
                         'Pac-12': 'power_5', 'ACC': 'power_5',
                         
                         'FBS Independents' : 'independent',
                         
                         'Mid-American' : 'group_5', 'Mountrain West' : 'group_5', 
                         'Sun Belt': 'group_5', 'Conference USA': 'group_5', 
                         'American Athletic': 'group_5'}

X_test["conference_group"] = X_test["conference"].map(conference_group_dict)
X_train["conference_group"] = X_train["conference"].map(conference_group_dict)

In [4]:
drop_cols = ['name', 'hometown_city', 'state_province', 'committed_to', 'conference',
             'athlete_id', 'hometown_country', 'year', 'position', 'Unnamed: 0.1',
            'DBSCAN_Cluster_PCA']

X_train = X_train.drop(columns = drop_cols, axis = 1)
X_test = X_test.drop(columns = drop_cols, axis = 1)

In [5]:
cols = list(X_train.columns)

# Identify categorical columns manually
categorical_cols = ['stars', 'side_of_ball', 'conference_group',
                    'position_group', 'SVD_KMeans_Cluster', 'KMeans_Cluster', 'from_USA_flag']

# all non-categorical columns are numerical
numerical_cols = set(cols) - set(categorical_cols)
numerical_cols = list(numerical_cols)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Define the preprocessing for categorical columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Define the preprocessing for numerical columns
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Combine preprocessing steps for categorical and numerical columns
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_cols),
    ('num', numerical_transformer, numerical_cols)
])

# Establish pipeline defintion
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the pipeline to the training data
pipeline.fit(X_train)

# Transform the training data
X_train_transformed = pipeline.transform(X_train)

In [7]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state = 0)
clf.fit(X_train_transformed, y_train)

In [8]:
X_test_transformed = pipeline.transform(X_test)

In [9]:
from sklearn.metrics import accuracy_score, f1_score


y_pred = clf.predict(X_test_transformed)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.9168


In [10]:
y_pred.mean()

0.014870509607351713

In [11]:
print('f1 on test: ' + str(f1_score(y_test, y_pred, average = 'micro')))

y_pred = clf.predict(X_train_transformed)
print('f1 on train: ' + str(f1_score(y_train, y_pred, average = 'micro')))

f1 on test: 0.9167919799498747
f1 on train: 0.9995543920236172


In [12]:
# from sklearn.inspection import permutation_importance

# # X_test_transformed_arr = X_test_transformed.toarray()

# result = permutation_importance(
#     clf, X_test_transformed, y_test, n_repeats=15, random_state=42, n_jobs=-1
# )

# imp = result.importances_mean
features = pipeline.get_feature_names_out()

a = zip(features, clf[-1].feature_importances_)
imp_df = pd.DataFrame(a, columns = ['feature', 'mean_loss_in_acc'])

In [13]:
clf

In [14]:
imp_df.head(10)

Unnamed: 0,feature,mean_loss_in_acc
0,cat__stars_1,0.000354
1,cat__stars_2,0.001058
2,cat__stars_3,0.004717
3,cat__stars_4,0.001
4,cat__stars_5,0.025659
5,cat__side_of_ball_athlete,0.003397
6,cat__side_of_ball_defense,0.00499
7,cat__side_of_ball_offense,0.007451
8,cat__side_of_ball_special,0.000784
9,cat__conference_group_group_5,0.004376


In [16]:
alt.Chart(imp_df).mark_bar().encode(
    alt.Y('feature', sort='x'), x = 'mean_loss_in_acc').properties(
    title = 'Mean Decrease in Impurity (MDI) by Feature')

In [17]:
# Notes on features importance:
# 1) Stars 1 and 2 don't matter. Create stars 1-3, 4, and 5. 
# 2) USA Flag is irrelevant - should drop country of origin.