In [131]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import altair as alt

In [132]:
df = pd.read_csv('..\data\imputed_dataset.csv')
df.head()

Unnamed: 0,rating,ranking,year,position,height,weight,latitude,longitude,state_province,stars,committed_to,is_drafted,conference,latitude_school,longitude_school,distance_miles,side_of_ball,position_group
0,0.9992,1.0,2015,DT,74.0,313.0,31.578206,-84.155681,GA,5,Georgia,0.0,SEC,33.94982,-83.373381,170.049603,defense,d_line
1,0.9991,1.0,2015,DT,74.5,313.0,31.578206,-84.155681,GA,5,Georgia,0.0,SEC,33.94982,-83.373381,170.049603,defense,d_line
2,0.9879,27.0,2015,ATH,72.0,168.0,33.173177,-84.914936,GA,5,Georgia,1.0,SEC,33.94982,-83.373381,103.714733,athlete,athlete
3,0.9769,48.0,2015,OLB,74.0,207.0,32.305158,-84.027407,GA,4,Georgia,1.0,SEC,33.94982,-83.373381,119.771399,defense,linebacker
4,0.968,65.0,2015,SDE,76.0,265.0,33.85327,-84.220073,GA,4,Georgia,0.0,SEC,33.94982,-83.373381,49.011958,defense,d_line


In [133]:
# Define numerical and categorical values
numerical_features = ['rating', 'ranking', 'height', 'weight', 'distance_miles', 'stars']
categorical_features = ['side_of_ball', 'position_group']
target = ['is_drafted']

In [134]:
# Features and target
X = df[numerical_features+categorical_features]
y = df[target]

In [135]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(class_weight='balanced'))
])

In [136]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19)

In [137]:
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

  y = column_or_1d(y, warn=True)


In [138]:
accuracy = accuracy_score(y_test, y_pred)

In [139]:
cm = confusion_matrix(y_test, y_pred)

In [140]:
accuracy

0.6837831906956331

In [141]:
cm

array([[2962, 1348],
       [  93,  154]])

In [142]:
def get_feature_names(preprocessor):
    "Get feature names after preprocessing"
    feature_names = []
    for _, transformer, columns in preprocessor.transformers_:
        if hasattr(transformer, 'get_feature_names_out'):
            feature_names.extend(transformer.get_feature_names_out(columns))
        else:
            feature_names.extend(columns)
    return feature_names

transformed_feature_names = get_feature_names(preprocessor)

In [175]:
coefficients = pd.DataFrame({'feature': transformed_feature_names,
                              'value': pipe.named_steps['classifier'].coef_[0]}).sort_values('value', ascending=False)

In [176]:
coefficients.head()

Unnamed: 0,feature,value
0,rating,0.533164
18,position_group_running_back,0.449425
12,position_group_d_backfield,0.279032
9,side_of_ball_special,0.145899
19,position_group_special,0.145899


In [181]:
chart = alt.Chart(coefficients).mark_bar().encode(
    x=alt.X('value', sort=alt.EncodingSortField(field='value', order='descending')),
    y=alt.Y('feature', sort=None)
)

In [182]:
chart