In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import altair as alt

In [2]:
df = pd.read_csv('..\data\imputed_dataset.csv')
df.head()

Unnamed: 0,rating,ranking,year,position,height,weight,latitude,longitude,state_province,stars,...,school,season,wins_rolling_2year,games_played_rolling_2year,post_season_wins_rolling_2year,point_diff_rolling_2year,win_pct_rolling_2year,distance_miles,side_of_ball,position_group
0,0.9644,71.0,2012,DT,76.0,290.0,35.334011,-81.865103,NC,4,...,Clemson,2012.0,16.0,27.0,0.0,128.0,0.846154,71.51179,defense,d_line
1,0.9632,79.0,2012,WR,71.0,175.0,35.2272,-80.843083,NC,4,...,Clemson,2012.0,16.0,27.0,0.0,128.0,0.846154,119.438575,offense,pass_catcher
2,0.9567,92.0,2012,S,73.0,195.0,30.438083,-84.280933,FL,4,...,Clemson,2012.0,16.0,27.0,0.0,128.0,0.846154,304.721993,defense,d_backfield
3,0.9264,187.0,2012,DUAL,75.0,205.0,42.886717,-78.878392,NY,4,...,Clemson,2012.0,16.0,27.0,0.0,128.0,0.846154,605.799029,offense,qb
4,0.9133,231.0,2012,ATH,73.0,180.0,35.842865,-90.703452,AR,4,...,Clemson,2012.0,16.0,27.0,0.0,128.0,0.846154,450.556207,athlete,athlete


In [3]:
# Define numerical and categorical values
numerical_features = ['rating', 'ranking', 'height', 'weight', 'distance_miles', 'stars', 'wins_rolling_2year', 'games_played_rolling_2year', 'post_season_wins_rolling_2year', 'point_diff_rolling_2year', 'win_pct_rolling_2year']
categorical_features = ['conference', 'side_of_ball', 'position_group', 'season']
target = ['is_drafted']

In [4]:
# Features and target
X = df[numerical_features+categorical_features]
y = df[target]

In [5]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(class_weight='balanced'))
])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19)

In [7]:
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

  y = column_or_1d(y, warn=True)


In [8]:
accuracy = accuracy_score(y_test, y_pred)

In [9]:
cm = confusion_matrix(y_test, y_pred)

In [10]:
accuracy

0.695615866388309

In [11]:
cm

array([[3071, 1314],
       [ 144,  261]])

In [12]:
def get_feature_names(preprocessor):
    "Get feature names after preprocessing"
    feature_names = []
    for _, transformer, columns in preprocessor.transformers_:
        if hasattr(transformer, 'get_feature_names_out'):
            feature_names.extend(transformer.get_feature_names_out(columns))
        else:
            feature_names.extend(columns)
    return feature_names

transformed_feature_names = get_feature_names(preprocessor)

In [13]:
coefficients = pd.DataFrame({'feature': transformed_feature_names,
                              'value': pipe.named_steps['classifier'].coef_[0]}).sort_values('value', ascending=False)

In [14]:
coefficients.head()

Unnamed: 0,feature,value
0,rating,0.829051
1,ranking,0.419658
14,conference_Big Ten,0.349436
20,conference_SEC,0.3155
39,season_2016.0,0.305498


In [15]:
chart = alt.Chart(coefficients).mark_bar().encode(
    x=alt.X('value', sort=alt.EncodingSortField(field='value', order='descending')),
    y=alt.Y('feature', sort=None)
)

In [16]:
chart