In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import altair as alt

In [2]:
# Load the pre-computed train/test split from disk.
X_train = pd.read_csv('../../data/train_test_split/x_train.csv')
X_test = pd.read_csv('../../data/train_test_split/x_test.csv')


def _read_target(csv_path):
    """Read a target CSV, drop its 'Unnamed: 0' column and return a flat 1-D array."""
    # 'Unnamed: 0' is presumably the index column written out when the split
    # was saved -- TODO confirm against the code that produced these files.
    return pd.read_csv(csv_path).drop(columns='Unnamed: 0').values.ravel()


y_train = _read_target('../../data/train_test_split/y_train.csv')
y_test = _read_target('../../data/train_test_split/y_test.csv')

In [3]:
# Columns fed to the model, grouped by how they are preprocessed.
# Numerical columns are passed to the scaler.
numerical_features = [
    'rating',
    'ranking',
    'height',
    'weight',
    'distance_miles',
    'stars',
    'wins_rolling_2year',
    'games_played_rolling_2year',
    'post_season_wins_rolling_2year',
    'point_diff_rolling_2year',
    'win_pct_rolling_2year',
]
# Categorical columns are passed to the one-hot encoder ('year' is treated as a category).
categorical_features = [
    'conference',
    'side_of_ball',
    'position_group',
    'position',
    'year',
]

In [4]:
# Preprocessing: standardize the numerical columns and one-hot encode the
# categorical ones. Categories not seen during fit are ignored at predict time
# (handle_unknown='ignore') instead of raising.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Model: logistic regression with class_weight='balanced'.
classifier = LogisticRegression(class_weight='balanced')

# Full pipeline: preprocessing followed by the classifier.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

In [5]:
# Fit the whole pipeline on the training split, then predict labels for the
# held-out split. Pipeline.fit returns the fitted pipeline, so the calls chain.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

In [38]:
# Fraction of test rows whose predicted label matches the true label,
# rounded to two decimals for display.
accuracy = round(accuracy_score(y_true=y_test, y_pred=y_pred), 2)
print(f"accuracy score: {accuracy}")

accuracy score: 0.7


In [36]:
# Weighted F1: per-class F1 scores averaged with weights proportional to
# each class's support in y_test, rounded to two decimals for display.
weighted_f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
f1 = round(float(weighted_f1), 2)
print(f"f1_score: {f1}")

f1_score: 0.76


In [8]:
# Confusion matrix on the test split: rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[3839, 1653],
       [ 165,  328]])

In [9]:
# Rebuild the column names in the order the ColumnTransformer lists its
# transformers: the numerical columns first, then the one-hot encoded columns.
#
# The fitted preprocessor is looked up through the pipeline rather than through
# the module-level `preprocessor` variable. That variable only carries fitted
# state while Pipeline fits its steps in place; if step caching (`memory=`) is
# enabled the pipeline fits clones, and the original object stays unfitted.
fitted_preprocessor = pipe.named_steps['preprocessor']
onehot_features = fitted_preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
processed_features = numerical_features + list(onehot_features)

In [10]:
# Pair every processed feature name with its fitted coefficient (first row of
# coef_) and order the table from the largest coefficient to the smallest.
classifier_weights = pipe.named_steps['classifier'].coef_[0]
coefficients = (
    pd.DataFrame({'feature': processed_features, 'value': classifier_weights})
    .sort_values(by='value', ascending=False)
)

In [11]:
# Show the five largest coefficients (the table is sorted in descending order).
coefficients.head(n=5)

Unnamed: 0,feature,value
0,rating,0.884891
1,ranking,0.427632
14,conference_Big Ten,0.370391
20,conference_SEC,0.28114
54,position_WDE,0.262626


In [12]:
# Horizontal bar chart of the fitted coefficients, one bar per processed feature.
# A chart title and readable axis titles are set so the figure can be read on
# its own when the notebook is skimmed.
chart = alt.Chart(coefficients).mark_bar().encode(
    # NOTE(review): 'value' is a quantitative field, so this field-based sort
    # probably has no visible effect on the x axis -- confirm before removing.
    x=alt.X(
        'value',
        sort=alt.EncodingSortField(field='value', order='descending'),
        title='Coefficient',
    ),
    # sort=None keeps the row order of `coefficients` on the y axis.
    y=alt.Y('feature', sort=None, title='Feature'),
).properties(
    title='Logistic regression coefficients by feature'
)
chart