# Logistic Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import recall_score, f1_score, accuracy_score, roc_auc_score, confusion_matrix

In [None]:
# Load the race-level classification dataset into a DataFrame.
races_df = pd.read_csv(
    filepath_or_buffer='../dataset/races_classification.csv',
)

In [None]:
# NOTE: despite its name, `columns_to_drop` is not a list of column names:
# it holds the DataFrame returned by `drop`, i.e. a copy of `races_df`
# without the columns listed below (`races_df` itself is not modified).
columns_to_drop = races_df.drop(
    labels=[
        '_url',
        'name',
        'length',
        'climb_total',
        'profile',
        'startlist_quality',
        'position',
        'cyclist',
        'cyclist_team',
        'start_date',
        'duration',
        'cyclist_number',
        'cyclist_experience_profile',
        'cyclist_experience_length',
        'cyclist_experience_climb',
        'cyclist_experience',
        'avg_rel_position_length',
        'avg_rel_position_climb',
        'relative_position_sum',
        'position_entropy',
    ],
    axis='columns',
)

In [None]:
# Temporal hold-out: rows whose start_date is 2020-01-01 or later form the
# test set, every other row is used for training.
# NOTE(review): the comparison uses whatever dtype 'start_date' has; if the
# column holds plain strings it is lexicographic and therefore only correct
# for ISO-formatted ('YYYY-MM-DD...') values -- confirm.
is_test = races_df['start_date'] >= '2020-01-01'

# Target: the 'top_20' column of the full frame.
train_labels = races_df.loc[~is_test, 'top_20']
test_labels = races_df.loc[is_test, 'top_20']

# Features: start from the reduced frame built earlier (`columns_to_drop` is,
# despite its name, `races_df` with the unwanted columns already removed) and
# remove the target as well. Building the features straight from `races_df`
# would keep 'top_20' (when it is numeric) and columns such as 'position' in
# the design matrix, which leaks the label into the model.
feature_frame = (
    columns_to_drop
    .drop(columns=['top_20'])
    .select_dtypes(include=[np.number])  # the model only accepts numeric inputs
)

# Column drops keep the row index, so the boolean mask computed on `races_df`
# still lines up with `feature_frame`.
test_data = feature_frame.loc[is_test]
train_data = feature_frame.loc[~is_test]

In [None]:

# Hyper-parameter search space for LogisticRegression.
#
# The space is given as a list of sub-spaces (accepted by RandomizedSearchCV /
# ParameterSampler) because not every solver supports every penalty:
#   - newton-cg, lbfgs, sag, newton-cholesky: 'l2' or None
#   - liblinear:                              'l1' or 'l2'
#   - saga:                                   'l1', 'l2' or None
# A single flat grid would also sample unsupported pairs (e.g. lbfgs + 'l1'),
# whose fits fail and waste part of the search budget.
_tol_grid = [0.001, 0.0001, 1e-05]
_c_grid = [1, 10, 100]

parameters = [
    {
        'solver': ['newton-cg', 'lbfgs', 'sag', 'newton-cholesky'],
        'penalty': ['l2', None],
        'tol': _tol_grid,
        'C': _c_grid,
    },
    {
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
        'tol': _tol_grid,
        'C': _c_grid,
    },
    {
        'solver': ['saga'],
        'penalty': ['l1', 'l2', None],
        'tol': _tol_grid,
        'C': _c_grid,
    },
]

In [None]:
# Base estimator. max_iter is raised far above the library default so the
# slower solvers in the search space have room to converge.
log_reg = LogisticRegression(max_iter=10000)

# Randomized hyper-parameter search: 50 sampled candidates, each evaluated
# with 5-fold cross-validation (an integer `cv` with a classifier yields
# stratified folds).
#
# Candidates are ranked by F1, which must be requested through `scoring`.
# `refit` is not the place for a metric function: a callable passed there is
# invoked with `cv_results_` and has to return the index of the best
# candidate, so `refit=f1_score` fails once all the fits have finished.
# `refit=True` retrains the best candidate on the whole training set.
random_search = RandomizedSearchCV(
    log_reg,
    parameters,
    n_iter=50,
    scoring='f1',
    refit=True,
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(train_data, train_labels)

In [None]:
# Hyper-parameter combination of the best candidate found by the search.
random_search.best_params_