# Logistic Regression - Can it beat the baseline model?

In [None]:
# import data

import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Load the cleaned dataset; the path is relative to the notebook's working directory
df = pd.read_csv('data/cleaned_data.csv')
# Preview the first rows as a quick sanity check of the loaded frame
df.head()
# Note: standard scaling for the logistic regression is applied in a later section

In [None]:
# Predictor columns fed to every model in this notebook
feature_cols = [
    'slug',
    'launched_at_weekday',
    'launched_at_month',
    'duration_days',
    'goal_in_usd',
    'north_america',
]

# Feature matrix and target vector
X = df[feature_cols]
y = df['state']

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

## Baseline Model

In [None]:
# Baseline: logistic regression with scikit-learn's default settings,
# trained on the predictors exactly as they come out of the split (no scaling)
logistic_regression = LogisticRegression().fit(X_train, y_train)

# Class predictions for the held-out test rows
y_pred = logistic_regression.predict(X_test)

In [None]:
# Confusion matrix baseline
# Rows are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)

# fmt='d' prints the integer counts in full; seaborn's default ('.2g') rounds
# to two significant digits and switches to scientific notation for large counts.
ax = sns.heatmap(conf_matrix, annot=True, fmt='d')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title("Baseline Model", fontsize=20)

# Headline accuracy followed by per-class precision / recall / F1
print('Accuracy: ', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

## Data Scaling with Standardization

In [None]:
# Scaling with standard scaler

# Define columns that should be scaled.
# `duration_days` and `goal_in_usd` have to be part of this list: they are numeric
# predictors on their own scales, and leaving them raw would mean the "scaled"
# model still sees them unstandardized.
# NOTE(review): `north_america` is assumed to be a 0/1 dummy and is therefore
# passed through unscaled — confirm against the data cleaning step.
col_scale = ['slug', 'launched_at_weekday', 'launched_at_month', 'duration_days', 'goal_in_usd']

# Fit the scaler on the training data only, then reuse the learned mean/std
# for the test data so no test-set information leaks into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[col_scale])
X_test_scaled = scaler.transform(X_test[col_scale])

# Concatenating scaled and dummy columns
# (resulting column order: the scaled columns in `col_scale` order, then the
# remaining unscaled columns; identical for train and test)
X_train_preprocessed = np.concatenate([X_train_scaled, X_train.drop(col_scale, axis=1)], axis=1)
X_test_preprocessed = np.concatenate([X_test_scaled, X_test.drop(col_scale, axis=1)], axis=1)

In [None]:
# Same default-parameter logistic regression as the baseline,
# this time trained on the preprocessed (standardized) feature array
logistic_regression_scaled = LogisticRegression().fit(X_train_preprocessed, y_train)

# Class predictions for the preprocessed test rows
y_pred_scaled = logistic_regression_scaled.predict(X_test_preprocessed)

In [None]:
# Confusion matrix for scaled data
# Rows are the true classes, columns the predicted classes.
conf_matrix_scaled = confusion_matrix(y_test, y_pred_scaled)

# fmt='d' prints the integer counts in full; seaborn's default ('.2g') rounds
# to two significant digits and switches to scientific notation for large counts.
ax = sns.heatmap(conf_matrix_scaled, annot=True, fmt='d')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title("Scaled Data", fontsize=20)

# Headline accuracy followed by per-class precision / recall / F1
print('Accuracy: ', accuracy_score(y_test, y_pred_scaled))
print(classification_report(y_test, y_pred_scaled))

## Data scaled and Gridsearched for Maximum Wow

In [None]:
# What parameters does LogisticRegression have?
# Displays the parameter names of the fitted, scaled-data model.
logistic_regression_scaled.get_params().keys()

# NOTE(review): "precision" — presumably the metric the grid search should optimise; confirm

