# Logistic Regression

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score

In [2]:
data = pd.read_csv("../pca_result.csv")

In [3]:
X = data.drop(["Popularity"], axis=1)
y = data["Popularity"]

In [4]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

minmax = MinMaxScaler()
X_minmax = minmax.fit_transform(X)

# Using the simplest model

## Using the X as it is

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [7]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[42 39]
 [45 39]]


In [8]:
f1_score_value = metrics.f1_score(y_test, y_pred)
print('F1 score: {:.2f}'.format(f1_score_value))

F1 score: 0.48


In [9]:
# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 0.49
Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.52      0.50        81
           1       0.50      0.46      0.48        84

    accuracy                           0.49       165
   macro avg       0.49      0.49      0.49       165
weighted avg       0.49      0.49      0.49       165



## Using scaled data

### Standard scaling

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [11]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [12]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[42 39]
 [44 40]]


In [13]:
f1_score_value = metrics.f1_score(y_test, y_pred)
print('F1 score: {:.2f}'.format(f1_score_value))

F1 score: 0.49


In [14]:
# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 0.50
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.52      0.50        81
           1       0.51      0.48      0.49        84

    accuracy                           0.50       165
   macro avg       0.50      0.50      0.50       165
weighted avg       0.50      0.50      0.50       165



### MinMax scaling

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_minmax, y, test_size=0.2, random_state=42)

In [16]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [17]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[42 39]
 [48 36]]


In [18]:
f1_score_value = metrics.f1_score(y_test, y_pred)
print('F1 score: {:.2f}'.format(f1_score_value))

F1 score: 0.45


In [19]:
# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 0.47
Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.52      0.49        81
           1       0.48      0.43      0.45        84

    accuracy                           0.47       165
   macro avg       0.47      0.47      0.47       165
weighted avg       0.47      0.47      0.47       165



# Using Bagging

## Using the X as it is

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
base_model = LogisticRegression(max_iter=1000)

bagging_model = BaggingClassifier(estimator=base_model, n_estimators=10, random_state=42)

bagging_model.fit(X_train, y_train)

y_pred = bagging_model.predict(X_test)

In [23]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[42 39]
 [44 40]]


In [24]:
f1_score_value = metrics.f1_score(y_test, y_pred)
print('F1 score: {:.2f}'.format(f1_score_value))

F1 score: 0.49


In [25]:
# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 0.50
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.52      0.50        81
           1       0.51      0.48      0.49        84

    accuracy                           0.50       165
   macro avg       0.50      0.50      0.50       165
weighted avg       0.50      0.50      0.50       165



## Using scaled data

### Standard scaling

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [28]:
base_model = LogisticRegression(max_iter=1000)

bagging_model = BaggingClassifier(estimator=base_model, n_estimators=10, random_state=42)

bagging_model.fit(X_train, y_train)

y_pred = bagging_model.predict(X_test)

In [29]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[42 39]
 [44 40]]


In [30]:
f1_score_value = metrics.f1_score(y_test, y_pred)
print('F1 score: {:.2f}'.format(f1_score_value))

F1 score: 0.49


In [31]:
# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 0.50
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.52      0.50        81
           1       0.51      0.48      0.49        84

    accuracy                           0.50       165
   macro avg       0.50      0.50      0.50       165
weighted avg       0.50      0.50      0.50       165



### MinMax scaling

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X_minmax, y, test_size=0.2, random_state=42)

In [33]:
base_model = LogisticRegression(max_iter=1000)

bagging_model = BaggingClassifier(estimator=base_model, n_estimators=10, random_state=42)

bagging_model.fit(X_train, y_train)

y_pred = bagging_model.predict(X_test)

In [34]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[40 41]
 [45 39]]


In [35]:
f1_score_value = metrics.f1_score(y_test, y_pred)
print('F1 score: {:.2f}'.format(f1_score_value))

F1 score: 0.48


In [36]:
# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 0.48
Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.49      0.48        81
           1       0.49      0.46      0.48        84

    accuracy                           0.48       165
   macro avg       0.48      0.48      0.48       165
weighted avg       0.48      0.48      0.48       165



As we see, different approaches as Scalling the data and using Bagging in modeling  .. Desn't seem to make much different.
So, to keep it Simple .. we may Scale the data by Standard scaling , and use the simplist form of the model.

And the best Accuracy we've got is 50%
with f1 score = 0.50