In [2]:
from sklearn.linear_model import LogisticRegressionCV, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Lasso Regression

Why Lasso? Its L1 penalty shrinks coefficients, including those of variables with high multicollinearity, which exist in our dataset. It also performs variable selection at the same time.

1. We use lasso + logistic
2. We use pure lasso

In [3]:
# Read the training and hold-out splits from disk
train, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')

# Split each frame into the churn label and the predictor columns
y_train, y_val = train['Customer_Churn'], test['Customer_Churn']
X_train = train.drop(['ID', 'Customer_Churn'], axis=1)
X_val = test.drop(['ID', 'Customer_Churn'], axis=1)

# Learn the scaling statistics from the training predictors only,
# then apply the same transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Logistic regression with a Lasso (L1) penalty; 5-fold cross-validation
# picks the regularization strength while fitting on the training split
lasso_model = LogisticRegressionCV(
    cv=5,
    penalty='l1',
    solver='liblinear',
    max_iter=100,
    random_state=42,
).fit(X_train_scaled, y_train)

# Score the hold-out split
y_pred = lasso_model.predict(X_val_scaled)
accuracy = accuracy_score(y_val, y_pred)

# Report per-class metrics, the confusion matrix and the overall accuracy
print(classification_report(y_val, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))
print(accuracy)

              precision    recall  f1-score   support

           0       0.59      0.87      0.71       714
           1       0.55      0.21      0.31       536

    accuracy                           0.59      1250
   macro avg       0.57      0.54      0.51      1250
weighted avg       0.57      0.59      0.53      1250

Confusion Matrix:
[[619  95]
 [422 114]]
0.5864


In [4]:
# Read the training and hold-out splits from disk
train, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')

# Split each frame into the churn label and the predictor columns
y_train, y_val = train['Customer_Churn'], test['Customer_Churn']
X_train = train.drop(['ID', 'Customer_Churn'], axis=1)
X_val = test.drop(['ID', 'Customer_Churn'], axis=1)

# Fit the scaler on the training predictors, then transform both splits with it
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Plain Lasso regression on the 0/1 target; 5-fold cross-validation
# selects the penalty strength (alpha)
lasso_cv = LassoCV(cv=5, random_state=42).fit(X_train_scaled, y_train)

# Positions and names of the features whose coefficient was not shrunk to zero
selected_features_indices = np.flatnonzero(lasso_cv.coef_)
selected_features_names = X_train.columns[selected_features_indices]

# Turn the continuous predictions into class labels with a 0.5 cut-off
y_pred = np.greater(lasso_cv.predict(X_val_scaled), 0.5)

# Share of hold-out rows classified correctly
accuracy = accuracy_score(y_val, y_pred)

selected_features_names, accuracy

(Index(['AI_Interaction_Level', 'Satisfaction_with_AI_Services',
        'AI_Response_Time', 'Overall_Usage_Frequency',
        'Change_in_Usage_Patterns'],
       dtype='object'),
 0.5848)

## Elastic Net (in between Lasso and Ridge regression)

In [5]:
from sklearn.linear_model import ElasticNetCV, ElasticNet

# Load the dataset

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Separate the features and the target variable

X_train = train.drop(columns=['ID', 'Customer_Churn'])
y_train = train['Customer_Churn']
X_val = test.drop(columns=['ID', 'Customer_Churn']) 
y_val = test['Customer_Churn']

# Standardize the data: the scaler is fitted on the training features only
# and the same transformation is then applied to the validation features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Cross-validate over the penalty strength (alpha) and over the listed
# l1_ratio candidates; l1_ratio=1 corresponds to a pure Lasso penalty
elastic_net_cv = ElasticNetCV(cv=5, random_state=42, l1_ratio=[.1, .5, .7, .9, .95, .99, 1])
elastic_net_cv.fit(X_train_scaled, y_train)

# The best alpha value found during cross-validation
best_alpha = elastic_net_cv.alpha_
# The best l1_ratio found during cross-validation
best_l1_ratio = elastic_net_cv.l1_ratio_

# Build an ElasticNet model with the selected alpha and l1_ratio
elastic_net_model = ElasticNet(alpha=best_alpha, l1_ratio=best_l1_ratio, random_state=42)

# Fit the model to the training data
elastic_net_model.fit(X_train_scaled, y_train)

# Predict on the validation set
y_pred_en = elastic_net_model.predict(X_val_scaled) > 0.5  # using 0.5 as the threshold for binary classification

# Calculate the accuracy of the model on the validation set
accuracy_en = accuracy_score(y_val, y_pred_en)

# Output the chosen alpha, l1_ratio, and accuracy of the model
# (only the last expression of a cell is displayed)
best_alpha, best_l1_ratio, accuracy_en


(0.004346232624995418, 1.0, 0.5848)

#### The best l1_ratio found by cross-validation is 1, so Elastic Net reduces to pure Lasso here and the result is the same as the Lasso model above