In [2]:
from feast import FeatureStore
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from joblib import dump
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Getting our FeatureStore
store = FeatureStore(repo_path=".")

In [4]:
# Retrieving the saved dataset and converting it to a DataFrame
training_df = store.get_saved_dataset(name="churn_dataset").to_df()



In [5]:
training_df

Unnamed: 0,Gender,HasCrCard,IsActiveMember,event_timestamp,CreditScore,NumOfProducts,Geography_Germany,Geography_Spain,Tenure,Balance,EstimatedSalary,Exited,Geography_France,USER_ID,Age
0,1,1,0,2023-01-01 00:00:00+00:00,631,2,False,True,7,0.00,181605.85,0,False,7670,38
1,0,0,1,2023-01-01 00:00:00+00:00,616,1,True,False,7,95984.21,115262.54,1,False,9729,43
2,1,1,1,2023-01-01 00:00:00+00:00,642,2,False,False,1,160541.00,142223.94,0,True,5378,72
3,1,1,0,2023-01-01 00:00:00+00:00,634,1,True,False,5,108891.70,10078.02,0,False,7693,74
4,0,1,0,2023-01-01 00:00:00+00:00,617,3,True,False,5,83348.89,7953.62,1,False,6496,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,1,2023-12-31 00:00:00+00:00,707,2,False,False,1,0.00,54409.48,0,True,2871,55
9996,0,1,0,2023-12-31 00:00:00+00:00,850,1,True,False,5,114491.82,99689.48,0,False,4667,39
9997,0,1,1,2023-12-31 00:00:00+00:00,732,2,True,False,9,94867.18,157527.60,1,False,2782,61
9998,1,0,1,2023-12-31 00:00:00+00:00,581,2,False,False,4,0.00,86383.82,0,True,7909,35


In [6]:
# Separating the features and labels
target = training_df['Exited']
features = training_df.drop(
    labels=['Exited', 'event_timestamp', 'USER_ID'],
    axis=1)

In [7]:
# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features,
                                                    target,
                                                    stratify=target)

In [8]:
X_train.columns

Index(['Gender', 'HasCrCard', 'IsActiveMember', 'CreditScore', 'NumOfProducts',
       'Geography_Germany', 'Geography_Spain', 'Tenure', 'Balance',
       'EstimatedSalary', 'Geography_France', 'Age'],
      dtype='object')

In [25]:
X_train = X_train[sorted(X_train.columns)]
X_test = X_test[sorted(X_train.columns)]

### Train - Logistic Regression Classifier

In [26]:
# Define the hyperparameter grid
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 200, 300]
}

In [27]:
# Initialize the GridSearchCV object
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the grid search to your data
grid_search.fit(X=X_train, y=y_train)

  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(
  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(
  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(
  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(
  alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
  ret = line_search_wolfe2(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://s

In [28]:
# Get the best parameters and the best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print("Best Parameters: ", best_params)
print("Best Estimator: ", best_estimator)

Best Parameters:  {'C': 10, 'max_iter': 300, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Estimator:  LogisticRegression(C=10, max_iter=300)


In [29]:
# Make predictions and evaluate your model
y_pred_test = best_estimator.predict(X_test)
y_pred_train = best_estimator.predict(X_train)

In [30]:
accuracy = accuracy_score(y_test, y_pred_test)
print("Test Accuracy: ", accuracy)
print("Test Set Performance:")
print(classification_report(y_test, y_pred_test))

Test Accuracy:  0.8128
Test Set Performance:
              precision    recall  f1-score   support

           0       0.83      0.96      0.89      1991
           1       0.61      0.23      0.33       509

    accuracy                           0.81      2500
   macro avg       0.72      0.59      0.61      2500
weighted avg       0.78      0.81      0.78      2500



In [31]:
print("Train Set Performance:")
print(classification_report(y_train, y_pred_train))

Train Set Performance:
              precision    recall  f1-score   support

           0       0.83      0.97      0.89      5972
           1       0.62      0.22      0.32      1528

    accuracy                           0.81      7500
   macro avg       0.73      0.59      0.61      7500
weighted avg       0.79      0.81      0.78      7500



In [32]:
# Saving the model
dump(value=grid_search, filename=r"models/logistic_regression.joblib")

['models/logistic_regression.joblib']