In [None]:
import sys
import polars as pl
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from scipy import stats

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Function to apply Min-Max Scaling

def apply_scale(dataframe):
    """Min-max scale every column of ``dataframe``.

    A fresh ``MinMaxScaler`` is fitted on the data passed in and then used
    to transform that same data, so the scaling ranges are always derived
    from ``dataframe`` itself.

    Returns whatever ``MinMaxScaler.transform`` returns for the input.
    """
    return MinMaxScaler().fit_transform(dataframe)

In [None]:
# Function to select best models using Stratified K-Fold Cross-Validation

def select_best_models(data, model):
    """Run 5-fold stratified cross-validation of ``model`` on ``data``.

    Parameters
    ----------
    data : DataFrame containing the feature columns plus the target column
        ``'churn_value'``.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
        re-fitted on every fold.

    Returns
    -------
    (train_results, val_results) : two lists with one entry per fold. Each
        entry is a ``(true_labels, predicted_labels)`` pair, for the
        training part and the validation part of the fold respectively.
    """
    skf = StratifiedKFold(n_splits=5, random_state=99, shuffle=True)
    X = data.drop('churn_value', axis=1)
    y = data['churn_value'].copy()

    train_results, val_results = [], []
    # perform the cross-validation
    for train_index, val_index in skf.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Fit the scaler on the training fold ONLY and reuse it for the
        # validation fold. Fitting a second scaler on the validation data
        # would map the two folds onto different scales and leak validation
        # statistics into preprocessing. Validation values may therefore
        # fall slightly outside [0, 1]; that is expected.
        scaler = MinMaxScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_val = scaler.transform(X_val)

        # apply model
        model.fit(X_train, y_train)
        predictions_train = model.predict(X_train)
        predictions_val = model.predict(X_val)

        train_results.append((y_train, predictions_train))
        val_results.append((y_val, predictions_val))
    return train_results, val_results

def apply_score(train_results, val_results, score):
    """Summarise a metric over the folds of a cross-validation run.

    Parameters
    ----------
    train_results, val_results : iterables of ``(reference, prediction)``
        pairs, one pair per fold.
    score : callable invoked as ``score(reference, prediction)`` that
        returns a number.

    Returns
    -------
    (avg_train, std_train, avg_val, std_val) : means rounded to 3 decimals
        and standard deviations (``np.std``) rounded to 4 decimals.
    """
    def _per_fold(results):
        # One metric value per (reference, prediction) pair.
        return [score(reference, prediction) for reference, prediction in results]

    score_train = _per_fold(train_results)
    score_val = _per_fold(val_results)

    return (
        round(np.mean(score_train), 3),
        round(np.std(score_train), 4),
        round(np.mean(score_val), 3),
        round(np.std(score_val), 4),
    )

def show_results(df, data, score, *args):
    """Fill ``df`` with cross-validated scores, one row per model.

    Parameters
    ----------
    df : DataFrame written in place by position; row ``i`` receives the
        result for the ``i``-th model in ``args``.
        NOTE(review): each row is assigned two strings, so ``df`` presumably
        needs exactly two columns (train / validation) and at least
        ``len(args)`` rows - confirm against the callers.
    data : dataset forwarded to ``select_best_models``.
    score : metric callable forwarded to ``apply_score``.
    *args : model instances to evaluate, in row order.

    Returns
    -------
    The same ``df`` object, with cells formatted as ``'<mean>+/-<std>'``.
    """
    for row, estimator in enumerate(args):
        fold_train, fold_val = select_best_models(data, estimator)
        mean_tr, sd_tr, mean_va, sd_va = apply_score(fold_train, fold_val, score)

        train_cell = '+/-'.join((str(mean_tr), str(sd_tr)))
        val_cell = '+/-'.join((str(mean_va), str(sd_va)))
        # store the results in the row matching this model's position
        df.iloc[row] = train_cell, val_cell
    return df

| Feature                           | Decision Tree | RFE-8   | RFE-10  | RFE-15  | Chi-Squared |
| --------------------------------- | ------------- | ------- | ------- | ------- | ----------- |
| number_of_referrals               | **YES**       | **YES** | **YES** | **YES** | **YES**     |
| offer                             | **YES**       | NO      | NO      | NO      | **YES**     |
| phone_service                     | NO            | NO      | **YES** | **YES** | NO          |
| avg_monthly_long_distance_charges | **YES**       | NO      | NO      | NO      | NO          |
| multiple_lines                    | NO            | NO      | NO      | NO      | **YES**     |
| internet_service                  | NO            | NO      | NO      | NO      | **YES**     |
| internet_type                     | NO            | NO      | NO      | NO      | **YES**     |
| avg_monthly_gb_download           | **YES**       | NO      | NO      | NO      | **YES**     |
| online_security                   | NO            | NO      | NO      | **YES** | **YES**     |
| online_backup                     | NO            | NO      | NO      | **YES** | **YES**     |
| device_protection_plan            | NO            | NO      | NO      | NO      | **YES**     |
| premium_tech_support              | **YES**       | NO      | NO      | **YES** | **YES**     |
| streaming_tv                      | NO            | NO      | NO      | NO      | **YES**     |
| streaming_movies                  | NO            | NO      | NO      | NO      | **YES**     |
| streaming_music                   | **YES**       | NO      | NO      | NO      | **YES**     |
| unlimited_data                    | NO            | NO      | NO      | NO      | **YES**     |
| contract                          | **YES**       | **YES** | **YES** | **YES** | **YES**     |
| paperless_billing                 | NO            | NO      | NO      | NO      | **YES**     |
| payment_method_Bank Withdrawal    | NO            | NO      | NO      | NO      | **YES**     |
| payment_method_Credit Card        | **YES**       | NO      | NO      | **YES** | **YES**     |
| payment_method_Mailed Check       | NO            | **YES** | **YES** | **YES** | **YES**     |
| monthly_charge                    | **YES**       | **YES** | **YES** | **YES** | **YES**     |
| total_charges                     | **YES**       | **YES** | **YES** | **YES** | NO          |
| total_refunds                     | NO            | **YES** | **YES** | **YES** | **YES**     |
| total_extra_data_charges          | NO            | NO      | NO      | NO      | **YES**     |
| total_revenue                     | **YES**       | NO      | NO      | **YES** | **YES**     |
| gender                            | NO            | NO      | NO      | NO      | NO          |
| age                               | **YES**       | **YES** | **YES** | **YES** | **YES**     |
| number_of_dependents              | **YES**       | **YES** | **YES** | **YES** | **YES**     |
| cltv                              | **YES**       | NO      | NO      | NO      | NO          |
| population                        | **YES**       | NO      | **YES** | **YES** | **YES**     |


In [None]:
# Creating 3 perspectives for feature selection: one keeping the features deemed relevant by all selection methods, one keeping those chosen by at least 4 methods, and one keeping those chosen by at least 3.

def apply_feature_selection(df, perspective=1):
    """Return ``df`` restricted to the feature set of the given perspective.

    Parameters
    ----------
    df : DataFrame to subset. If it has a ``'churn_value'`` column, that
        column is kept and placed first.
    perspective : 1, 2 or 3, selecting one of the hard-coded feature lists
        below. Any other value selects no features, so only
        ``'churn_value'`` (when present) is returned.

    NOTE(review): the lists for perspectives 2 and 3 do not obviously match
    the "at least 4 / at least 3 methods" description when compared with the
    selection table in this notebook (e.g. 'offer' is in perspective 2) -
    confirm the intended criteria before relying on them.
    """
    features_by_perspective = {
        1: (
            "number_of_referrals", "contract", "monthly_charge", "age",
            "number_of_dependents",
        ),
        2: (
            "number_of_referrals", "offer", "online_security",
            "online_backup", "premium_tech_support", "contract",
            "payment_method_Mailed Check", "monthly_charge", "total_charges",
            "total_refunds", "total_revenue", "age", "number_of_dependents",
            "population",
        ),
        3: (
            "number_of_referrals", "offer", "phone_service",
            "avg_monthly_long_distance_charges", "avg_monthly_gb_download",
            "online_security", "online_backup", "premium_tech_support",
            "streaming_music", "contract", "payment_method_Credit Card",
            "payment_method_Mailed Check", "monthly_charge", "total_charges",
            "total_refunds", "total_revenue", "age", "number_of_dependents",
            "population",
        ),
    }

    # Keep the target first whenever the frame carries it.
    chosen = ['churn_value'] if 'churn_value' in df.columns else []
    chosen.extend(features_by_perspective.get(perspective, ()))
    return df[chosen]