In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import seaborn
from gower import gower_matrix
import numpy as np
import os

# Opt in to the future pandas behaviour in which `replace` (used by the
# encoding helpers below) no longer silently downcasts object columns.
pd.set_option('future.no_silent_downcasting', True)


def encodeYesNo(df, cols: list[str]):
    """Replace the strings "Yes"/"No" with 1.0/0.0 in the given columns.

    The frame is modified in place: the selected columns are overwritten with
    the encoded values. Any value other than "Yes" or "No" is left untouched.

    Args:
        df: DataFrame whose columns are encoded.
        cols: Labels of the columns to encode.

    Returns:
        The same ``df`` object that was passed in, so the helper can be used
        in the ``df = helper(df, ...)`` style of the other encoders in this
        file. Callers that ignore the return value keep working.
    """
    encoded = df[cols].replace(to_replace="Yes", value=1.0)
    encoded = encoded.replace(to_replace="No", value=0.0)
    df[cols] = encoded
    return df


def encodeOneHot(df, col: str, prefix: str = None):
    """Return a copy of ``df`` with ``col`` replaced by float indicator columns.

    One new column is appended per distinct value of ``df[col]`` (named by
    ``pandas.get_dummies``, optionally with ``prefix``), and the original
    column is then dropped. The input frame itself is not modified.

    Args:
        df: Source DataFrame.
        col: Label of the categorical column to expand.
        prefix: Optional prefix passed through to ``pandas.get_dummies``.

    Returns:
        A new DataFrame with the indicator columns appended on the right.
    """
    indicator_cols = pd.get_dummies(df[col], prefix=prefix, dtype=float)
    widened = pd.concat([df, indicator_cols], axis=1)
    # Drop only after concatenating, exactly as the label-based drop requires.
    return widened.drop(col, axis=1)


def groupByQuartile(df, col: str):
    """Overwrite ``df[col]`` with its quartile bucket index (0 to 3), in place.

    The 25%, 50% and 75% quantiles of the column are computed once. Each value
    is then mapped to 0 if it is below the first quantile, 1 if below the
    second, 2 if below the third, and 3 otherwise.

    Args:
        df: DataFrame to modify.
        col: Label of the numeric column to bucket.

    Returns:
        The same ``df`` object, with ``col`` replaced by the bucket indices.
    """
    lower, median, upper = (df[col].quantile(q) for q in (0.25, 0.5, 0.75))

    def bucket(value):
        # Strict "<" comparisons: a value equal to a cut point goes to the
        # higher bucket.
        if value < lower:
            return 0
        if value < median:
            return 1
        if value < upper:
            return 2
        return 3

    df[col] = df[col].apply(bucket)
    return df

def contractToYears(contract):
    """Map a contract label to its length in years.

    Returns 1 for "One year", 2 for "Two year" and 0 for every other value.
    """
    if contract == "One year":
        return 1
    if contract == "Two year":
        return 2
    return 0

class BinaryPredictor:
    """Train and report several binary classifiers on one DataFrame.

    The constructor splits ``df`` once into train and test partitions. Each
    ``fit_*`` method trains one model and stores it on the instance; the
    matching ``print_*_prediction`` method prints a classification report for
    the held-out partition. A ``fit_*`` method must be called before its
    ``print_*`` counterpart, since the latter reads the stored model.
    """

    def __init__(self, df: pd.DataFrame, target: str, random_state=0, test_size=0.2):
        """Split ``df`` into train/test features and integer labels.

        Args:
            df: Full dataset, including the target column.
            target: Label of the column to predict; it is cast to ``int``.
            random_state: Seed forwarded to ``train_test_split``.
            test_size: Test fraction forwarded to ``train_test_split``.
        """
        self.data = df
        self.target = target
        self.random_state = random_state
        self.test_size = test_size
        df_train, df_test = train_test_split(df, random_state=random_state, test_size=test_size)
        self.y_train = df_train[target].astype(int)
        self.X_train = df_train.drop(target, axis=1)
        self.y_test = df_test[target].astype(int)
        self.X_test = df_test.drop(target, axis=1)

    def _gower_distances(self, X, cache_path):
        """Return the pairwise Gower distance matrix of ``X``, using a CSV cache.

        The cached file is reused only when it parses and its shape is
        ``(len(X), len(X))``; otherwise the matrix is recomputed and the cache
        is rewritten. NOTE: a cache of matching size that was computed from
        different features cannot be detected here -- delete the file when the
        feature set changes.
        """
        n_samples = len(X)
        if os.path.isfile(cache_path):
            try:
                cached = np.loadtxt(cache_path, delimiter=',', ndmin=2).astype(float)
            except ValueError:
                cached = None  # unreadable cache: fall through and recompute
            if cached is not None and cached.shape == (n_samples, n_samples):
                return cached
        gower_m = gower_matrix(X)
        np.savetxt(cache_path, gower_m, delimiter=',')
        return np.asarray(gower_m, dtype=float)

    def fit_knn_gower(self, k=5, cache_path='gower.csv'):
        """Fit a k-nearest-neighbours classifier on precomputed Gower distances.

        The sample indices are split with the same ``random_state`` and
        ``test_size`` as the constructor. The classifier is fitted with
        ``metric='precomputed'`` on the train-versus-train distance block, so
        neighbours are ranked by Gower distance itself and no distances to
        test samples enter the training input.

        Sets ``X_train_g`` (n_train x n_train distances), ``X_test_g``
        (n_test x n_train distances), ``y_train_g``, ``y_test_g`` and
        ``knn_g``.

        Args:
            k: Number of neighbours.
            cache_path: CSV file used to cache the full distance matrix.
        """
        y = self.data[self.target].astype(int).to_numpy()
        X = self.data.drop(self.target, axis=1)
        gower_m = self._gower_distances(X, cache_path)

        # Split positions rather than matrix rows so that both the rows and
        # the columns of the distance matrix can be restricted consistently.
        train_idx, test_idx = train_test_split(
            np.arange(len(y)), random_state=self.random_state, test_size=self.test_size)
        self.X_train_g = gower_m[np.ix_(train_idx, train_idx)]
        self.X_test_g = gower_m[np.ix_(test_idx, train_idx)]
        self.y_train_g = y[train_idx]
        self.y_test_g = y[test_idx]
        self.knn_g = KNeighborsClassifier(n_neighbors=k, metric='precomputed').fit(
            self.X_train_g, self.y_train_g)

    @staticmethod
    def _print_report(title, y_true, y_pred):
        """Print ``title`` followed by the classification report."""
        print(title)
        print(classification_report(y_true, y_pred))

    def print_knn_gower_prediction(self):
        """Print the test-set report of the Gower-distance KNN model."""
        y_pred = self.knn_g.predict(self.X_test_g)
        self._print_report("\nK nearest neighbors model with Gower distance:\n",
                           self.y_test_g.astype(int), y_pred.astype(int))

    def fit_logistic_regression(self, solver='lbfgs'):
        """Fit a logistic regression on the training partition (stored as ``lrm``)."""
        self.lrm = LogisticRegression(solver=solver).fit(self.X_train, self.y_train)

    def fit_knn(self, k=5):
        """Fit a k-nearest-neighbours classifier on the training partition (stored as ``knn``)."""
        self.knn = KNeighborsClassifier(n_neighbors=k).fit(self.X_train, self.y_train)

    def fit_random_forest(self, max_depth=2, random_state=0):
        """Fit a random forest on the training partition (stored as ``rfc``)."""
        self.rfc = RandomForestClassifier(max_depth=max_depth, random_state=random_state).fit(self.X_train, self.y_train)

    def print_logistic_regression_prediction(self):
        """Print the test-set report of the logistic regression model."""
        y_pred = self.lrm.predict(self.X_test)
        self._print_report("\nBinomial logistic regression model:\n", self.y_test, y_pred)

    def print_knn_prediction(self):
        """Print the test-set report of the plain KNN model."""
        y_pred = self.knn.predict(self.X_test)
        self._print_report("\nK Nearest Neighbors model:\n", self.y_test, y_pred)

    def print_random_forest_prediction(self):
        """Print the test-set report of the random forest model."""
        y_pred = self.rfc.predict(self.X_test)
        self._print_report("\nRandom forest model:\n", self.y_test, y_pred)

#    def find_knn_neighbors(self, min=10, max=30):
#        score = 0
#        new_score = 0.001
#        k = min
#        while new_score > score:


def main(data_path: str = '/home/arren/Downloads/WA_Fn-UseC_-Telco-Customer-Churn.xlsx'):
    """Prepare the Telco customer-churn data and report four churn classifiers.

    Loads the workbook, drops incomplete rows, encodes and prunes the
    features, prints each remaining column's correlation with ``Churn``, then
    fits and reports logistic regression, KNN, random forest and
    Gower-distance KNN models through ``BinaryPredictor``.

    Args:
        data_path: Excel workbook to load. Defaults to the path the notebook
            was originally run against, so ``main()`` behaves as before.
    """
    df = pd.read_excel(data_path)
    df = df.dropna(axis=0)

    #df['HighMonthlyLowTenure'] = df['MonthlyCharges']/df['tenure']
    #df = groupByQuartile(df, 'HighMonthlyLowTenure')
    encodeYesNo(df, ['Churn'])
    # Coarsen tenure by floor-dividing by 12 (presumably months -> whole years).
    df['tenure'] = df['tenure'] // 12
    #df['StabilityFactor'] = df['tenure'] + df['Contract'].apply(contractToYears)
    #df = df.drop('tenure', axis=1) # we're including tenure in StabilityFactor aggregate features
    #df = df.drop('Contract', axis=1) # ""

    df = groupByQuartile(df, 'MonthlyCharges')

    # customerID: not relevant for the model.
    # TotalCharges: just monthly charge * tenure.
    # gender: on examining the correlations with Churn, male/female are
    #         insignificant (~0.0086).
    df = df.drop(['customerID', 'TotalCharges', 'gender'], axis=1)

    # "No phone service" counts as not having multiple lines.
    df['MultipleLines'] = df['MultipleLines'].replace(to_replace="No phone service", value=0.0)
    encodeYesNo(df, ['Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'PaperlessBilling'])

    df = encodeOneHot(df, 'InternetService', 'Internet')

    internet_services = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
    df[internet_services] = df[internet_services].replace(to_replace="No internet service", value=0.0)
    encodeYesNo(df, internet_services)

    # Collapse the individual add-on flags into a single count per customer.
    df['NumServices'] = df[internet_services].sum(axis=1)
    df = df.drop(internet_services, axis=1)

    df = encodeOneHot(df, 'Contract')

    df = encodeOneHot(df, 'PaymentMethod', 'PayMethod')
    #seaborn.relplot(data=df, y="tenure", x="MonthlyCharges", hue="Churn")
    #seaborn.barplot(data=corrs)
    #seaborn.relplot(y=df['HighMonthlyLowTenure'], x=df.index, hue=df['Churn'])
    #seaborn.pairplot(df)
    # Internet_Fiber optic correlates strongly with churn, but it also
    # correlates with high monthly charges; Internet_No correlates strongly
    # negatively with high monthly charges.
    df = df.drop(['Internet_Fiber optic', 'Internet_No'], axis=1)
    #df = df.drop('MonthlyCharges', axis=1) # included in HighMonthlyLowTenure
    # cols with less than .15 correlation with churn
    low_corr_cols = [
        'PayMethod_Credit card (automatic)',
        'Internet_DSL',
        'PayMethod_Bank transfer (automatic)',
        'PayMethod_Mailed check',
        'PhoneService',
        'MultipleLines',
    ]
    df = df.drop(low_corr_cols, axis=1)

    corrs = df.corrwith(df['Churn']).sort_values()
    print(corrs)
    #seaborn.heatmap(data=df.corr().abs())
    bp = BinaryPredictor(df, target='Churn')
    bp.fit_logistic_regression(solver='lbfgs')
    bp.fit_knn(k=25)
    bp.fit_random_forest(max_depth=4)
    bp.print_logistic_regression_prediction()
    bp.print_knn_prediction()
    bp.print_random_forest_prediction()

    #bp.fit_logistic_regression(solver='newton-cholesky')
    #bp.print_logistic_regression_prediction()
    bp.fit_knn_gower(k=25)
    bp.print_knn_gower_prediction()

# Run the churn pipeline when this notebook cell is executed.
main()

tenure                       -0.342196
Two year                     -0.302253
One year                     -0.177820
Dependents                   -0.164221
Partner                      -0.150448
NumServices                  -0.087698
SeniorCitizen                 0.150889
PaperlessBilling              0.191825
MonthlyCharges                0.195872
PayMethod_Electronic check    0.301919
Month-to-month                0.405103
Churn                         1.000000
dtype: float64

Binomial logistic regression model:

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1041
           1       0.62      0.48      0.54       368

    accuracy                           0.79      1409
   macro avg       0.73      0.69      0.70      1409
weighted avg       0.78      0.79      0.78      1409


K Nearest Neighbors model:

              precision    recall  f1-score   support

           0       0.83      0.88      0.85      1041
           1  

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import fetch_california_housing
import seaborn
from gower import gower_matrix
import numpy as np
import os


class BinaryPredictor:
    """Train and report several binary classifiers on one DataFrame.

    The constructor splits ``df`` once into train and test partitions. Each
    ``fit_*`` method trains one model and stores it on the instance; the
    matching ``print_*_prediction`` method prints a classification report for
    the held-out partition. A ``fit_*`` method must be called before its
    ``print_*`` counterpart, since the latter reads the stored model.
    """

    def __init__(self, df: pd.DataFrame, target: str, random_state=0, test_size=0.2):
        """Split ``df`` into train/test features and integer labels.

        Args:
            df: Full dataset, including the target column.
            target: Label of the column to predict; it is cast to ``int``.
            random_state: Seed forwarded to ``train_test_split``.
            test_size: Test fraction forwarded to ``train_test_split``.
        """
        self.data = df
        self.target = target
        self.random_state = random_state
        self.test_size = test_size
        df_train, df_test = train_test_split(df, random_state=random_state, test_size=test_size)
        self.y_train = df_train[target].astype(int)
        self.X_train = df_train.drop(target, axis=1)
        self.y_test = df_test[target].astype(int)
        self.X_test = df_test.drop(target, axis=1)

    def _gower_distances(self, X, cache_path):
        """Return the pairwise Gower distance matrix of ``X``, using a CSV cache.

        The cached file is reused only when it parses and its shape is
        ``(len(X), len(X))``; otherwise the matrix is recomputed and the cache
        is rewritten. NOTE: a cache of matching size that was computed from
        different features cannot be detected here -- delete the file when the
        feature set changes.
        """
        n_samples = len(X)
        if os.path.isfile(cache_path):
            try:
                cached = np.loadtxt(cache_path, delimiter=',', ndmin=2).astype(float)
            except ValueError:
                cached = None  # unreadable cache: fall through and recompute
            if cached is not None and cached.shape == (n_samples, n_samples):
                return cached
        gower_m = gower_matrix(X)
        np.savetxt(cache_path, gower_m, delimiter=',')
        return np.asarray(gower_m, dtype=float)

    def fit_knn_gower(self, k=5, cache_path='gower.csv'):
        """Fit a k-nearest-neighbours classifier on precomputed Gower distances.

        The sample indices are split with the same ``random_state`` and
        ``test_size`` as the constructor. The classifier is fitted with
        ``metric='precomputed'`` on the train-versus-train distance block, so
        neighbours are ranked by Gower distance itself and no distances to
        test samples enter the training input.

        Sets ``X_train_g`` (n_train x n_train distances), ``X_test_g``
        (n_test x n_train distances), ``y_train_g``, ``y_test_g`` and
        ``knn_g``.

        Args:
            k: Number of neighbours.
            cache_path: CSV file used to cache the full distance matrix.
        """
        y = self.data[self.target].astype(int).to_numpy()
        X = self.data.drop(self.target, axis=1)
        gower_m = self._gower_distances(X, cache_path)

        # Split positions rather than matrix rows so that both the rows and
        # the columns of the distance matrix can be restricted consistently.
        train_idx, test_idx = train_test_split(
            np.arange(len(y)), random_state=self.random_state, test_size=self.test_size)
        self.X_train_g = gower_m[np.ix_(train_idx, train_idx)]
        self.X_test_g = gower_m[np.ix_(test_idx, train_idx)]
        self.y_train_g = y[train_idx]
        self.y_test_g = y[test_idx]
        self.knn_g = KNeighborsClassifier(n_neighbors=k, metric='precomputed').fit(
            self.X_train_g, self.y_train_g)

    @staticmethod
    def _print_report(title, y_true, y_pred):
        """Print ``title`` followed by the classification report."""
        print(title)
        print(classification_report(y_true, y_pred))

    def print_knn_gower_prediction(self):
        """Print the test-set report of the Gower-distance KNN model."""
        y_pred = self.knn_g.predict(self.X_test_g)
        self._print_report("\nK nearest neighbors model with Gower distance:\n",
                           self.y_test_g.astype(int), y_pred.astype(int))

    def fit_logistic_regression(self, solver='lbfgs'):
        """Fit a logistic regression on the training partition (stored as ``lrm``)."""
        self.lrm = LogisticRegression(solver=solver).fit(self.X_train, self.y_train)

    def fit_knn(self, k=5):
        """Fit a k-nearest-neighbours classifier on the training partition (stored as ``knn``)."""
        self.knn = KNeighborsClassifier(n_neighbors=k).fit(self.X_train, self.y_train)

    def fit_random_forest(self, max_depth=2, random_state=0):
        """Fit a random forest on the training partition (stored as ``rfc``)."""
        self.rfc = RandomForestClassifier(max_depth=max_depth, random_state=random_state).fit(self.X_train, self.y_train)

    def print_logistic_regression_prediction(self):
        """Print the test-set report of the logistic regression model."""
        y_pred = self.lrm.predict(self.X_test)
        self._print_report("\nBinomial logistic regression model:\n", self.y_test, y_pred)

    def print_knn_prediction(self):
        """Print the test-set report of the plain KNN model."""
        y_pred = self.knn.predict(self.X_test)
        self._print_report("\nK Nearest Neighbors model:\n", self.y_test, y_pred)

    def print_random_forest_prediction(self):
        """Print the test-set report of the random forest model."""
        y_pred = self.rfc.predict(self.X_test)
        self._print_report("\nRandom forest model:\n", self.y_test, y_pred)

def main():
    """Fetch the California housing dataset and print the type of the result.

    With ``as_frame=True`` the fetcher still returns a container object (the
    recorded cell output shows ``sklearn.utils._bunch.Bunch``), not a
    DataFrame, which is what this exploratory check reveals.
    """
    housing = fetch_california_housing(as_frame=True)
    result_type = type(housing)
    print(result_type)

# Run the dataset check when this notebook cell is executed.
main()

<class 'sklearn.utils._bunch.Bunch'>
