# Prediction

In [None]:
import pandas as pd

## Load df from CSV

In [None]:
df = pd.read_csv('./dataset.csv')

# Prepare the frame for the prediction (see transform): drop the identifier
# column and every row that contains a missing value.
df = df.drop(columns='customerID')
df = df.dropna(how='any')

# Encode two-valued text columns as 0/1.
df['gender'] = df['gender'].map({'Male' : 0, 'Female' : 1})
df['Churn'] = df['Churn'].map({'Yes' : 1, 'No' : 0})
bin_features = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
for bin_feature in bin_features:
    df[bin_feature] = df[bin_feature].map({'Yes' : 1, 'No' : 0})

# Service columns also carry "No ... service" values; they are folded into 0.
bin_features = ['MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
for bin_feature in bin_features:
    df[bin_feature] = df[bin_feature].map({'Yes' : 1, 'No' : 0, 'No internet service' : 0, 'No phone service' : 0})

# One-hot encode the multi-valued columns; each dummy column is named after the
# category value and the source column is removed.
cat_features = ['InternetService', 'Contract', 'PaymentMethod']
for cat_feature in cat_features:
    df = pd.concat([df, pd.get_dummies(df[cat_feature])], axis=1).drop(columns=cat_feature)

# Blank (' ') TotalCharges entries are treated as 0.0. The column is then
# converted to a numeric dtype explicitly: left as text it only works through
# implicit coercion inside the estimator, and any dtype-based column selection
# (e.g. select_dtypes(include=['object'])) would treat it as categorical.
df.loc[df['TotalCharges'] == ' ', 'TotalCharges'] = '0.0'
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])

df.head()

## Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the features from the target column.
X, y = df.drop(columns='Churn'), df['Churn']

# Hold out 25% of the rows for testing. stratify is optional; it keeps the
# class proportions of y in both splits. The fixed random_state makes the
# shuffle, and therefore every metric computed from this split, reproducible
# across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

## Model Fit-Predict

In [None]:
from sklearn.linear_model import LogisticRegression

# Build and train the classifier in one step (fit returns the estimator itself).
# max_iter is the upper bound on solver iterations.
lr = LogisticRegression(max_iter=1_000_000).fit(x_train, y_train)

# Predicted labels for the held-out rows.
lr_pred = lr.predict(x_test)

## Accuracy, F1 and Confusion Matrix

In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, display_labels=('No', 'Yes')):
    """Plot a confusion matrix as a blue heat map titled 'Confusion Matrix'.

    Parameters
    ----------
    cm : array-like
        Confusion matrix, e.g. the result of sklearn's ``confusion_matrix``.
    display_labels : sequence of str, optional
        Tick labels, in the same order as the rows/columns of ``cm``.
        ``confusion_matrix`` orders classes by sorted label value, and this
        notebook encodes the target as No -> 0, Yes -> 1, so the default is
        ('No', 'Yes').
    """
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=list(display_labels))
    disp.plot(cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.show()

# Score the logistic-regression predictions against the held-out labels.
lr_accuracy = accuracy_score(y_test, lr_pred)
lr_f1 = f1_score(y_test, lr_pred)
lr_cm = confusion_matrix(y_test, lr_pred)

# Text summary first, then the confusion-matrix figure.
print(f"""Logistic Regression: 
Accuracy:           {lr_accuracy} 
F1-score:           {lr_f1} 
Confusion Matrix:""")
plot_confusion_matrix(lr_cm)

## KFold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# Mean accuracy over 10 cross-validation folds on the full dataset.
# max_iter matches the budget used for the single-split logistic regression in
# this notebook; the default of 100 iterations may stop the solver before it
# converges on these unscaled features.
model = LogisticRegression(max_iter=1000000)
cross_val_score(model, df.drop(columns='Churn'), df['Churn'], cv=10, scoring='accuracy').mean()

## Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier

X, y = df.drop(columns='Churn'), df['Churn']

# !! IMPORTANT !!
# The pipeline never applies preprocessing operations to the labels passed to fit (y_train), so the only dataframe it manipulates is X,
# which doesn't contain the label column. This is why ColumnTransformer must be given X's columns, not df's,
# since df also includes the label column.
# Note: if you need to transform the label column, do it separately, because both ColumnTransformer and Pipeline never touch it.

# We use ColumnTransformer only if we want to apply some transformation to a column SUBSET.
# Here the subset is every object-dtype column of X; unseen categories are encoded as -1
# and all remaining columns are passed through unchanged.
col_trans = ColumnTransformer(transformers=[
    ('text_to_class', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), X.select_dtypes(include=['object']).columns)
], remainder='passthrough')

# Transformations that need to be applied to all columns must be placed in the Pipeline.
# The imputer is set to output a pandas DataFrame so the ColumnTransformer can still
# select columns by name in the next step.
pipeline = Pipeline(steps=[
    ('remove_na', SimpleImputer(strategy='most_frequent').set_output(transform='pandas')),
    ('preprocessor', col_trans),
    ('scale', StandardScaler()),
    ('model', KNeighborsClassifier())
])

# Fixed random_state so the split, and the accuracy shown below, is reproducible on re-run.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
pipeline.fit(x_train, y_train)
accuracy_score(y_test, pipeline.predict(x_test))