# Tour & Travels Customer Churn Prediction
### https://www.kaggle.com/datasets/tejashvi14/tour-travels-customer-churn-prediction

In [None]:
import pandas as pd
import numpy as np
import itertools

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import learning_curve, validation_curve, train_test_split, KFold, StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV, cross_validate, RepeatedStratifiedKFold
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier

from scipy.stats import loguniform, beta, uniform

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.pipeline import Pipeline as IMBPipeline

import missingno as msno # credo non necessario

import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

import scipy.stats as stats # aggiunta
from scipy.stats import binom # da vrf


In [None]:
# Let pandas print up to 1000 rows before truncating displayed frames.
pd.options.display.max_rows = 1000

In [None]:
# Optional: check the versions of the libraries in use.
# EXAMPLE: print(pd.__version__, np.__version__, ...)

In [None]:
# Load the raw dataset from the CSV file in the working directory into `db`.
db = pd.read_csv('Customertravel.csv')

In [None]:
# db  # only for a quick check

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
db.info()

In [None]:
# It looks as though there are no missing values, but...

In [None]:
# Summary statistics for every column (include='all' covers non-numeric ones too).
db.describe(include='all')

In [None]:
# ... describe() shows that the "FrequentFlyer" attribute, which should be a binary Yes/No field,
# actually takes 3 distinct values ==> there is a third value, "No Record". WATCH OUT.

In [None]:
# Count how often each distinct value of the FrequentFlyer column occurs.
db['FrequentFlyer'].value_counts()

In [None]:
# First of all, split the data into X and y.

In [None]:
# Split the frame column-wise: every column except the last goes to X and the
# last column becomes y. Both remain DataFrames (y keeps a single column),
# which is what np.split(db, [-1], axis=1) produced. Positional .iloc slicing
# is used instead because np.split on a DataFrame relies on
# DataFrame.swapaxes, which pandas has deprecated.
X, y = db.iloc[:, :-1], db.iloc[:, -1:]

In [None]:
X # only for a quick check

In [None]:
# Dimensions of the feature frame.
X.shape

In [None]:
# Display the target produced by the split.
y

In [None]:
# Dimensions of the target.
y.shape

In [None]:
# Print the column names, non-null counts and dtypes of the feature frame.
X.info()

## -------------------------

## --------------------------

### BUILDING THE TRANSFORMER CLASS (ignore this class for the time being!)

In [None]:
class NoRecordTransformer(BaseEstimator, TransformerMixin):
    """Replace 'No Record' entries of a Yes/No column with random draws.

    The share of 'Yes' among the observed 'Yes'/'No' values is used as the
    success probability of a Bernoulli draw; every 'No Record' entry is then
    replaced by 'Yes' (draw == 1) or 'No' (draw == 0).

    Parameters
    ----------
    random_state : int, default=100
        Seed handed to ``scipy.stats.binom.rvs`` so that the imputation is
        repeatable. The default reproduces the previously hard-coded seed.

    Attributes
    ----------
    prob_yes_ : float or None
        Share of 'Yes' among the 'Yes'/'No' values seen by ``fit``;
        ``None`` when the fitted data contained neither value.

    Notes
    -----
    ``transform`` returns a 1-D ``pd.Series`` with a fresh default index.
    NOTE(review): a following step that requires 2-D input would need the
    result reshaped - confirm before enabling this class in a pipeline.
    """

    def __init__(self, random_state=100):
        self.random_state = random_state

    def fit(self, X, y=None):
        """Learn the 'Yes' probability from the observed 'Yes'/'No' values.

        Parameters
        ----------
        X : pd.Series, single-column pd.DataFrame or 1-D array-like
            Column containing 'Yes', 'No' and possibly 'No Record'.
        y : ignored

        Returns
        -------
        self
        """
        self.prob_yes_ = self._estimate_prob_yes(self._as_object_array(X))
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with every 'No Record' replaced by 'Yes'/'No'.

        The probability learned in ``fit`` is used. If the transformer has
        not been fitted (or the fitted data had no 'Yes'/'No' values) the
        probability is estimated from ``X`` itself, which keeps the earlier
        stateless usage working.

        Parameters
        ----------
        X : pd.Series, single-column pd.DataFrame or 1-D array-like
        y : ignored

        Returns
        -------
        pd.Series
            Object-dtype series of ``len(X)`` values with a default index.

        Raises
        ------
        ValueError
            If 'No Record' entries must be imputed but no 'Yes'/'No' value
            is available to estimate the probability from.
        """
        values = self._as_object_array(X)

        prob_yes = getattr(self, 'prob_yes_', None)
        if prob_yes is None:
            prob_yes = self._estimate_prob_yes(values)

        missing = values == 'No Record'
        n_missing = int(np.count_nonzero(missing))

        # Copy so that the caller's data is never modified in place.
        imputed = values.copy()
        if n_missing:
            if prob_yes is None:
                raise ValueError(
                    "Cannot impute 'No Record': no 'Yes'/'No' values available "
                    "to estimate the probability of 'Yes'."
                )
            draws = stats.binom.rvs(n=1,               # one Bernoulli trial per entry
                                    p=prob_yes,        # probability of drawing 'Yes'
                                    size=n_missing,    # one draw per 'No Record'
                                    random_state=self.random_state)
            # Draws are consumed in row order; positional masking avoids any
            # dependence on the index labels of X.
            imputed[missing] = np.where(draws == 1, 'Yes', 'No').astype(object)

        return pd.Series(imputed)

    def get_feature_names_out(self, input_features=None):
        """Return the name of the single output feature."""
        return ['FrequentFlyer']

    @staticmethod
    def _as_object_array(X):
        """Flatten ``X`` into a 1-D object array, independent of its index."""
        return np.asarray(X, dtype=object).ravel()

    @staticmethod
    def _estimate_prob_yes(values):
        """Return the share of 'Yes' among 'Yes'/'No' values, or None if there are none."""
        n_yes = int(np.count_nonzero(values == 'Yes'))
        n_no = int(np.count_nonzero(values == 'No'))
        if n_yes + n_no == 0:
            return None
        return n_yes / (n_yes + n_no)

### BUILDING THE TRANSFORMATION PIPELINE

In [None]:
# Disabled alternative that would first impute 'No Record' with the custom
# transformer and then one-hot encode the column:
#pipeline_FrequentFlyer = Pipeline([
#    ('custom', NoRecordTransformer()),
#    ('hot', OneHotEncoder(categories='auto',drop='first',handle_unknown='ignore'))
#])


def _drop_first_one_hot():
    """Build a fresh one-hot encoder with the settings shared by all categorical columns."""
    return OneHotEncoder(categories='auto', drop='first', handle_unknown='ignore')


# FrequentFlyer is currently only one-hot encoded; 'No Record' is left as its own category.
pipeline_FrequentFlyer = Pipeline(steps=[('hot', _drop_first_one_hot())])

# Explicit category order for the ordinal encoding of the income class.
_income_levels = ['Low Income', 'Middle Income', 'High Income']

# One (name, transformer, columns) entry per input column.
_column_steps = [
    ('Age', StandardScaler(), ['Age']),
    ('FrequentFlyer', pipeline_FrequentFlyer, ['FrequentFlyer']),
    ('AnnualIncomeClass', OrdinalEncoder(categories=[_income_levels]), ['AnnualIncomeClass']),
    ('ServicesOpted', MinMaxScaler(), ['ServicesOpted']),
    ('AccountSyncedToSocialMedia', _drop_first_one_hot(), ['AccountSyncedToSocialMedia']),
    ('BookedHotelOrNot', _drop_first_one_hot(), ['BookedHotelOrNot']),
]

# Columns not listed above are dropped; sparse_threshold=0 forces a dense result.
final_transformation = ColumnTransformer(
    transformers=_column_steps,
    remainder='drop',
    sparse_threshold=0,
    verbose_feature_names_out=False,
)

### TRAINING AND TEST SETS

In [None]:
# Hold out 20% of the rows as a test set, shuffling with a fixed seed and
# stratifying on y.
_split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=10,
)
X_train, X_test, y_train, y_test = _split_parts

### MODEL SELECTION

In [None]:
# Baseline model: column preprocessing -> SMOTE oversampling -> PCA -> Perceptron.
# SMOTE is given a fixed seed so that the synthetic samples, and therefore the
# fitted model, are reproducible from run to run, in line with the seeded
# train/test split. PCA(n_components=0.8) keeps the smallest number of
# components whose cumulative explained variance reaches 80%.
model_pipeline = IMBPipeline([
    ('trans', final_transformation),
    ('sampler', SMOTE(random_state=10)),
    ('dim_reduction', PCA(n_components=0.8)),
    ('classifier', Perceptron())
])

In [None]:
# Fit every step of the pipeline on the training split only.
model_pipeline.fit(X_train,y_train)