In [1]:
# Import all the needed packages

from sklearn.model_selection import StratifiedKFold
import time
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
import seaborn as sns
from sklearn.metrics import f1_score
from matplotlib.legend_handler import HandlerLine2D
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
## Load the training set

# Read the raw CSV and promote the Guest_ID column to the row index in a
# single chained expression.
data = pd.read_csv('train.csv').set_index('Guest_ID')

In [3]:
## FILL NA

# Estimate the mean age from rows that have no missing value in any column.
# Age is 2021 minus the birth year, taken as the last four characters of
# Date_Birth.
data2 = data.dropna(axis=0)
data2 = data2.assign(Age=2021 - pd.to_numeric(data2['Date_Birth'].str[-4:]))
age_mean = data2['Age'].mean()

# Missing birth dates get the placeholder "2021", which turns into an age of 0
# once the age is computed; that 0 is what later gets replaced by age_mean.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the in-place form does not update `data` under pandas
# Copy-on-Write.
age_temporary = "2021"
data['Date_Birth'] = data['Date_Birth'].fillna(age_temporary)

In [4]:
## FEATURE ENG

# Age = 2021 minus the birth year (last four characters of Date_Birth).
# Rows whose birth date was filled with the "2021" placeholder come out as 0.
data['Age'] = 2021 - pd.to_numeric(data['Date_Birth'].str[-4:])
data = data.drop(columns=['Date_Birth'])

## FIX NA
# Swap the placeholder age 0 for the mean age. The result is assigned back
# because replace(inplace=True) on a column selection does not update `data`
# under pandas Copy-on-Write.
data['Age'] = data['Age'].replace({0: age_mean})

# Gender is derived from the first space-separated token of Name:
# "Mr." -> 1; "Mrs.", "Miss" and every other token -> 0.
new = data['Name'].str.split(" ", n=2, expand=True)
data['Gender'] = new[0].replace({'Mr.': 'M', 'Mrs.': 'F', 'Miss': 'F'})
data['Gender'] = np.where(data['Gender'] == 'M', 1, 0)
data = data.drop(columns=['Name'])

# Coarser copy of Flight_Class in which 'Eco Plus' is merged into 'Eco'.
data['Flight_Class_2'] = data['Flight_Class'].replace('Eco Plus', 'Eco')

data['Points/Age'] = data['Points'] / data['Age']

# NOTE: despite the AVG_ prefix these columns are row-wise SUMS of the listed
# rating columns, not averages. The names are kept because they are model
# features.
data['AVG_Score1'] = data[['Room', 'Check-in/Check-out', 'F&B', 'Location', 'Wifi', 'Entertainment', 'Gym', 'Spa', 'Staff', 'Pool', 'Baggage_Handling', 'Reception', 'Cleanliness', 'Online_Booking']].sum(axis=1)
data['AVG_Score2'] = data[['Room', 'Check-in/Check-out', 'F&B', 'Location', 'Wifi', 'Entertainment', 'Cleanliness']].sum(axis=1)
data['AVG_Score3'] = data[['Room', 'F&B', 'Cleanliness', 'Location']].sum(axis=1)
data['AVG_Score4'] = data[['Wifi', 'Entertainment', 'Gym', 'Spa', 'Pool']].sum(axis=1)
data['AVG_Score5'] = data[['Staff', 'Baggage_Handling', 'Reception']].sum(axis=1)
data['AVG_Score6'] = data[['Check-in/Check-out', 'Online_Booking']].sum(axis=1)

# One-hot encode the remaining categorical columns, then drop three of the
# generated dummy columns (presumably one reference level per encoded
# column -- confirm against the raw data).
data = pd.get_dummies(data)
data = data.drop(columns=['Type_Personal Travel', 'Flight_Class_2_Eco', 'Flight_Class_Eco'])

In [5]:
def compare_models(data, model):
    """
    Cross-validate ``model`` on ``data`` and summarise its F1 scores.

    The frame is split into features (every column except 'Repeater') and the
    target ('Repeater'), then evaluated with a 10-split StratifiedKFold. On
    every fold the model is fitted on the training part and scored with
    ``f1_score`` on both the training and the validation part.

    Parameters
    ----------
    data : DataFrame containing the feature columns and a 'Repeater' column.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted
        in place, so after the call it holds the fit from the last fold.

    Returns
    -------
    tuple of two str
        ``(train_summary, validation_summary)``, each formatted as
        ``'<mean>+/-<std>'`` with both numbers rounded to 5 decimals.
    """
    X = data.drop(columns=['Repeater'])
    y = data['Repeater'].copy()

    skf = StratifiedKFold(n_splits=10)
    score_train = []
    score_val = []
    for train_index, val_index in skf.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Score on the training part as well as the validation part so the
        # gap between the two exposes overfitting.
        model.fit(X_train, y_train)
        score_train.append(f1_score(y_train, model.predict(X_train)))
        score_val.append(f1_score(y_val, model.predict(X_val)))

    # Aggregate once, after all folds have been scored.
    avg_train = round(np.mean(score_train), 5)
    avg_val = round(np.mean(score_val), 5)
    std_train = round(np.std(score_train), 5)
    std_val = round(np.std(score_val), 5)

    return str(avg_train) + '+/-' + str(std_train), str(avg_val) + '+/-' + str(std_val)
    
def show_results(df, data, *args):
    """
    Fill ``df`` with the cross-validation summary of every model in ``args``.

    Each model is evaluated with ``compare_models(data, model)``; the returned
    (train, validation) pair is written to the row of ``df`` at the same
    position as the model in ``args``.

    Parameters
    ----------
    df : DataFrame with two columns and at least one row per model; it is
        modified in place.
    data : DataFrame forwarded unchanged to ``compare_models``.
    *args : the models to evaluate, in row order.

    Returns
    -------
    DataFrame
        The same ``df`` object, now holding the results.
    """
    for row, model in enumerate(args):
        # compare_models returns a (train, validation) tuple, which fills the
        # two columns of the row.
        df.iloc[row] = compare_models(data, model)
    return df


In [6]:
## BEST MODEL USING CROSS VALIDATION

# Only the tuned hyper-parameters are passed. Every other argument that used to
# be spelled out was a library default, and two of them (loss='deviance',
# min_impurity_split=None) are rejected by newer scikit-learn releases, so
# leaving them out keeps the same model while staying version-independent.
modelCF = GradientBoostingClassifier(
    random_state=5,
    learning_rate=0.2,
    n_estimators=900,
    max_depth=7,
)

# One result row per evaluated model; show_results fills it in and returns it.
df = pd.DataFrame(columns=['Train', 'Validation'], index=['CF'])
show_results(df, data, modelCF)

Unnamed: 0,Train,Validation
CF,1.0+/-0.0,0.95032+/-0.0062


In [7]:
## CREATE OPTIMAL MODEL USING ALL DATA

# Features are every column except the target 'Repeater'.
X = data.drop(columns=['Repeater'])
y = data['Repeater'].copy()

# Same tuned hyper-parameters as the cross-validated model. The remaining
# arguments are left at their library defaults instead of being passed
# explicitly: loss='deviance' and min_impurity_split=None are rejected by
# newer scikit-learn releases.
final_model = GradientBoostingClassifier(
    random_state=5,
    learning_rate=0.2,
    n_estimators=900,
    max_depth=7,
)

GradientBoostingClassifier(learning_rate=0.2, max_depth=7, n_estimators=900,
                           random_state=5)

In [12]:
## PERFORMANCE OPTIMAL MODEL ON ALL TRAINING DATA

# fit() hands back the fitted estimator, so training and predicting on the
# same rows can be chained. The score below is therefore an in-sample F1,
# measured on the very data the model was fitted on.
predictions_train = final_model.fit(X, y).predict(X)
f1_score(y, predictions_train)

1.0

In [13]:
## IMPORT TEST DATASET

data_test = pd.read_csv('test.csv').set_index('Guest_ID')

## FEATURE ENG

# Missing birth dates get the placeholder "2021", i.e. an age of 0, exactly as
# in the training pipeline. Results are assigned back rather than using
# inplace=True on a column selection, which does not update the frame under
# pandas Copy-on-Write.
age_temporary = "2021"
data_test['Date_Birth'] = data_test['Date_Birth'].fillna(age_temporary)
data_test['Age'] = 2021 - pd.to_numeric(data_test['Date_Birth'].str[-4:])
data_test = data_test.drop(columns=['Date_Birth'])

## FIX NA
# Impute with the mean age computed on the TRAINING data (age_mean) so the
# test rows go through the same transformation the model was fitted with.
data_test['Age'] = data_test['Age'].replace({0: age_mean})

# Gender from the first space-separated token of Name:
# "Mr." -> 1; "Mrs.", "Miss" and every other token -> 0.
new = data_test['Name'].str.split(" ", n=2, expand=True)
data_test['Gender'] = new[0].replace({'Mr.': 'M', 'Mrs.': 'F', 'Miss': 'F'})
data_test['Gender'] = np.where(data_test['Gender'] == 'M', 1, 0)
data_test = data_test.drop(columns=['Name'])

# Coarser copy of Flight_Class in which 'Eco Plus' is merged into 'Eco'.
data_test['Flight_Class_2'] = data_test['Flight_Class'].replace('Eco Plus', 'Eco')

data_test['Points/Age'] = data_test['Points'] / data_test['Age']

# NOTE: despite the AVG_ prefix these columns are row-wise SUMS of the listed
# rating columns, matching the training features of the same name.
data_test['AVG_Score1'] = data_test[['Room', 'Check-in/Check-out', 'F&B', 'Location', 'Wifi', 'Entertainment', 'Gym', 'Spa', 'Staff', 'Pool', 'Baggage_Handling', 'Reception', 'Cleanliness', 'Online_Booking']].sum(axis=1)
data_test['AVG_Score2'] = data_test[['Room', 'Check-in/Check-out', 'F&B', 'Location', 'Wifi', 'Entertainment', 'Cleanliness']].sum(axis=1)
data_test['AVG_Score3'] = data_test[['Room', 'F&B', 'Cleanliness', 'Location']].sum(axis=1)
data_test['AVG_Score4'] = data_test[['Wifi', 'Entertainment', 'Gym', 'Spa', 'Pool']].sum(axis=1)
data_test['AVG_Score5'] = data_test[['Staff', 'Baggage_Handling', 'Reception']].sum(axis=1)
data_test['AVG_Score6'] = data_test[['Check-in/Check-out', 'Online_Booking']].sum(axis=1)

# One-hot encode, then align to the exact column set and order of the training
# matrix X. A category absent from the test file becomes an all-zero column
# instead of a missing one; a dummy that did not exist during training (this
# includes the reference levels dropped there) is discarded.
data_test = pd.get_dummies(data_test)
data_test = data_test.reindex(columns=X.columns, fill_value=0)

In [14]:
# Predict from the feature columns only. Excluding 'Repeater' keeps this cell
# re-runnable: on a second run the column already exists in data_test and
# would otherwise be passed to the model as an extra feature.
features_test = data_test.drop(columns=['Repeater'], errors='ignore')
data_test['Repeater'] = final_model.predict(features_test)

# Export the predictions (indexed by Guest_ID) for submission.
final_csv = data_test['Repeater'].copy()
final_csv.to_csv('result.csv')