# **Predictive Analytics: Shinkansen Passenger Satisfaction**

## Data Preprocessing

**1. Import necessary libraries**

In [None]:
import pandas as pd
import numpy as np
import regex as re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn import svm
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import warnings
warnings.simplefilter("ignore")

**2. Load the training and test data separately**

In [None]:
# Training split: survey answers and trip details are stored in separate CSV files
surveydata_train, traveldata_train = (
    pd.read_csv("Surveydata_train.csv"),
    pd.read_csv("Traveldata_train.csv"),
)
# Test split: same two-file layout
surveydata_test, traveldata_test = (
    pd.read_csv("Surveydata_test.csv"),
    pd.read_csv("Traveldata_test.csv"),
)

**3. Understand the data (check for each of the following in both the train and test dataset)**
<ol>
<li>Check a sample of the data</li>
<li>Use the info() and describe() functions for more information</li>
<li>Look for the presence of null values in the dataset</li>
<li>Look for the presence of bad data or unwanted characters like "$" or "#" in the numerical columns</li>
</ol>

In [None]:
#A. Peek at five randomly drawn rows of the survey training table
surveydata_train.sample(n=5)

In [None]:
#A. Peek at five randomly drawn rows of the travel training table
traveldata_train.sample(n=5)

In [None]:
#B. Use the info() and describe() functions for more information
# info() prints the column dtypes and non-null counts as a side effect
surveydata_train.info()
# describe() is the cell's last expression, so its summary table is what gets displayed
surveydata_train.describe()

In [None]:
#B. Use the info() and describe() functions for more information
# info() prints the column dtypes and non-null counts as a side effect
traveldata_train.info()
# describe() is the cell's last expression, so its summary table is what gets displayed
traveldata_train.describe()

In [None]:
#C. True when at least one cell of the survey training table is missing
surveydata_train.isna().to_numpy().any()

In [None]:
#C. True when at least one cell of the travel training table is missing
traveldata_train.isna().to_numpy().any()

In [None]:
#D. Look for the presence of bad data or unwanted characters like "$" or "#" in the numerical columns
# A numeric column polluted by "$" or "#" cannot be parsed as a number and is loaded as text
# (object dtype), so it is the *values* of the text columns that have to be scanned.
# The character class [$#] matches a literal "$" or "#"; a bare "$" in a regex is the
# end-of-string anchor and would match every value.
# Result: a single bool, True when any text cell contains one of the two characters.
surveydata_train.select_dtypes(include="object").apply(
    lambda column: column.astype(str).str.contains(r"[$#]")
).any().any()

In [None]:
#D. Look for the presence of bad data or unwanted characters like "$" or "#" in the numerical columns
# A numeric column polluted by "$" or "#" cannot be parsed as a number and is loaded as text
# (object dtype), so it is the *values* of the text columns that have to be scanned.
# The character class [$#] matches a literal "$" or "#"; a bare "$" in a regex is the
# end-of-string anchor and would match every value.
# Result: a single bool, True when any text cell contains one of the two characters.
traveldata_train.select_dtypes(include="object").apply(
    lambda column: column.astype(str).str.contains(r"[$#]")
).any().any()

**4. Clean the data**
<ol>
<li>Treat missing values in both the train & test set</li>
<li>Remove bad data values in both the train & test set</li>
<li>Encode the categorical object variables in both the train & test set</li>
<li>Perform Feature Engineering if necessary</li>
<li>Scale/Normalize the dataset if necessary</li>
</ol>

In [None]:
def dataframe_cleaning(df):
    """One-hot encode the text columns of ``df`` and fill missing values by KNN imputation.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw survey or travel table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A frame with the same rows, index and column order as the one-hot encoded
        input, in which the NaNs of the feature columns have been replaced by
        ``KNNImputer(n_neighbors=15)`` estimates.

    Notes
    -----
    * ``ID`` and ``Overall_Experience`` (when present) are passed through unchanged
      and keep their original dtype. A row identifier carries no similarity
      information for the neighbour search, and letting the target take part in the
      imputation would leak labels into the training features while the test
      features get none.
    * ``dummy_na`` is not used, so a missing categorical value becomes an all-zero
      group of dummy columns and is *not* imputed.
    * The imputer is fitted on the frame that is passed in; train and test are
      therefore imputed independently of each other.
    * ``KNNImputer`` drops feature columns that are entirely NaN, in which case
      rebuilding the frame fails with a column-count error.
    """
    # Encode first: the imputer only works on numeric input
    object_columns = df.select_dtypes(include=['object']).columns
    df_encoded = pd.get_dummies(df, columns=object_columns)

    # Split bookkeeping columns (identifier / target) from the real features
    passthrough = [name for name in ("ID", "Overall_Experience") if name in df_encoded.columns]
    feature_columns = [name for name in df_encoded.columns if name not in passthrough]
    if not feature_columns:
        # Nothing to impute
        return df_encoded.copy()

    #A. Treat for missing values in both the train & test set
    imputer = KNNImputer(n_neighbors=15)
    # fit_transform returns a bare array: restore labels *and* the original index
    imputed_features = pd.DataFrame(
        data=imputer.fit_transform(df_encoded[feature_columns]),
        columns=feature_columns,
        index=df_encoded.index,
    )

    # Re-attach the untouched columns and restore the encoded column order
    df_without_nans = pd.concat([df_encoded[passthrough], imputed_features], axis=1)
    return df_without_nans[list(df_encoded.columns)]

In [None]:
# Clean the survey/travel pair of the training split ...
surveydata_train_clean, traveldata_train_clean = map(
    dataframe_cleaning, (surveydata_train, traveldata_train)
)
# ... and the same pair of the test split
surveydata_test_clean, traveldata_test_clean = map(
    dataframe_cleaning, (surveydata_test, traveldata_test)
)

In [None]:
#Train and test are one-hot encoded separately, so either side can end up with dummy columns
#the other one lacks. A column is only usable when both sides have it, except for the target
#variable "Overall_Experience", which must stay in the train set.
def shape_equalizer(df1, df2):
    """Align the columns of a train frame and its matching test frame.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Train frame (may contain the target column ``Overall_Experience``).
    df2 : pandas.DataFrame
        Test frame.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` restricted to the columns present in both inputs, with
        ``Overall_Experience`` additionally kept in the train frame when it exists
        there. Column order is preserved and the inputs are not modified.
    """
    target = "Overall_Experience"
    # Columns only the train frame has (the target is exempt from removal)
    train_only = [name for name in df1.columns if name not in df2.columns and name != target]
    # Columns only the test frame has: a model fitted on train could not consume them
    test_only = [name for name in df2.columns if name not in df1.columns]
    return df1.drop(columns=train_only), df2.drop(columns=test_only)

In [None]:
# Travel tables: make the train/test column sets agree
traveldata_train_equalized, traveldata_test_equalized = shape_equalizer(
    traveldata_train_clean,
    traveldata_test_clean,
)
# Survey tables: same alignment (the train side keeps its target column)
surveydata_train_equalized, surveydata_test_equalized = shape_equalizer(
    surveydata_train_clean,
    surveydata_test_clean,
)

In [None]:
# The survey and travel tables are joined on ID later, so within each split they
# should hold the same number of rows. Report the outcome either way: staying silent
# on a mismatch would look exactly like a cell that was never run.
train_rows_match = surveydata_train_equalized.shape[0] == traveldata_train_equalized.shape[0]
test_rows_match = surveydata_test_equalized.shape[0] == traveldata_test_equalized.shape[0]
if train_rows_match and test_rows_match:
    print("Same number of rows between survey and travel data sets.")
else:
    print(
        "Row count mismatch between survey and travel data sets: "
        f"train {surveydata_train_equalized.shape[0]} vs {traveldata_train_equalized.shape[0]}, "
        f"test {surveydata_test_equalized.shape[0]} vs {traveldata_test_equalized.shape[0]}."
    )

In [None]:
# Train and test must expose the same feature columns.
# -1 due to target column "Overall_Experience", which only the survey train table has.
# Report the outcome either way so a mismatch cannot pass unnoticed.
survey_columns_match = surveydata_train_equalized.shape[1] - 1 == surveydata_test_equalized.shape[1]
travel_columns_match = traveldata_train_equalized.shape[1] == traveldata_test_equalized.shape[1]
if survey_columns_match and travel_columns_match:
    print("Same number of columns between test and train data sets.")
else:
    print(
        "Column count mismatch between test and train data sets: "
        f"survey {surveydata_train_equalized.shape[1] - 1} vs {surveydata_test_equalized.shape[1]}, "
        f"travel {traveldata_train_equalized.shape[1]} vs {traveldata_test_equalized.shape[1]}."
    )

In [None]:
# Final assembly: one wide table per split, travel columns first, joined on the passenger ID
# (pandas' default inner join: only IDs present in both tables are kept)
train_data = pd.merge(traveldata_train_equalized, surveydata_train_equalized, on='ID')
test_data = pd.merge(traveldata_test_equalized, surveydata_test_equalized, on='ID')

## Feature Engineering

In [None]:
def feature_eng(df):
    """Return a copy of ``df`` with the engineered column ``Delay_per_Distance``.

    ``Delay_per_Distance`` is ``Departure_Delay_in_Mins / Travel_Distance``.
    Rows whose ``Travel_Distance`` is 0 get 0.0 instead of inf/NaN, because
    non-finite feature values make scalers and estimators fail at fit time.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Departure_Delay_in_Mins`` and ``Travel_Distance``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is left untouched, so the cell calling
        this function can be re-run safely.

    Raises
    ------
    KeyError
        If one of the two required columns is missing.
    """
    # Work on a copy so the input is not mutated behind the caller's back
    df = df.copy()
    distance = df['Travel_Distance']
    ratio = df['Departure_Delay_in_Mins'] / distance
    # Keep the ratio where the distance is non-zero, fall back to 0.0 elsewhere
    df['Delay_per_Distance'] = ratio.where(distance != 0, 0.0)
    return df

## Model Building

In [None]:
# Random forest: 2500 fully grown trees, each fitted on the whole training set
# (bootstrap=False), using all CPU cores and a fixed seed for reproducibility.
# max_features='sqrt' is what the deprecated 'auto' alias meant for classifiers;
# newer scikit-learn releases reject 'auto'.
rfc = RandomForestClassifier(bootstrap= False,
                             ccp_alpha= 0.0,
                             class_weight= None,
                             criterion= 'gini',
                             max_depth= None,
                             max_features= 'sqrt',
                             max_leaf_nodes= None,
                             max_samples= None,
                             min_impurity_decrease= 0.0,
                             min_samples_leaf= 1,
                             min_samples_split= 2,
                             min_weight_fraction_leaf= 0.0,
                             n_estimators= 2500,
                             n_jobs= -1,
                             oob_score= False,
                             random_state= 42,
                             verbose= 0,
                             warm_start= False)

In [None]:
# Tuned XGBoost classifier. Only the hyper-parameters that were actually set are
# spelled out. Every argument previously passed as None just asked for the library
# default, and some of those keywords (use_label_encoder, gpu_id, predictor) are tied
# to particular xgboost releases, so leaving them out keeps the cell portable.
# NOTE(review): learning_rate is kept exactly as found (it is approximately 0.1,
# presumably the output of a parameter search) so results do not change.
xgb = XGBClassifier(objective= 'binary:logistic',
                    colsample_bytree= 0.5,
                    gamma= 0.25,
                    learning_rate= 0.09999999999999999,
                    max_depth= 15,
                    min_child_weight= 2,
                    n_estimators= 2000,
                    n_jobs= -1,
                    random_state= 42,
                    reg_alpha= 0.1,
                    reg_lambda= 0.2,
                    subsample= 0.9)

In [None]:
# Extra-trees: 2000 trees split on entropy, each fitted on the whole training set
# (bootstrap=False), nodes with fewer than 5 samples are not split further.
# max_features='sqrt' is what the deprecated 'auto' alias meant for classifiers;
# newer scikit-learn releases reject 'auto'.
ext = ExtraTreesClassifier(bootstrap= False,
                           ccp_alpha= 0.0,
                           class_weight= None,
                           criterion= 'entropy',
                           max_depth= None,
                           max_features= 'sqrt',
                           max_leaf_nodes= None,
                           max_samples= None,
                           min_impurity_decrease= 0.0,
                           min_samples_leaf= 1,
                           min_samples_split= 5,
                           min_weight_fraction_leaf= 0.0,
                           n_estimators= 2000,
                           n_jobs= -1,
                           oob_score= False,
                           random_state= 42,
                           verbose= 0,
                           warm_start= False)

In [None]:
# LightGBM behind its own standardisation step: binary objective, gradient-boosted trees,
# 3000 rounds at learning rate 0.3, unlimited depth, all cores, fixed seed.
lgbm = make_pipeline(
    StandardScaler(),
    lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        learning_rate=0.3,
        n_estimators=3000,
        max_depth=-1,
        # NOTE(review): 200 is far below the library default of 200,000 - confirm it is intended
        subsample_for_bin=200,
        random_state=42,
        n_jobs=-1,
    ),
)

In [None]:
# save function for all
def results_to_csv(y_pred, var_name):
    """Write the test-set predictions to a two-column submission CSV.

    Parameters
    ----------
    y_pred : array-like
        One prediction per row of the module-level ``test_data`` frame, in row order.
    var_name : str
        Underscore-separated label; its last two tokens name the output file, e.g.
        ``"y_pred_test_voting_ensemble"`` -> ``"voting_ensemble_result.csv"``.

    Returns
    -------
    None
        The return value of ``DataFrame.to_csv`` when a path is given. The file is
        written to the current working directory.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of test rows.
    """
    predictions = np.asarray(y_pred).ravel()
    if len(predictions) != len(test_data):
        raise ValueError(
            f"Got {len(predictions)} predictions for {len(test_data)} test rows."
        )
    # Build the two-column frame directly instead of copying the whole feature table
    # and assigning into a slice of it; both columns are stored as integers
    result = pd.DataFrame({
        'ID': test_data['ID'].to_numpy(),
        'Overall_Experience': predictions,
    }).astype(int)
    #print head
    print(result.head(5))
    #save as csv
    name = "_".join(var_name.split('_')[-2:]) + '_result.csv'
    return result.to_csv(name, index=False)

#### Ensemble definition: pipelines, stacking and voting

In [None]:
# StandardScaler and make_pipeline are already imported in the first cell
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Standardize the features to have mean=0 and variance=1.
# `scaler` is kept as a name for any cell that still refers to it; the pipelines below
# each get their own StandardScaler so no fitted state is shared between them.
scaler = StandardScaler()

# Create pipelines (scaler -> tuned model)
rfc_pipeline = make_pipeline(StandardScaler(), rfc)
xgb_pipeline = make_pipeline(StandardScaler(), xgb)
ext_pipeline = make_pipeline(StandardScaler(), ext)
# `lgbm` already is a StandardScaler -> LGBMClassifier pipeline; wrapping it again
# would only standardise the data twice
lgbm_pipeline = lgbm

# Stacking: default-parameter base models whose cross-validated predictions feed an
# XGBoost meta learner. Every estimator is seeded so a re-run reproduces the results.
base_models = [
    ('rfc', RandomForestClassifier(random_state=42)),
    ('xgb', XGBClassifier(random_state=42)),
    ('lgbm', make_pipeline(StandardScaler(), lgb.LGBMClassifier(random_state=42))),
]

# use_label_encoder is deprecated in xgboost and no longer passed
meta_learner = XGBClassifier(n_estimators=2000, random_state=42)

stacked_model = StackingClassifier(estimators=base_models, final_estimator=meta_learner, cv=5)

# Define the ensemble model: soft voting averages the predicted class probabilities of
# the tuned random forest, the tuned XGBoost and the stacked model.
# ext_pipeline and lgbm_pipeline are defined above but currently not part of the vote.
ensemble = VotingClassifier(estimators=[('Random Forest', rfc_pipeline),
                                        ('XGBoost', xgb_pipeline),
                                        ('Stacked', stacked_model)], voting='soft')


#### Model training and evaluation

In [None]:
# Add the engineered feature(s) to both splits: train first, then test
train_data, test_data = feature_eng(train_data), feature_eng(test_data)

In [None]:
# Separate the target from the features, then hold out 20% of the rows for validation
# NOTE(review): X still contains the 'ID' join key as a feature - confirm that is intended
y = train_data['Overall_Experience']
X = train_data.drop(columns='Overall_Experience')

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Fit the soft-voting ensemble on the 80% training split; the remaining 20%
# (X_val, y_val) stays untouched for validation.
ensemble.fit(X_train, y_train)
# Alternative for a final submission: ensemble.fit(X, y) trains on every labelled row,
# at the cost of leaving no held-out data to validate on.

In [None]:
# Predict on the training split itself (no training happens here); the result is only
# used to see how closely the fitted ensemble reproduces the data it was trained on.
y_pred_train = ensemble.predict(X_train)
# Matching alternative when the ensemble was fitted on all rows: ensemble.predict(X)

In [None]:
# Evaluate model on training data: confusion matrix of actual (rows) versus
# predicted (columns) classes for the training split
print(confusion_matrix(y_train, y_pred_train))

In [None]:
# Confusion matrix on the held-out validation split
y_pred_val = ensemble.predict(X_val)
cm = confusion_matrix(y_val, y_pred_val)

# Annotated heat map of the matrix on a small 2x2 inch figure, axes labelled in one call
plt.figure(figsize=(2, 2))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues').set(xlabel='Predicted', ylabel='Actual')

plt.show()

In [None]:
# Flatten the 2x2 validation confusion matrix row by row: (actual 0 -> TN, FP), (actual 1 -> FN, TP)
TN, FP, FN, TP = cm.ravel()
total = TN + FP + FN + TP
Accuracy = (TN + TP) / total
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
F1_score = 2 * (Precision * Recall) / (Precision + Recall)
# NOTE(review): the 0.003 discount and the target value are unexplained constants -
# presumably an expected validation-to-leaderboard gap and the score to beat; confirm
maybe_acc = Accuracy - 0.003
target = 0.9571372
print(f"cm:\n{cm}\nAccuracy:\t{Accuracy:.7f}\nPrecision:\t{Precision:.7f}\nRecall:\t\t{Recall:.7f}\nF1_score:\t{F1_score:.7f}\n\nMaybe Acc:\t{maybe_acc:.7f}\nTarget:\t\t{target:.7f}")


In [None]:
# Make prediction: one class label per row of the unlabelled test set
# (test_data is passed with all of its columns, the same set X was built from)
y_pred_test = ensemble.predict(test_data)
# Last expression of the cell, so the prediction array is displayed
y_pred_test

In [None]:
# Save the submission; the file name comes from the last two underscore-separated
# tokens of the label, i.e. "voting_ensemble_result.csv"
results_to_csv(y_pred_test, "y_pred_test_voting_ensemble")