# **Predictive Analytics: Shinkansen Passenger Satisfaction**

## Data Preprocessing

**1. Import necessary libraries**

In [1]:
import pandas as pd
import numpy as np
import regex as re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import warnings
warnings.simplefilter("ignore")

**2. Load the training and test data separately**

In [2]:
#train data
surveydata_train = pd.read_csv("Surveydata_train.csv")
traveldata_train = pd.read_csv("Traveldata_train.csv")
#test data
surveydata_test = pd.read_csv("Surveydata_test.csv")
traveldata_test = pd.read_csv("Traveldata_test.csv")

**3. Understand the data (check for each of the following in both the train and test dataset)**
<ol>
<li>Check a sample of the data</li>
<li>Use the info() and describe() functions for more information</li>
<li>Look for the presence of null values in the dataset</li>
<li>Look for the presence of bad data or unwanted characters like "$" or "#" in the numerical columns</li>
</ol>

In [3]:
#A. Check a sample of the data
surveydata_train.sample(5)

Unnamed: 0,ID,Overall_Experience,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
53281,98853282,0,Needs Improvement,Ordinary,Needs Improvement,Needs Improvement,Convenient,Needs Improvement,Needs Improvement,Needs Improvement,Needs Improvement,Acceptable,Needs Improvement,Good,Excellent,Good,Needs Improvement
47180,98847181,0,Needs Improvement,Ordinary,Excellent,Needs Improvement,Manageable,Acceptable,Needs Improvement,Poor,Acceptable,Good,Good,Good,Good,Excellent,Acceptable
19607,98819608,1,Extremely Poor,Green Car,Extremely Poor,Extremely Poor,Inconvenient,Good,Excellent,Good,Excellent,Excellent,Excellent,Excellent,Good,Excellent,Excellent
53247,98853248,0,Acceptable,Ordinary,Poor,Needs Improvement,Inconvenient,Acceptable,Acceptable,Acceptable,Acceptable,Poor,Poor,Acceptable,Good,Needs Improvement,Acceptable
27621,98827622,1,Good,Green Car,Needs Improvement,Good,Manageable,Poor,Good,Excellent,Poor,Needs Improvement,Good,Needs Improvement,Needs Improvement,Needs Improvement,Poor


In [4]:
#A. Check a sample of the data
traveldata_train.sample(5)

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins
22983,98822984,Female,Loyal Customer,39.0,Personal Travel,Eco,1672,0.0,0.0
92251,98892252,Female,Loyal Customer,36.0,Personal Travel,Eco,1725,22.0,22.0
25379,98825380,Male,Loyal Customer,68.0,Personal Travel,Eco,4302,0.0,0.0
87273,98887274,Female,Loyal Customer,22.0,Business Travel,Eco,2333,0.0,2.0
44599,98844600,Male,Loyal Customer,70.0,Personal Travel,Eco,1867,22.0,12.0


In [5]:
#B. Use the info() and describe() functions for more information
surveydata_train.info()
surveydata_train.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   ID                       94379 non-null  int64 
 1   Overall_Experience       94379 non-null  int64 
 2   Seat_Comfort             94318 non-null  object
 3   Seat_Class               94379 non-null  object
 4   Arrival_Time_Convenient  85449 non-null  object
 5   Catering                 85638 non-null  object
 6   Platform_Location        94349 non-null  object
 7   Onboard_Wifi_Service     94349 non-null  object
 8   Onboard_Entertainment    94361 non-null  object
 9   Online_Support           94288 non-null  object
 10  Ease_of_Online_Booking   94306 non-null  object
 11  Onboard_Service          86778 non-null  object
 12  Legroom                  94289 non-null  object
 13  Baggage_Handling         94237 non-null  object
 14  CheckIn_Service          94302 non-nul

Unnamed: 0,ID,Overall_Experience
count,94379.0,94379.0
mean,98847190.0,0.546658
std,27245.01,0.497821
min,98800000.0,0.0
25%,98823600.0,0.0
50%,98847190.0,1.0
75%,98870780.0,1.0
max,98894380.0,1.0


In [6]:
#B. Use the info() and describe() functions for more information
traveldata_train.info()
traveldata_train.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       94379 non-null  int64  
 1   Gender                   94302 non-null  object 
 2   Customer_Type            85428 non-null  object 
 3   Age                      94346 non-null  float64
 4   Type_Travel              85153 non-null  object 
 5   Travel_Class             94379 non-null  object 
 6   Travel_Distance          94379 non-null  int64  
 7   Departure_Delay_in_Mins  94322 non-null  float64
 8   Arrival_Delay_in_Mins    94022 non-null  float64
dtypes: float64(3), int64(2), object(4)
memory usage: 6.5+ MB


Unnamed: 0,ID,Age,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins
count,94379.0,94346.0,94379.0,94322.0,94022.0
mean,98847190.0,39.419647,1978.888185,14.647092,15.005222
std,27245.01,15.116632,1027.961019,38.138781,38.439409
min,98800000.0,7.0,50.0,0.0,0.0
25%,98823600.0,27.0,1359.0,0.0,0.0
50%,98847190.0,40.0,1923.0,0.0,0.0
75%,98870780.0,51.0,2538.0,12.0,13.0
max,98894380.0,85.0,6951.0,1592.0,1584.0


In [7]:
#C. Look for the presence of null values in the dataset
surveydata_train.isnull().values.any()

True

In [8]:
#C. Look for the presence of null values in the dataset
traveldata_train.isnull().values.any()

True

In [9]:
#D. Look for the presence of bad data or unwanted characters like "$" or "#" in the numerical columns
surveydata_train.describe().columns.astype(str).str.contains("($|#)").any()

True

In [10]:
#D. Look for the presence of bad data or unwanted characters like "$" or "#" in the numerical columns
traveldata_train.describe().columns.astype(str).str.contains("($|#)").any()

True

**4. Clean the data**
<ol>
<li>Treat for missing values in both the train & test set</li>
<li>Remove bad data values in both the train & test set</li>
<li>Encode the categorical object variables in both the train & test set</li>
<li>Perform Feature Engineering if necessary</li>
<li>Scale/Normalize the dataset if necessary</li>
</ol>

In [11]:
def dataframe_cleaning(df):
    # Before cleaning any data, it is important to transform Categorical values to numerical values
    # Retrieve categorical columns, which have data type as "object"
    df_object_columns = df.select_dtypes(include=['object']).columns
    # One-hot encoding for categorical variables
    df_encoded = pd.get_dummies(df, columns=df_object_columns, dummy_na=True)
    #A. Treat for missing values in both the train & test set
    imputer = KNNImputer(n_neighbors=15)
    #np array is created
    df_imputed = imputer.fit_transform(df_encoded)
    #back to dataframe
    df_without_nans = pd.DataFrame(data=df_imputed, columns=df_encoded.columns)
    
    return df_without_nans

In [12]:
#train data
surveydata_train_clean = dataframe_cleaning(surveydata_train)
traveldata_train_clean = dataframe_cleaning(traveldata_train)
#test data
surveydata_test_clean = dataframe_cleaning(surveydata_test)
traveldata_test_clean = dataframe_cleaning(traveldata_test)

In [13]:
#if a column is not present on the test set, then it is not important in the train set. Sans the target variable "Overall_Experience"
def shape_equalizer(df1, df2):
    """train, then test"""
    df1_columns = df1.columns
    df2_columns = df2.columns
    difference = list(set(df1_columns).difference(set(df2_columns)))
    if "Overall_Experience" in difference:
        difference.pop(difference.index("Overall_Experience"))
    df1 = df1.drop(difference, axis=1)
    return df1, df2

In [14]:
#survey data
surveydata_train_equalized, surveydata_test_equalized = shape_equalizer(surveydata_train_clean, surveydata_test_clean)
#travel data
traveldata_train_equalized, traveldata_test_equalized = shape_equalizer(traveldata_train_clean, traveldata_test_clean)

In [15]:
if surveydata_train_equalized.shape[0] == traveldata_train_equalized.shape[0] and surveydata_test_equalized.shape[0] == traveldata_test_equalized.shape[0]:
    print("Same number of rows between survey and travel data sets.")

Same number of rows between survey and travel data sets.


In [16]:
# -1 due to target column "Overall_Experience"
if surveydata_train_equalized.shape[1]-1 == surveydata_test_equalized.shape[1] and traveldata_train_equalized.shape[1] == traveldata_test_equalized.shape[1]:
    print("Same number of columns between test and train data sets.")

Same number of columns between test and train data sets.


In [37]:
#last, join the two datasets for train and the two datasets for test
#train
train_data = traveldata_train_equalized.merge(surveydata_train_equalized, on='ID')
#test
test_data = traveldata_test_equalized.merge(surveydata_test_equalized, on='ID')

## Model Building

In [None]:
# train, test, split for all
X = train_data.drop('Overall_Experience', axis=1)
y = train_data['Overall_Experience']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
# save function for all
def results_to_csv(y_pred, var_name):
    data = test_data.copy()
    data['Overall_Experience'] = y_pred
    result = data[['ID', 'Overall_Experience']]
    #to integers
    result[['ID', 'Overall_Experience']] = result[['ID', 'Overall_Experience']].astype(int)
    #print head
    print(result.head(5))
    #save as csv
    name = "_".join(var_name.split('_')[-2:]) + '_result.csv'
    return result.to_csv(name, index=False)

#### ada boost

In [None]:
# Fit the model
tree = DecisionTreeClassifier()

ada = AdaBoostClassifier(n_estimators=1000,
                         estimator=tree,
                         learning_rate=0.01,
                         random_state=42
                        )

ada.fit(X_train, y_train)

In [None]:
# Train the model
y_pred_train = ada.predict(X_train)

In [None]:
# Evaluate model on training data
print(confusion_matrix(y_train, y_pred_train))

In [None]:
# Make predictions on validation data
y_pred_val = ada.predict(X_val)
cm = confusion_matrix(y_val, y_pred_val)
#pretty plot
plt.figure(figsize=(2,2))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')

# Add labels (depends on your problem)
plt.xlabel('Predicted')
plt.ylabel('Actual')

plt.show()

In [None]:
[[TN,FP],[FN,TP]] = cm
Accuracy = (TN + TP) / (TN + FP + FN + TP)
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
F1_score = 2 * (Precision * Recall) / (Precision + Recall)
maybe_acc = Accuracy - 0.003
target = 0.9547778
print(f"cm:\n{cm}\nAccuracy:\t{Accuracy:.7f}\nPrecision:\t{Precision:.7f}\nRecall:\t\t{Recall:.7f}\nF1_score:\t{F1_score:.7f}\n\nMaybe Acc:\t{maybe_acc:.7f}\nTarget:\t\t{target:.7f}")


In [None]:
# Make prediction
y_pred_test = ada.predict(test_data)
y_pred_test

#### RandomizedSearchCV 

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': range(50, 500, 50),
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0],
    'base_estimator__max_depth': range(1, 10),
    'base_estimator__criterion': ['gini', 'entropy']
}

# Create a ada boost classifier object
ada_clf = AdaBoostClassifier(estimator=tree, random_state=42)

# Create a RandomizedSearchCV object
random_search = RandomizedSearchCV(ada_clf,
                                   param_distributions=param_grid,
                                   n_iter=100,
                                   scoring='accuracy',
                                   n_jobs=-1,
                                   cv=5,
                                   verbose=2,
                                   random_state=42)

# Fit it to the data
random_search.fit(X_train, y_train)


In [None]:
# Print the best parameters and the corresponding score
print("Best parameters:", random_search.best_params_)
print("Best score:", random_search.best_score_)

#### best model

In [None]:
my_params=ada_clf.get_params()
my_params

In [None]:
dt_params = {
    'max_depth': my_params.get('base_estimator__max_depth'),
    'criterion': my_params.get('base_estimator__criterion')
}

base_estimator = DecisionTreeClassifier(**dt_params, random_state=42)

In [None]:
ada_params = {
    'n_estimators': my_params.get('n_estimators'),
    'learning_rate': my_params.get('learning_rate')
}

updated_ada = AdaBoostClassifier(base_estimator=base_estimator, **ada_params)

updated_ada.fit(X_train, y_train)

In [None]:
# Train the model
y_pred_train_updated = updated_ada.predict(X_train)

In [None]:
# Evaluate model on training data
print(confusion_matrix(y_train, y_pred_train_updated))

In [None]:
# Make predictions on validation data
y_pred_val_updated = updated_ada.predict(X_val)
cm_updated = confusion_matrix(y_val, y_pred_val_updated)

#pretty plot
plt.figure(figsize=(2,2))
sns.heatmap(cm_updated, annot=True, fmt='d', cmap='Blues')
# Add labels (depends on your problem)
plt.xlabel('Predicted')
plt.ylabel('Actual')

plt.show()

In [None]:
[[TN,FP],[FN,TP]] = cm_updated
Accuracy = (TN + TP) / (TN + FP + FN + TP)
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
F1_score = 2 * (Precision * Recall) / (Precision + Recall)
maybe_acc = Accuracy - 0.003
target = 0.9555081
print(f"cm:\n{cm_updated}\nAccuracy:\t{Accuracy:.7f}\nPrecision:\t{Precision:.7f}\nRecall:\t\t{Recall:.7f}\nF1_score:\t{F1_score:.7f}\n\nMaybe Acc:\t{maybe_acc:.7f}\nTarget:\t\t{target:.7f}")


In [None]:
# Make prediction
y_pred_test_updated_ada = updated_ada.predict(test_data)
y_pred_test_updated_ada


In [None]:
results_to_csv(y_pred_test_updated_ada, "y_pred_test_updated_ada")


#### Including normalization & pca

In [17]:
#last, join the two datasets for train and the two datasets for test
#train
train_df = traveldata_train_equalized.merge(surveydata_train_equalized, on='ID')
#test
test_df = traveldata_test_equalized.merge(surveydata_test_equalized, on='ID')

In [31]:
#drop target
X = train_df.drop('Overall_Experience', axis=1)
y = train_df['Overall_Experience']
ID_col = train_df['ID']

In [19]:
scaler = StandardScaler()

# Scale objects
#train data
X_train_scaled = scaler.fit_transform(X_train)

#test data
test_scaled = scaler.fit_transform(test_df)

# Convert the result back to a dataframe
#train
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)

#test
test_scaled = pd.DataFrame(test_scaled, columns=test_df.columns)

In [20]:
# Apply PCA to the features in the training data
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)

# Apply the same PCA transformation to the test data
X_test_pca = pca.transform(test_scaled)

In [21]:
base_estimator = DecisionTreeClassifier(criterion='entropy', random_state=42)

In [22]:
updated_ada = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=350, learning_rate=0.0999999999)

updated_ada.fit(X_train_pca, y_train)

In [24]:
# Train the model
y_pred_train_updated = updated_ada.predict(X_train_pca)

In [25]:
# Evaluate model on training data
print(confusion_matrix(y_train, y_pred_train_updated))

[[42786     0]
 [    0 51593]]


In [33]:
# Make prediction
y_pred_test_updated_ada = updated_ada.predict(X_test_pca)
y_pred_test_updated_ada


array([1., 1., 1., ..., 1., 1., 0.])

In [38]:
results_to_csv(y_pred_test_updated_ada, "y_pred_test_updated_ada")

         ID  Overall_Experience
0  99900001                   1
1  99900002                   1
2  99900003                   1
3  99900004                   0
4  99900005                   1
