# **Predictive Analytics: Shinkansen Passenger Satisfaction**

## Data Preprocessing

**1. Import necessary libraries**

In [320]:
import pandas as pd
import regex as re
from sklearn.impute import KNNImputer
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


warnings.simplefilter("ignore")

**2. Load the training and test data separately**

In [321]:
#train data
surveydata_train = pd.read_csv("Surveydata_train.csv")
traveldata_train = pd.read_csv("Traveldata_train.csv")
#test data
surveydata_test = pd.read_csv("Surveydata_test.csv")
traveldata_test = pd.read_csv("Traveldata_test.csv")

**3. Understand the data (check for each of the following in both the train and test dataset)**
<ol>
<li>Check a sample of the data</li>
<li>Use the info() and describe() functions for more information</li>
<li>Look for the presence of null values in the dataset</li>
<li>Look for the presence of bad data or unwanted characters like "$" or "#" in the numerical columns</li>
</ol>

In [322]:
#A. Check a sample of the data
surveydata_train.sample(5)

Unnamed: 0,ID,Overall_Experience,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
76932,98876933,1,Excellent,Ordinary,Excellent,Excellent,Very Convenient,Acceptable,Excellent,Excellent,Excellent,Excellent,Good,Excellent,Good,Excellent,Acceptable
25894,98825895,0,Acceptable,Ordinary,Excellent,Acceptable,Convenient,Needs Improvement,Good,Excellent,Poor,Poor,Acceptable,Poor,Good,Poor,Acceptable
82067,98882068,1,Poor,Green Car,Poor,Poor,Inconvenient,Good,Poor,Acceptable,Excellent,Excellent,Excellent,Acceptable,Poor,Excellent,Excellent
12312,98812313,1,Good,Ordinary,Good,Good,Manageable,Excellent,Good,Needs Improvement,Excellent,Acceptable,Needs Improvement,Acceptable,Acceptable,Poor,Excellent
40505,98840506,0,Needs Improvement,Green Car,Good,Needs Improvement,Inconvenient,Excellent,Needs Improvement,Excellent,Excellent,Needs Improvement,Acceptable,Acceptable,Good,Needs Improvement,Excellent


In [323]:
#A. Check a sample of the data
traveldata_train.sample(5)

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins
7917,98807918,Female,Loyal Customer,11.0,Business Travel,Eco,2193,3.0,14.0
7127,98807128,Male,Loyal Customer,35.0,,Eco,1282,14.0,2.0
42681,98842682,Male,Loyal Customer,55.0,Personal Travel,Eco,1105,75.0,66.0
44388,98844389,Female,Loyal Customer,53.0,,Business,1999,7.0,0.0
29272,98829273,Female,Loyal Customer,47.0,Business Travel,Business,1177,68.0,84.0


In [324]:
#B. Use the info() and describe() functions for more information
surveydata_train.info()
surveydata_train.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   ID                       94379 non-null  int64 
 1   Overall_Experience       94379 non-null  int64 
 2   Seat_Comfort             94318 non-null  object
 3   Seat_Class               94379 non-null  object
 4   Arrival_Time_Convenient  85449 non-null  object
 5   Catering                 85638 non-null  object
 6   Platform_Location        94349 non-null  object
 7   Onboard_Wifi_Service     94349 non-null  object
 8   Onboard_Entertainment    94361 non-null  object
 9   Online_Support           94288 non-null  object
 10  Ease_of_Online_Booking   94306 non-null  object
 11  Onboard_Service          86778 non-null  object
 12  Legroom                  94289 non-null  object
 13  Baggage_Handling         94237 non-null  object
 14  CheckIn_Service          94302 non-nul

Unnamed: 0,ID,Overall_Experience
count,94379.0,94379.0
mean,98847190.0,0.546658
std,27245.01,0.497821
min,98800000.0,0.0
25%,98823600.0,0.0
50%,98847190.0,1.0
75%,98870780.0,1.0
max,98894380.0,1.0


In [325]:
#B. Use the info() and describe() functions for more information
traveldata_train.info()
traveldata_train.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       94379 non-null  int64  
 1   Gender                   94302 non-null  object 
 2   Customer_Type            85428 non-null  object 
 3   Age                      94346 non-null  float64
 4   Type_Travel              85153 non-null  object 
 5   Travel_Class             94379 non-null  object 
 6   Travel_Distance          94379 non-null  int64  
 7   Departure_Delay_in_Mins  94322 non-null  float64
 8   Arrival_Delay_in_Mins    94022 non-null  float64
dtypes: float64(3), int64(2), object(4)
memory usage: 6.5+ MB


Unnamed: 0,ID,Age,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins
count,94379.0,94346.0,94379.0,94322.0,94022.0
mean,98847190.0,39.419647,1978.888185,14.647092,15.005222
std,27245.01,15.116632,1027.961019,38.138781,38.439409
min,98800000.0,7.0,50.0,0.0,0.0
25%,98823600.0,27.0,1359.0,0.0,0.0
50%,98847190.0,40.0,1923.0,0.0,0.0
75%,98870780.0,51.0,2538.0,12.0,13.0
max,98894380.0,85.0,6951.0,1592.0,1584.0


In [326]:
#C. Look for the presence of null values in the dataset
surveydata_train.isnull().values.any()

True

In [327]:
#C. Look for the presence of null values in the dataset
traveldata_train.isnull().values.any()

True

In [328]:
#D. Look for the presence of bad data or unwanted characters like "$" or "#" in the numerical columns
surveydata_train.describe().columns.astype(str).str.contains("($|#)").any()

True

In [329]:
#D. Look for the presence of bad data or unwanted characters like "$" or "#" in the numerical columns
traveldata_train.describe().columns.astype(str).str.contains("($|#)").any()

True

**4. Clean the data**
<ol>
<li>Treat for missing values in both the train & test set</li>
<li>Remove bad data values in both the train & test set</li>
<li>Encode the categorical object variables in both the train & test set</li>
<li>Perform Feature Engineering if necessary</li>
<li>Scale/Normalize the dataset if necessary</li>
</ol>

In [330]:
def dataframe_cleaning(df):
    # Before cleaning any data, it is important to transform Categorical values to numerical values
    # Retrieve categorical columns, which have data type as "object"
    df_object_columns = df.select_dtypes(include=['object']).columns
    # One-hot encoding for categorical variables
    df_encoded = pd.get_dummies(df, columns=df_object_columns, dummy_na=True)
    #A. Treat for missing values in both the train & test set
    imputer = KNNImputer(n_neighbors=3)
    #np array is created
    df_imputed = imputer.fit_transform(df_encoded)
    #back to dataframe
    df_without_nans = pd.DataFrame(data=df_imputed, columns=df_encoded.columns)
    
    return df_without_nans

In [331]:
#train data
surveydata_train_clean = dataframe_cleaning(surveydata_train)
traveldata_train_clean = dataframe_cleaning(traveldata_train)
#test data
surveydata_test_clean = dataframe_cleaning(surveydata_test)
traveldata_test_clean = dataframe_cleaning(traveldata_test)

In [332]:
#if a column is not present on the test set, then it is not important in the train set. Sans the target variable "Overall_Experience"
def shape_equalizer(df1, df2):
    """train, then test"""
    df1_columns = df1.columns
    df2_columns = df2.columns
    difference = list(set(df1_columns).difference(set(df2_columns)))
    if "Overall_Experience" in difference:
        difference.pop(difference.index("Overall_Experience"))
    df1 = df1.drop(difference, axis=1)
    return df1, df2

In [333]:
#survey data
surveydata_train_equalized, surveydata_test_equalized = shape_equalizer(surveydata_train_clean, surveydata_test_clean)
#travel data
traveldata_train_equalized, traveldata_test_equalized = shape_equalizer(traveldata_train_clean, traveldata_test_clean)

In [334]:
if surveydata_train_equalized.shape[0] == traveldata_train_equalized.shape[0] and surveydata_test_equalized.shape[0] == traveldata_test_equalized.shape[0]:
    print("Same number of rows between survey and travel data sets.")

Same number of rows between survey and travel data sets.


In [335]:
# -1 due to target column "Overall_Experience"
if surveydata_train_equalized.shape[1]-1 == surveydata_test_equalized.shape[1] and traveldata_train_equalized.shape[1] == traveldata_test_equalized.shape[1]:
    print("Same number of columns between test and train data sets.")

Same number of columns between test and train data sets.


In [336]:
#last, join the two datasets for train and the two datasets for test
#train
train_data = traveldata_train_equalized.merge(surveydata_train_equalized, on='ID')
#test
test_data = traveldata_test_equalized.merge(surveydata_test_equalized, on='ID')

## Model Building

**4. Algorithms to try for this model:**
<ol>
<li>random forest</li>
<li>logistic regression</li>
<li>xgboost</li>
<li>KNN</li>
</ol>

In [337]:
# train, test, split for all
X = train_data.drop('Overall_Experience', axis=1)
y = train_data['Overall_Experience']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [356]:
# save function for all
def results_to_csv(y_pred, var_name):
    data = test_data.copy()
    data['Overall_Experience'] = y_pred
    result = data[['ID', 'Overall_Experience']]
    #to integers
    result[['ID', 'Overall_Experience']] = result[['ID', 'Overall_Experience']].astype(int)
    #print head
    print(result.head(5))
    #save as csv
    name = "_".join(var_name.split('_')[-2:]) + '_result.csv'
    return result.to_csv(name, index=False)

#### random forest

In [277]:
# Fit the model
random_forest_model = RandomForestClassifier()
random_forest_model.fit(X_train, y_train)

In [278]:
# Train the model
y_pred_train_random_forest = random_forest_model.predict(X_train)

In [284]:
# Evaluate model on training data
print(confusion_matrix(y_train, y_pred_train_random_forest))

[[34381     0]
 [    0 41122]]


In [285]:
# Make predictions on validation data
y_pred_val_random_forest = random_forest_model.predict(X_val)
print(confusion_matrix(y_val, y_pred_val_random_forest))

[[8014  391]
 [ 512 9959]]


In [317]:
# Make prediction
y_pred_test_random_forest = random_forest_model.predict(test_data)
y_pred_test_random_forest

array([1., 1., 1., ..., 0., 1., 0.])

In [319]:
results_to_csv(y_pred, var_name)


Unnamed: 0,ID,Overall_Experience
0,99900001,1
1,99900002,1
2,99900003,1
3,99900004,0
4,99900005,1


#### logistic regression

In [341]:
# Fit the model
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)

In [342]:
# Train the model
y_pred_train_logistic_regression = logistic_regression_model.predict(X_train)

In [343]:
# Evaluate model on training data
print(confusion_matrix(y_train, y_pred_train_logistic_regression))

[[    0 34381]
 [    0 41122]]


In [344]:
# Make prediction
y_pred_test_logistic_regression = logistic_regression_model.predict(test_data)
y_pred_test_logistic_regression

array([1., 1., 1., ..., 1., 1., 1.])

#### xgboost

In [270]:
# Fit the model

In [271]:
# Train the model

In [None]:
# Evaluate model on training data

In [272]:
# Make predictions

#### KNN

In [270]:
# Fit the model

In [271]:
# Train the model

In [None]:
# Evaluate model on training data

In [272]:
# Make predictions

## File to submit