# **Predictive Analytics: Shinkansen Passenger Satisfaction**

## Data Preprocessing

**1. Import necessary libraries**

In [4]:
import pandas as pd
import numpy as np
import regex as re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import warnings
warnings.simplefilter("ignore")

**2. Load the training and test data separately**

In [5]:
#train data
surveydata_train = pd.read_csv("Surveydata_train.csv")
traveldata_train = pd.read_csv("Traveldata_train.csv")
#test data
surveydata_test = pd.read_csv("Surveydata_test.csv")
traveldata_test = pd.read_csv("Traveldata_test.csv")

**3. Understand the data (check for each of the following in both the train and test dataset)**
<ol>
<li>Check a sample of the data</li>
<li>Use the info() and describe() functions for more information</li>
<li>Look for the presence of null values in the dataset</li>
<li>Look for the presence of bad data or unwanted characters like "$" or "#" in the numerical columns</li>
</ol>

In [6]:
#A. Check a sample of the data
surveydata_train.sample(5)

Unnamed: 0,ID,Overall_Experience,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
91650,98891651,1,Extremely Poor,Ordinary,Excellent,Extremely Poor,Needs Improvement,Needs Improvement,Acceptable,Good,Poor,Poor,Extremely Poor,Good,Acceptable,Poor,Acceptable
19026,98819027,1,Extremely Poor,Green Car,Excellent,Extremely Poor,Convenient,Good,Excellent,Good,Poor,Poor,Extremely Poor,Poor,Acceptable,Poor,Acceptable
18222,98818223,0,Needs Improvement,Green Car,Acceptable,Acceptable,Manageable,Good,Needs Improvement,Acceptable,Needs Improvement,Needs Improvement,Needs Improvement,Needs Improvement,Needs Improvement,Needs Improvement,Good
77553,98877554,1,Poor,Ordinary,Poor,Poor,Inconvenient,Needs Improvement,Good,Excellent,Good,Good,Good,Good,Good,Good,Acceptable
30953,98830954,0,Acceptable,Ordinary,Acceptable,Acceptable,Manageable,Acceptable,Good,Acceptable,Poor,Good,Acceptable,Acceptable,Acceptable,Poor,Acceptable


In [7]:
#A. Check a sample of the data
traveldata_train.sample(5)

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins
34787,98834788,Male,Loyal Customer,21.0,Personal Travel,Eco,2028,73.0,77.0
94215,98894216,Male,Loyal Customer,24.0,Business Travel,Business,4199,29.0,10.0
20144,98820145,Male,Loyal Customer,55.0,Personal Travel,Eco,1909,0.0,12.0
88457,98888458,Female,Loyal Customer,31.0,Business Travel,Business,3396,0.0,0.0
52787,98852788,Male,Loyal Customer,16.0,Personal Travel,Eco,2780,12.0,0.0


In [8]:
#B. Use the info() and describe() functions for more information
surveydata_train.info()
surveydata_train.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   ID                       94379 non-null  int64 
 1   Overall_Experience       94379 non-null  int64 
 2   Seat_Comfort             94318 non-null  object
 3   Seat_Class               94379 non-null  object
 4   Arrival_Time_Convenient  85449 non-null  object
 5   Catering                 85638 non-null  object
 6   Platform_Location        94349 non-null  object
 7   Onboard_Wifi_Service     94349 non-null  object
 8   Onboard_Entertainment    94361 non-null  object
 9   Online_Support           94288 non-null  object
 10  Ease_of_Online_Booking   94306 non-null  object
 11  Onboard_Service          86778 non-null  object
 12  Legroom                  94289 non-null  object
 13  Baggage_Handling         94237 non-null  object
 14  CheckIn_Service          94302 non-nul

Unnamed: 0,ID,Overall_Experience
count,94379.0,94379.0
mean,98847190.0,0.546658
std,27245.01,0.497821
min,98800000.0,0.0
25%,98823600.0,0.0
50%,98847190.0,1.0
75%,98870780.0,1.0
max,98894380.0,1.0


In [9]:
#B. Use the info() and describe() functions for more information
traveldata_train.info()
traveldata_train.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94379 entries, 0 to 94378
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       94379 non-null  int64  
 1   Gender                   94302 non-null  object 
 2   Customer_Type            85428 non-null  object 
 3   Age                      94346 non-null  float64
 4   Type_Travel              85153 non-null  object 
 5   Travel_Class             94379 non-null  object 
 6   Travel_Distance          94379 non-null  int64  
 7   Departure_Delay_in_Mins  94322 non-null  float64
 8   Arrival_Delay_in_Mins    94022 non-null  float64
dtypes: float64(3), int64(2), object(4)
memory usage: 6.5+ MB


Unnamed: 0,ID,Age,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins
count,94379.0,94346.0,94379.0,94322.0,94022.0
mean,98847190.0,39.419647,1978.888185,14.647092,15.005222
std,27245.01,15.116632,1027.961019,38.138781,38.439409
min,98800000.0,7.0,50.0,0.0,0.0
25%,98823600.0,27.0,1359.0,0.0,0.0
50%,98847190.0,40.0,1923.0,0.0,0.0
75%,98870780.0,51.0,2538.0,12.0,13.0
max,98894380.0,85.0,6951.0,1592.0,1584.0


In [10]:
#C. Look for the presence of null values in the dataset
surveydata_train.isnull().values.any()

True

In [11]:
#C. Look for the presence of null values in the dataset
traveldata_train.isnull().values.any()

True

In [12]:
#D. Look for the presence of bad data or unwanted characters like "$" or "#" in the numerical columns
surveydata_train.describe().columns.astype(str).str.contains("($|#)").any()

True

In [13]:
#D. Look for the presence of bad data or unwanted characters like "$" or "#" in the numerical columns
traveldata_train.describe().columns.astype(str).str.contains("($|#)").any()

True

In [14]:
surveydata_train = surveydata_train.replace({'#': '', '$': ''}, regex=True)
traveldata_train = traveldata_train.replace({'#': '', '$': ''}, regex=True)

**4. Clean the data**
<ol>
<li>Treat for missing values in both the train & test set</li>
<li>Remove bad data values in both the train & test set</li>
<li>Encode the categorical object variables in both the train & test set</li>
<li>Perform Feature Engineering if necessary</li>
<li>Scale/Normalize the dataset if necessary</li>
</ol>

In [109]:
def dataframe_cleaning(df):
    # Before cleaning any data, it is important to transform Categorical values to numerical values
    # Retrieve categorical columns, which have data type as "object"
    df_object_columns = df.select_dtypes(include=['object']).columns
    # One-hot encoding for categorical variables
    df_encoded = pd.get_dummies(df, columns=df_object_columns, dummy_na=True)
    #A. Treat for missing values in both the train & test set
    imputer = KNNImputer(n_neighbors=20, 
                         weights='uniform', 
                         metric='nan_euclidean', 
                         missing_values=np.nan, 
                         add_indicator=False)
    #np array is created
    df_imputed = imputer.fit_transform(df_encoded)
    #back to dataframe
    df_without_nans = pd.DataFrame(data=df_imputed, columns=df_encoded.columns)
    
    return df_without_nans

In [110]:
#train data
surveydata_train_clean = dataframe_cleaning(surveydata_train)
traveldata_train_clean = dataframe_cleaning(traveldata_train)
#test data
surveydata_test_clean = dataframe_cleaning(surveydata_test)
traveldata_test_clean = dataframe_cleaning(traveldata_test)

In [111]:
#if a column is not present on the test set, then it is not important in the train set. Sans the target variable "Overall_Experience"
def shape_equalizer(df1, df2):
    """train, then test"""
    df1_columns = df1.columns
    df2_columns = df2.columns
    difference = list(set(df1_columns).difference(set(df2_columns)))
    if "Overall_Experience" in difference:
        difference.pop(difference.index("Overall_Experience"))
    df1 = df1.drop(difference, axis=1)
    return df1, df2

In [112]:
#survey data
surveydata_train_equalized, surveydata_test_equalized = shape_equalizer(surveydata_train_clean, surveydata_test_clean)
#travel data
traveldata_train_equalized, traveldata_test_equalized = shape_equalizer(traveldata_train_clean, traveldata_test_clean)

In [113]:
if surveydata_train_equalized.shape[0] == traveldata_train_equalized.shape[0] and surveydata_test_equalized.shape[0] == traveldata_test_equalized.shape[0]:
    print("Same number of rows between survey and travel data sets.")

Same number of rows between survey and travel data sets.


In [114]:
# -1 due to target column "Overall_Experience"
if surveydata_train_equalized.shape[1]-1 == surveydata_test_equalized.shape[1] and traveldata_train_equalized.shape[1] == traveldata_test_equalized.shape[1]:
    print("Same number of columns between test and train data sets.")

Same number of columns between test and train data sets.


In [200]:
#last, join the two datasets for train and the two datasets for test
#train
train_data = traveldata_train_equalized.merge(surveydata_train_equalized, on='ID')
#test
test_data = traveldata_test_equalized.merge(surveydata_test_equalized, on='ID')

In [108]:
train_data

Unnamed: 0,ID,Age,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Gender_Female,Gender_Male,Gender_nan,Customer_Type_Disloyal Customer,Customer_Type_Loyal Customer,...,Cleanliness_Needs Improvement,Cleanliness_Poor,Cleanliness_nan,Online_Boarding_Acceptable,Online_Boarding_Excellent,Online_Boarding_Extremely Poor,Online_Boarding_Good,Online_Boarding_Needs Improvement,Online_Boarding_Poor,Online_Boarding_nan
0,98800001.0,52.0,272.0,0.0,5.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,98800002.0,48.0,2200.0,9.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,98800003.0,43.0,1061.0,77.0,119.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,98800004.0,44.0,780.0,13.0,18.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,98800005.0,50.0,1981.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94374,98894375.0,32.0,1357.0,83.0,125.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
94375,98894376.0,44.0,592.0,5.0,11.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
94376,98894377.0,63.0,2794.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
94377,98894378.0,16.0,2744.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Model Building

In [174]:
# train, test, split for all
X = train_data.drop('Overall_Experience', axis=1)
y = train_data['Overall_Experience']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [117]:
# save function for all
def results_to_csv(y_pred, var_name):
    data = test_data.copy()
    data['Overall_Experience'] = y_pred
    result = data[['ID', 'Overall_Experience']]
    #to integers
    result[['ID', 'Overall_Experience']] = result[['ID', 'Overall_Experience']].astype(int)
    #print head
    print(result.head(5))
    #save as csv
    name = "_".join(var_name.split('_')[-2:]) + '_result.csv'
    return result.to_csv(name, index=False)

In [180]:
sc = RobustScaler()
X2 = sc.fit_transform(X)
test_data2 = sc.transform(test_data)

In [119]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

### linear regression

In [209]:
# create model
left_clf = LogisticRegression(C=0.00005,
                            solver='liblinear',
                            penalty='l1',
                            class_weight=None,
                            fit_intercept=True
                           )
# fit model
left_clf.fit(X2, y)
# train the model
y_pred_train_left_clf = left_clf.predict(X2)

In [210]:
# create model
middle_clf = LogisticRegression(C=0.00005,
                            solver='lbfgs',
                            penalty='l2',
                            class_weight=None,
                            fit_intercept=True
                           )
# fit model
middle_clf.fit(X2, y)
# train the model
y_pred_train_middle_clf = middle_clf.predict(X2)

In [211]:
# create model
right_clf = LogisticRegression(C=0.00005,
                            solver='sag',
                            penalty='l2',
                            class_weight=None,
                            fit_intercept=True
                           )
# fit model
right_clf.fit(X, y)
# train the model
y_pred_train_right_clf = right_clf.predict(X)

In [212]:
# Get the predicted probabilities for the positive class
probs1 = left_clf.predict_proba(test_data2)[:, 1]
probs2 = middle_clf.predict_proba(test_data2)[:, 1]
probs3 = right_clf.predict_proba(test_data)[:, 1]

In [225]:
# Average the predicted probabilities
avg_probs = np.mean([probs1, probs2, probs3], axis=0)

# Convert averaged probabilities to class predictions
predictions = np.where(avg_probs > 0.499, 1, 0)
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [228]:
data = test_data.copy()
data['Overall_Experience'] = predictions
result = data[['ID', 'Overall_Experience']]
result[['ID', 'Overall_Experience']] = result[['ID', 'Overall_Experience']].astype(int)

In [196]:
#save as csv
name = "_".join(var_name.split('_')[-2:]) + 'predictions_result.csv'
return result.to_csv(name, index=False)


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices