#Problem Statement

The goal of the problem is to predict whether a passenger was satisfied or not considering their overall experience of traveling on the Shinkansen Bullet Train.

#Loading Data & Libraries

In [1]:
import warnings # Used to ignore the warning given as output of the code
warnings.filterwarnings('ignore')

import numpy as np  # Basic libraries of python for numeric and dataframe computations
import pandas as pd

import matplotlib.pyplot as plt # Basic library for data visualization
import seaborn as sns # Slightly advanced library for data visualization

from collections import defaultdict # A dictionary output that does not raise a key error

from sklearn.metrics import mean_squared_error  # A performance metrics in sklearn

In [None]:
#Training Data
survey_train = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/Hackathon/Surveydata_train.csv')
travel_train = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/Hackathon/Traveldata_train.csv')

In [None]:
#Test Data
survey_test = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/Hackathon/Surveydata_test.csv')
travel_test = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/Hackathon/Traveldata_test.csv')

In [None]:
display(survey_train.head(1),travel_train.head(1),survey_test.head(1),travel_test.head(1))

Unnamed: 0,ID,Overall_Experience,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
0,98800001,0,Needs Improvement,Green Car,Excellent,Excellent,Very Convenient,Good,Needs Improvement,Acceptable,Needs Improvement,Needs Improvement,Acceptable,Needs Improvement,Good,Needs Improvement,Poor


Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins
0,98800001,Female,Loyal Customer,52.0,,Business,272,0.0,5.0


Unnamed: 0,ID,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
0,99900001,Acceptable,Green Car,Acceptable,Acceptable,Manageable,Needs Improvement,Excellent,Good,Excellent,Excellent,Excellent,Excellent,Good,Excellent,Poor


Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins
0,99900001,Female,,36.0,Business Travel,Business,532,0.0,0.0


In [None]:
print(survey_train.shape,travel_train.shape,survey_test.shape,travel_test.shape)

(94379, 17) (94379, 9) (35602, 16) (35602, 9)


#Cleanup & EDA

In [None]:
#merge datasets on ID
merged_data_train = pd.merge(travel_train, survey_train, on='ID', how = 'left')

#for some unfathomable reason, the test sets will not merge properly
merged_data_test = pd.merge(travel_test, survey_test, on=['ID'], how = 'left')

In [None]:
merged_data_train.head(1)

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Overall_Experience,...,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
0,98800001,Female,Loyal Customer,52.0,,Business,272,0.0,5.0,0,...,Good,Needs Improvement,Acceptable,Needs Improvement,Needs Improvement,Acceptable,Needs Improvement,Good,Needs Improvement,Poor


In [None]:
merged_data_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94379 entries, 0 to 94378
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       94379 non-null  int64  
 1   Gender                   94302 non-null  object 
 2   Customer_Type            85428 non-null  object 
 3   Age                      94346 non-null  float64
 4   Type_Travel              85153 non-null  object 
 5   Travel_Class             94379 non-null  object 
 6   Travel_Distance          94379 non-null  int64  
 7   Departure_Delay_in_Mins  94322 non-null  float64
 8   Arrival_Delay_in_Mins    94022 non-null  float64
 9   Overall_Experience       94379 non-null  int64  
 10  Seat_Comfort             94318 non-null  object 
 11  Seat_Class               94379 non-null  object 
 12  Arrival_Time_Convenient  85449 non-null  object 
 13  Catering                 85638 non-null  object 
 14  Platform_Location     

In [None]:
merged_data_train.describe(include = 'all')

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Overall_Experience,...,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
count,94379.0,94302,85428,94346.0,85153,94379,94379.0,94322.0,94022.0,94379.0,...,94349,94361,94288,94306,86778,94289,94237,94302,94373,94373
unique,,2,2,,2,2,,,,,...,6,6,6,6,6,6,5,6,6,6
top,,Female,Loyal Customer,,Business Travel,Eco,,,,,...,Good,Good,Good,Good,Good,Good,Good,Good,Good,Good
freq,,47815,69823,,58617,49342,,,,,...,22835,30446,30016,28909,27265,28870,34944,26502,35427,25533
mean,98847190.0,,,39.419647,,,1978.888185,14.647092,15.005222,0.546658,...,,,,,,,,,,
std,27245.01,,,15.116632,,,1027.961019,38.138781,38.439409,0.497821,...,,,,,,,,,,
min,98800000.0,,,7.0,,,50.0,0.0,0.0,0.0,...,,,,,,,,,,
25%,98823600.0,,,27.0,,,1359.0,0.0,0.0,0.0,...,,,,,,,,,,
50%,98847190.0,,,40.0,,,1923.0,0.0,0.0,1.0,...,,,,,,,,,,
75%,98870780.0,,,51.0,,,2538.0,12.0,13.0,1.0,...,,,,,,,,,,


In [None]:
merged_data_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35602 entries, 0 to 35601
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       35602 non-null  int64  
 1   Gender                   35572 non-null  object 
 2   Customer_Type            32219 non-null  object 
 3   Age                      35591 non-null  float64
 4   Type_Travel              32154 non-null  object 
 5   Travel_Class             35602 non-null  object 
 6   Travel_Distance          35602 non-null  int64  
 7   Departure_Delay_in_Mins  35573 non-null  float64
 8   Arrival_Delay_in_Mins    35479 non-null  float64
 9   Seat_Comfort             35580 non-null  object 
 10  Seat_Class               35602 non-null  object 
 11  Arrival_Time_Convenient  32277 non-null  object 
 12  Catering                 32245 non-null  object 
 13  Platform_Location        35590 non-null  object 
 14  Onboard_Wifi_Service  

In [None]:
#Understanding missing data
# summarize the number of rows with missing values for each column
merged_data_train.isnull().sum()/merged_data_train.count()*100
 #seems like most of the missing data is just the catering,service, and arrival times. The rest is >1% missing
 #one would naïvely think that those features correlate with subjective experience, but who knows...
 #missing records amounting to less than 10% can just be imputed using the most common entry. 
 #After attempting most frequent entry imputation, we also utilized KNN imputation via the FancyImpute package

ID                          0.000000
Gender                      0.081653
Customer_Type              10.477829
Age                         0.034978
Type_Travel                10.834615
Travel_Class                0.000000
Travel_Distance             0.000000
Departure_Delay_in_Mins     0.060431
Arrival_Delay_in_Mins       0.379698
Overall_Experience          0.000000
Seat_Comfort                0.064675
Seat_Class                  0.000000
Arrival_Time_Convenient    10.450678
Catering                   10.206917
Platform_Location           0.031797
Onboard_Wifi_Service        0.031797
Onboard_Entertainment       0.019076
Online_Support              0.096513
Ease_of_Online_Booking      0.077408
Onboard_Service             8.759132
Legroom                     0.095451
Baggage_Handling            0.150684
CheckIn_Service             0.081653
Cleanliness                 0.006358
Online_Boarding             0.006358
dtype: float64

In [None]:
#exclude output variable from pipeline, avoiding overfitting
merged_data_train2=merged_data_train.drop('Overall_Experience', axis=1)

#checking balance of data, looking at number of entries for each feature
for i in range(1,23):
  if merged_data_train2.iloc[:,i].dtype == 'object':
    display(merged_data_train2.iloc[:,i].value_counts())

Female    47815
Male      46487
Name: Gender, dtype: int64

Loyal Customer       69823
Disloyal Customer    15605
Name: Customer_Type, dtype: int64

Business Travel    58617
Personal Travel    26536
Name: Type_Travel, dtype: int64

Eco         49342
Business    45037
Name: Travel_Class, dtype: int64

Acceptable           21158
Needs Improvement    20946
Good                 20595
Poor                 15185
Excellent            12971
Extremely Poor        3463
Name: Seat_Comfort, dtype: int64

Green Car    47435
Ordinary     46944
Name: Seat_Class, dtype: int64

Good                 19574
Excellent            17684
Acceptable           15177
Needs Improvement    14990
Poor                 13692
Extremely Poor        4332
Name: Arrival_Time_Convenient, dtype: int64

Acceptable           18468
Needs Improvement    17978
Good                 17969
Poor                 13858
Excellent            13455
Extremely Poor        3910
Name: Catering, dtype: int64

Manageable           24173
Convenient           21912
Needs Improvement    17832
Inconvenient         16449
Very Convenient      13981
Very Inconvenient        2
Name: Platform_Location, dtype: int64

Good                 22835
Excellent            20968
Acceptable           20118
Needs Improvement    19596
Poor                 10741
Extremely Poor          91
Name: Onboard_Wifi_Service, dtype: int64

Good                 30446
Excellent            21644
Acceptable           17560
Needs Improvement    13926
Poor                  8641
Extremely Poor        2144
Name: Onboard_Entertainment, dtype: int64

Good                 30016
Excellent            25894
Acceptable           15702
Needs Improvement    12508
Poor                 10167
Extremely Poor           1
Name: Online_Support, dtype: int64

Good                 28909
Excellent            24744
Acceptable           16390
Needs Improvement    14479
Poor                  9768
Extremely Poor          16
Name: Ease_of_Online_Booking, dtype: int64

Good                 27265
Excellent            21272
Acceptable           18071
Needs Improvement    11390
Poor                  8776
Extremely Poor           4
Name: Onboard_Service, dtype: int64

Good                 28870
Excellent            24832
Acceptable           16384
Needs Improvement    15753
Poor                  8110
Extremely Poor         340
Name: Legroom, dtype: int64

Good                 34944
Excellent            26003
Acceptable           17767
Needs Improvement     9759
Poor                  5764
Name: Baggage_Handling, dtype: int64

Good                 26502
Acceptable           25803
Excellent            19641
Needs Improvement    11218
Poor                 11137
Extremely Poor           1
Name: CheckIn_Service, dtype: int64

Good                 35427
Excellent            26053
Acceptable           17449
Needs Improvement     9806
Poor                  5633
Extremely Poor           5
Name: Cleanliness, dtype: int64

#Imputation of Missing Values

##Imputing with FancyImpute/KNN for Training Data

In [None]:
!pip install fancyimpute
#fancy impures allows easy imputing, scaling

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fancyimpute
  Downloading fancyimpute-0.7.0.tar.gz (25 kB)
Collecting knnimpute>=0.1.0
  Downloading knnimpute-0.1.0.tar.gz (8.3 kB)
Collecting nose
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[K     |████████████████████████████████| 154 kB 20.8 MB/s 
Building wheels for collected packages: fancyimpute, knnimpute
  Building wheel for fancyimpute (setup.py) ... [?25l[?25hdone
  Created wheel for fancyimpute: filename=fancyimpute-0.7.0-py3-none-any.whl size=29899 sha256=3a836dee9ab3adbfdb53833e7c8d848dc7865432250e5b87a22ebda11c2ab734
  Stored in directory: /root/.cache/pip/wheels/e3/04/06/a1a7d89ef4e631ce6268ea2d8cde04f7290651c1ff1025ce68
  Building wheel for knnimpute (setup.py) ... [?25l[?25hdone
  Created wheel for knnimpute: filename=knnimpute-0.1.0-py3-none-any.whl size=11353 sha256=8ddc4a0628b5385d0059a652e55a9c72e1ceca7d8a92d945285ed2349f517102
  Stored in dir

In [None]:

from fancyimpute import KNN
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
#instantiate both packages to use
#trying OrdinalEncoder over LabelEncoder (as above) because of this discussion: 
# https://datascience.stackexchange.com/questions/39317/difference-between-ordinalencoder-and-labelencoder


#get labels for each category
num_ix = merged_data_train2.select_dtypes(include=['int64', 'float64']).columns
cat_ix = merged_data_train2.select_dtypes(include=['object']).columns

encoder = LabelEncoder()
imputer = KNNImputer(n_neighbors = 5)

def encode(data):
    '''function to encode non-null data and replace it in the original data'''
    #retains only non-null values
    nonulls = np.array(data.dropna())
    #reshapes the data for encoding
    impute_reshape = nonulls.reshape(-1,1)
    #encode date
    impute_ordinal = encoder.fit_transform(impute_reshape)
    #Assign back encoded values to non-null values
    data.loc[data.notnull()] = np.squeeze(impute_ordinal)
    return data

In [None]:
#create a for loop to iterate through each column in the data
for i in cat_ix:
    encode(merged_data_train2[i])

In [None]:
merged_data_train2.head(10)

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Seat_Comfort,...,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
0,98800001,0,1,52.0,,0,272,0.0,5.0,4,...,3,4,0,4,4,0,3,3,4,5
1,98800002,1,1,48.0,1.0,1,2200,9.0,0.0,5,...,3,5,3,3,1,4,4,4,3,3
2,98800003,0,1,43.0,0.0,0,1061,77.0,119.0,4,...,4,3,1,1,1,1,1,3,1,1
3,98800004,0,1,44.0,0.0,0,780,13.0,18.0,0,...,0,4,0,0,0,0,0,3,0,0
4,98800005,0,1,50.0,0.0,0,1981,0.0,0.0,0,...,4,3,1,3,3,3,2,3,3,3
5,98800006,1,1,44.0,0.0,0,2810,0.0,0.0,4,...,3,1,3,0,0,3,0,0,0,0
6,98800007,1,1,56.0,1.0,1,2029,0.0,0.0,3,...,3,3,3,3,3,1,3,1,0,3
7,98800008,1,1,65.0,1.0,0,853,0.0,3.0,0,...,1,1,3,3,3,0,2,3,3,1
8,98800009,1,1,22.0,1.0,1,1636,1.0,0.0,4,...,0,4,0,0,0,0,3,0,4,0
9,98800010,1,1,57.0,0.0,0,306,0.0,0.0,4,...,0,3,3,4,4,4,3,0,4,0


In [None]:
#then scale numeric data (should probably make a pipeline)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
knn_encode_data = pd.DataFrame(scaler.fit_transform(merged_data_train2), columns = merged_data_train2.columns)

#Importing KNNImputer to impute values for the dataset
knn_encode_data = pd.DataFrame(imputer.fit_transform(merged_data_train2),columns = merged_data_train2.columns)

In [None]:
knn_encode_data.head()


Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Seat_Comfort,...,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
0,98800001.0,0.0,1.0,52.0,0.2,0.0,272.0,0.0,5.0,4.0,...,3.0,4.0,0.0,4.0,4.0,0.0,3.0,3.0,4.0,5.0
1,98800002.0,1.0,1.0,48.0,1.0,1.0,2200.0,9.0,0.0,5.0,...,3.0,5.0,3.0,3.0,1.0,4.0,4.0,4.0,3.0,3.0
2,98800003.0,0.0,1.0,43.0,0.0,0.0,1061.0,77.0,119.0,4.0,...,4.0,3.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0
3,98800004.0,0.0,1.0,44.0,0.0,0.0,780.0,13.0,18.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
4,98800005.0,0.0,1.0,50.0,0.0,0.0,1981.0,0.0,0.0,0.0,...,4.0,3.0,1.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0


In [None]:
knn_encode_data.isnull().sum()

ID                         0
Gender                     0
Customer_Type              0
Age                        0
Type_Travel                0
Travel_Class               0
Travel_Distance            0
Departure_Delay_in_Mins    0
Arrival_Delay_in_Mins      0
Seat_Comfort               0
Seat_Class                 0
Arrival_Time_Convenient    0
Catering                   0
Platform_Location          0
Onboard_Wifi_Service       0
Onboard_Entertainment      0
Online_Support             0
Ease_of_Online_Booking     0
Onboard_Service            0
Legroom                    0
Baggage_Handling           0
CheckIn_Service            0
Cleanliness                0
Online_Boarding            0
dtype: int64

In [None]:
#Iterating over each variable, and imputing them one by one to save memory
#initializing a buttload of variables to train then concatenate
#there's gotta be a way to generalize, 
#but I'll do it later (iterate over column names,
#create list of names/tags, create series with that in the name, filling from that col in data)
#will this create the variables above?
#for i in merged_data_train2.columns:
      #vars()['knn_'+str(i)] = pd.Series(merged_data_train2[i])
#check if variable exists, number of missing features to fill
#knn_Seat_Comfort.isnull().sum()

#Matching Impute strategy for test Data

In [None]:
#encode test variables
for i in cat_ix:
    encode(merged_data_test[i])
#scale test vars in cols = num_ix
knn_encode_test = pd.DataFrame(scaler.fit_transform(merged_data_test), columns = merged_data_test.columns)
knn_encode_test.head()
#impute
knn_encode_test = pd.DataFrame(imputer.fit_transform(merged_data_test),columns = merged_data_test.columns)

##Running XGBoost on the KNN imputed datasets

In [None]:
#test novel encoding/imputing method
# Sift data into features and predictors
feature_cols = knn_encode_data.columns
X = knn_encode_data[feature_cols] # Features
y = merged_data_train.Overall_Experience # Target variable

In [None]:
def CreateBalancedSampleWeights(y_train, largest_class_weight_coef):
  classes = np.unique(y_train, axis = 0)
  classes.sort()
  class_samples = np.bincount(y_train)
  total_samples = class_samples.sum()
  n_classes = len(class_samples)
  weights = total_samples / (n_classes * class_samples * 1.0)
  class_weight_dict = {key : value for (key, value) in zip(classes, weights)}
  class_weight_dict[classes[1]] = class_weight_dict[classes[1]] * largest_class_weight_coef
  sample_weights = [class_weight_dict[y] for y in y_train]
  return sample_weights

In [None]:
#setting class weights for XGBoost due to class imbalance (0: 45.33%, 1: 54.67%)
largest_class_weight_coef = max(merged_data_train['Overall_Experience'].value_counts().values)/merged_data_train.shape[0]

#pass y_train as numpy array
weight = CreateBalancedSampleWeights(y, largest_class_weight_coef)

In [None]:
#tuning model with gridsearchcv, watch out this takes 3 hours. It'll never finish in Colab
from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': [5,6,7,8], 
              'max_leaves':[3,4,5],
              'learning_rate': [0.1, 0.01, 0.05],
              'n_estimators': [100,200,500]}  
   
grid = GridSearchCV(XGBClassifier(), param_grid, refit = True, 
    scoring = 'roc_auc', verbose = 4,n_jobs=-1, cv =5) 
   
# fitting the model for grid search 
grid.fit(X, y) 


Fitting 10 folds for each of 108 candidates, totalling 1080 fits


In [None]:
print(grid.best_estimator_)

NameError: ignored

In [None]:
print(grid.best_params_)

In [None]:
#using tuned XGBoost
#using tuned XGBoost
model = XGBClassifier(objective="binary:logistic", random_state=0, booster = 'gbtree', subsample = 1,
                      weights = weight, max_depth=8, max_leaves = 3, n_estimators=500, learning_rate = 0.1,
                      colsample_bytree = .5)
k_i_m = model.fit(X, y)

predictions = k_i_m.predict(knn_encode_test)
predictions.sum()
#Acc .9458 19092
#acc .9377 19269

19103

#Testing AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV

abc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())

parameters = {'base_estimator__max_depth':[2,11,2],
              'base_estimator__min_samples_leaf':[5,10],
              'n_estimators':[100,200,500],
              'learning_rate':[0.01,0.1]}

clf = GridSearchCV(abc, parameters,verbose=3,scoring='roc_auc',n_jobs=-1)
clf.fit(X,y)


Fitting 5 folds for each of 80 candidates, totalling 400 fits


In [None]:
print(grid.best_estimator_)

In [None]:
print(grid.best_params_)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(),
                         n_estimators = 500, learning_rate = 0.1)

ada.fit(X,y)

predictions = ada.predict(knn_encode_test)
predictions.sum()

TypeError: ignored

#XGBoost Model

##Discarded method: Imputing test data with most frequent/mean (Accuracy ~ 95%)

In [None]:
#Imputing Training Data with Most Common
#get labels for each category
num_ix = merged_data_train2.select_dtypes(include=['int64', 'float64']).columns
cat_ix = merged_data_train.select_dtypes(include=['object']).columns

#numeric columns, imputed with mean value of column
for i in num_ix:
  merged_data_train2[i].fillna(merged_data_train2[i].mean(), inplace = True)

#categorical columns, imputed with most frequent value
for i in cat_ix:
  merged_data_train2 = merged_data_train2.fillna(merged_data_train2[i].value_counts().index[0])

  #scale numeric data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
for i in num_ix:
  merged_data_train2[i] = scaler.fit_transform(merged_data_train2[i].values.reshape(-1,1))

  #encode categorical data
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in cat_ix:
  merged_data_train2[i] = le.fit_transform(merged_data_train2[i].values)

In [None]:
#Imputing Test Data With Most common
#numeric columns, imputed with mean value of column
for i in num_ix:
  merged_data_test[i].fillna(merged_data_test[i].mean(), inplace = True)

#categorical columns, imputed with most frequent value
for i in cat_ix:
  merged_data_test = merged_data_test.fillna(merged_data_test[i].value_counts().index[0])

  #scale numeric data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
for i in num_ix:
  merged_data_test[i] = scaler.fit_transform(merged_data_test[i].values.reshape(-1,1))

  #encode categorical data
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in cat_ix:
  merged_data_test[i] = le.fit_transform(merged_data_test[i].values)

In [None]:
# Sift data into features and predictors
feature_cols = merged_data_train.columns.drop('Overall_Experience')
X = merged_data_train2[feature_cols] # Features
y = merged_data_train.Overall_Experience # Target variable

In [None]:
y.isnull().sum()

0

In [None]:
def CreateBalancedSampleWeights(y_train, largest_class_weight_coef):
  classes = np.unique(y_train, axis = 0)
  classes.sort()
  class_samples = np.bincount(y_train)
  total_samples = class_samples.sum()
  n_classes = len(class_samples)
  weights = total_samples / (n_classes * class_samples * 1.0)
  class_weight_dict = {key : value for (key, value) in zip(classes, weights)}
  class_weight_dict[classes[1]] = class_weight_dict[classes[1]] * largest_class_weight_coef
  sample_weights = [class_weight_dict[y] for y in y_train]
  return sample_weights

In [None]:
#setting class weights for XGBoost due to class imbalance (0: 45.33%, 1: 54.67%)
from xgboost import XGBClassifier
largest_class_weight_coef = max(merged_data_train['Overall_Experience'].value_counts().values)/merged_data_train.shape[0]

#pass y_train as numpy array
weight = CreateBalancedSampleWeights(y, largest_class_weight_coef)

In [None]:
#XGB Model
model = XGBClassifier(objective="reg:logistic", random_state=0, 
                      weights = weight, max_depth=6, max_leaves = 3, n_estimators=500,)
xgb = model.fit(X, y)

predictions = xgb.predict(merged_data_test)
predictions.sum()
#Acc 0.0.9423347, 19552 sum , depth = 6

19708

##Tune XGBoost




In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': [5,6,7,8], 
              'max_leaves':[3,4,5],
              'learning_rate': [0.1, 0.01, 0.05],
              'n_estimators': [100,200,500]}  
   
grid = GridSearchCV(XGBClassifier(), param_grid, refit = True, 
    scoring = 'roc_auc', verbose = 3,n_jobs=-1, cv =10) 
   
# fitting the model for grid search 
grid.fit(X, y) 


Fitting 10 folds for each of 3 candidates, totalling 30 fits


GridSearchCV(cv=10, estimator=XGBClassifier(), n_jobs=-1,
             param_grid={'n_estimators': [100, 200, 500]}, scoring='roc_auc',
             verbose=3)

In [None]:
print(grid.best_estimator_)

XGBClassifier(n_estimators=500)


In [None]:
print(grid.best_params_)

{'learning_rate': 0.1, 'max_depth': 8, 'max_leaves': 3}


In [None]:
#using tuned XGBoost
model = XGBClassifier(objective="reg:logistic", random_state=0, 
                      weights = weight, max_depth=8, max_leaves = 3, n_estimators=500, learning_rate = 0.1)
xgb = model.fit(X, y)

predictions = xgb.predict(merged_data_test)
predictions.sum()

19340

#Junkyard of Discarded Models

##Discarded method: Imputing test data with most frequent/mean (Accuracy ~ 95%)

In [None]:
#Imputing Data Manually
#get labels for each category
num_ix = merged_data_train2.select_dtypes(include=['int64', 'float64']).columns
cat_ix = merged_data_train2.select_dtypes(include=['object']).columns

#numeric columns, imputed with mean value of column
for i in num_ix:
  merged_data_train2[i].fillna(merged_data_train2[i].mean(), inplace = True)

#categorical columns, imputed with most frequent value
for i in cat_ix:
  merged_data_train2 = merged_data_train2.fillna(merged_data_train2[i].value_counts().index[0])

In [None]:
#scale numeric data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
for i in num_ix:
  merged_data_train2[i] = scaler.fit_transform(merged_data_train2[i].values.reshape(-1,1))
  #reshape was added before the last submission. not sure if it'll break the code yet

In [None]:
#encode categorical data
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in cat_ix:
  merged_data_train2[i] = le.fit_transform(merged_data_train2[i].values)

In [None]:
merged_data_train2

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Seat_Comfort,...,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
0,0.000000,0,2,0.576923,1,0,0.032169,0.000000,0.003157,5,...,4,5,0,5,5,0,4,4,5,6
1,0.000011,1,2,0.525641,2,1,0.311549,0.005653,0.000000,6,...,4,6,4,4,1,5,5,5,4,4
2,0.000021,0,2,0.461538,0,0,0.146501,0.048367,0.075126,5,...,5,4,1,1,1,1,1,4,1,1
3,0.000032,0,2,0.474359,0,0,0.105782,0.008166,0.011364,0,...,0,5,0,0,0,0,0,4,0,0
4,0.000042,0,2,0.551282,0,0,0.279815,0.000000,0.000000,0,...,5,4,1,4,4,4,3,4,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94374,0.999958,1,2,0.320513,0,0,0.189393,0.052136,0.078914,6,...,6,6,6,6,4,4,3,5,4,6
94375,0.999968,1,2,0.474359,0,0,0.078539,0.003141,0.006944,4,...,5,1,1,0,0,0,0,4,0,4
94376,0.999979,1,1,0.717949,0,0,0.397624,0.000000,0.000000,5,...,4,1,4,4,4,4,3,0,4,0
94377,0.999989,1,2,0.115385,2,1,0.390378,0.000000,0.000000,5,...,4,5,4,4,0,4,3,4,1,4


##Basic Logistic Regression

In [None]:
#from sklearn.pipeline import Pipeline 
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(class_weight="balanced", n_jobs = 1000)
#pipeline = Pipeline(steps=[('t', transformer), ('m',model)])

# fit the model with data
log_mod = model.fit(X,y)

#s_t = survey_test.values
predictions = log_mod.predict(merged_data_test)
predictions.sum()

##DTree

In [None]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(class_weight="balanced")

D_tree = model.fit(X,y)

predictions = D_tree.predict(merged_data_test)
predictions.sum()

25152

##KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()

knn = model.fit(X,y)

predictions = knn.predict(merged_data_test)
predictions.sum()

11401

##SVM Model

In [None]:
#SVM
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
model = SVC()
svm = model.fit(X,y)
predictions = svm.predict(merged_data_test)
predictions.sum()

23244

##Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import CategoricalNB
model = CategoricalNB()

bayes = model.fit(X,y)

predictions = bayes.predict(merged_data_test)
predictions.sum()

IndexError: ignored

In [None]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

gsnb = model.fit(X,y)

predictions = gsnb.predict(merged_data_test)
predictions.sum()

22022

#Output CSV

In [None]:
#generating csv
submission_csv = survey_test[['ID']]
submission_csv['Overall_Experience'] = predictions
submission_csv.to_csv('/content/drive/MyDrive/Colab_Notebooks/Hackathon/Submission.csv', index=False)

In [None]:
submission_csv

Unnamed: 0,ID,Overall_Experience
0,99900001,1
1,99900002,1
2,99900003,1
3,99900004,0
4,99900005,1
...,...,...
35597,99935598,0
35598,99935599,1
35599,99935600,1
35600,99935601,1
