In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

%matplotlib inline

In [2]:
# Reading the dataset
c_booking = pd.read_csv("customer_booking.csv", header=None, encoding="latin-1")
c_booking

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
1,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0
2,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0
3,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0
4,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49996,2,Internet,RoundTrip,27,6,9,Sat,PERPNH,Australia,1,0,1,5.62,0
49997,1,Internet,RoundTrip,111,6,4,Sun,PERPNH,Australia,0,0,0,5.62,0
49998,1,Internet,RoundTrip,24,6,22,Sat,PERPNH,Australia,0,0,1,5.62,0
49999,1,Internet,RoundTrip,15,6,11,Mon,PERPNH,Australia,1,0,1,5.62,0


In [3]:
# Chgecking the dataset
c_booking.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
1,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0
2,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0
3,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0
4,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0


In [4]:
# checking the last 5 rows
c_booking.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
49996,2,Internet,RoundTrip,27,6,9,Sat,PERPNH,Australia,1,0,1,5.62,0
49997,1,Internet,RoundTrip,111,6,4,Sun,PERPNH,Australia,0,0,0,5.62,0
49998,1,Internet,RoundTrip,24,6,22,Sat,PERPNH,Australia,0,0,1,5.62,0
49999,1,Internet,RoundTrip,15,6,11,Mon,PERPNH,Australia,1,0,1,5.62,0
50000,1,Internet,RoundTrip,19,6,10,Thu,PERPNH,Australia,0,1,0,5.62,0


In [5]:
# checking the columns
c_booking.columns

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], dtype='int64')

In [6]:
# Printing summary statistics
c_booking.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
count,50001,50001,50001,50001,50001,50001,50001,50001,50001,50001,50001,50001,50001.0,50001
unique,10,3,4,471,336,25,8,800,105,3,3,3,22.0,3
top,1,Internet,RoundTrip,1,6,8,Mon,AKLKUL,Australia,1,0,0,8.83,0
freq,31352,44382,49497,685,7757,3165,8102,2680,17872,33439,35152,28643,14520.0,42522


In [7]:
# Printing the data information
c_booking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50001 entries, 0 to 50000
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       50001 non-null  object
 1   1       50001 non-null  object
 2   2       50001 non-null  object
 3   3       50001 non-null  object
 4   4       50001 non-null  object
 5   5       50001 non-null  object
 6   6       50001 non-null  object
 7   7       50001 non-null  object
 8   8       50001 non-null  object
 9   9       50001 non-null  object
 10  10      50001 non-null  object
 11  11      50001 non-null  object
 12  12      50001 non-null  object
 13  13      50001 non-null  object
dtypes: object(14)
memory usage: 5.3+ MB


# Splitting the data into Train and Test sets
Now, we will split our data into train set and test set to prepare our data for two different phases of machine learning modeling: training and testing. Ideally, no information from the test data should be used to preprocess the training data or should be used to direct the training process of a machine learning model. Hence, we first split the data and then preprocess it.

In [8]:
# Import train_test_split
from sklearn.model_selection import train_test_split
print(c_booking.isnull().sum())
# Split into train and test sets
c_booking_train, c_booking_test = train_test_split(c_booking, test_size=0.33, random_state=42)

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
dtype: int64


# Checking Missing Values


In [9]:
print(c_booking.isnull().sum())

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
dtype: int64


In [10]:
#Since our dataset had been cleaned, there are no missing values present in the dataset

In [11]:
# Printing and checking if there are missing values in the train and test dataset
print(c_booking_train.isnull().sum())
print(c_booking_test.isnull().sum())

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
dtype: int64
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
dtype: int64


# Preprocessing the data
The missing values are now successfully handled.

There is still some minor but essential data preprocessing needed before we proceed towards building our machine learning model. We are going to divide these remaining preprocessing steps into two main tasks:

Convert the non-numeric data into numeric. Scale the feature values to a uniform range. First, we will be converting all the non-numeric values into numeric ones. We do this because not only it results in a faster computation but also many machine learning models (like XGBoost) (and especially the ones developed using scikit-learn) require the data to be in a strictly numeric format. We will do this by using the get_dummies() method from pandas.

In [12]:
# Convert the categorical features in the train and test sets independently
c_booking_train = pd.get_dummies(c_booking_train)
c_booking_test = pd.get_dummies(c_booking_test)

# Reindex the columns of the test set aligning with the train set
c_booking_test = c_booking_test.reindex(columns=c_booking_train.columns, fill_value=0)

In [13]:
c_booking.describe().round()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
count,50001,50001,50001,50001,50001,50001,50001,50001,50001,50001,50001,50001,50001.0,50001
unique,10,3,4,471,336,25,8,800,105,3,3,3,22.0,3
top,1,Internet,RoundTrip,1,6,8,Mon,AKLKUL,Australia,1,0,0,8.83,0
freq,31352,44382,49497,685,7757,3165,8102,2680,17872,33439,35152,28643,14520.0,42522


# Sentiment analysis using textbox

In [15]:
# Import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Segregate features and labels into separate variables
X_train, y_train = c_booking_train.iloc[:, :-1].values, c_booking_train.iloc[:, [-1]].values
X_test, y_test = c_booking_test.iloc[:, :-1].values, c_booking_test.iloc[:, [-1]].values

# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)

# Fitting logistic regression using train and test data
Checking customer review while booking flight as either satisfied or not satisfied using logistic regression

In [17]:
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression

# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression()

# Reshape y_train to a 1-dimensional array
y_train = y_train.ravel()

# Fit logreg to the train set
logreg.fit(rescaledX_train, y_train)

# Making prediction and evaluating performance
But how well does our model perform?

We will now evaluate our model on the test set with respect to classification accuracy. But we will also take a look the model's confusion matrix. In the case of predicting customer's booking satisfaction, it is important to see if our machine learning model is equally capable of predicting satisfied and unsatisfied, in line with the frequency of these labels in our original dataset. If our model is not performing well in this aspect, then it might end up predicting customer satisfied in place where the customer is not satisfied. The confusion matrix helps us to view our model's performance from these aspects.

In [18]:
# Import confusion_matrix
from sklearn.metrics import confusion_matrix

# Use logreg to predict instances from the test set and store it
y_pred = logreg.predict(rescaledX_test)

# Get the accuracy score of logreg model and print it
print("Accuracy of logistic regression classifier: ", logreg.score(rescaledX_test,y_test))

# Print the confusion matrix of the logreg model
confusion_matrix(y_test,y_pred)

Accuracy of logistic regression classifier:  0.9999393976122659


array([[14018,     1],
       [    0,  2482]])

# Grid searching and making the model perform better
Our model was pretty good! In fact it was able to yield an accuracy score of 90%.

For the confusion matrix, the first element of the of the first row of the confusion matrix denotes the true negatives meaning the number of negative instances (Unsatisfied customers) predicted by the model correctly. And the last element of the second row of the confusion matrix denotes the true positives meaning the number of positive instances (satisfied customers) predicted by the model correctly.

But if we hadn't got a perfect score what's to be done?. We can perform a grid search of the model parameters to improve the model's ability to predict customer satisfaction.

scikit-learn's implementation of logistic regression consists of different hyperparameters but we will grid search over the following two:

.tol .max_iter

In [19]:
# Import GridSearchCV
# ... YOUR CODE FOR TASK 11 ...
from sklearn.model_selection import GridSearchCV

# Define the grid of values for tol and max_iter
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 1000, 10000]

# Create a dictionary where tol and max_iter are keys and the lists of their values are corresponding values
param_grid = dict(tol=tol, max_iter=max_iter)

# Finding the best performing model
We have defined the grid of hyperparameter values and converted them into a single dictionary format which GridSearchCV() expects as one of its parameters. Now, we will begin the grid search to see which values perform best.

We will instantiate GridSearchCV() with our earlier logreg model with all the data we have. We will also instruct GridSearchCV() to perform a cross-validation of five folds.

We'll end the notebook by storing the best-achieved score and the respective best parameters.

While building this credit card predictor, we tackled some of the most widely-known preprocessing steps such as scaling, label encoding, and missing value imputation. We finished with some machine learning to predict if a customer will be satisfied with booking at British Airline.

In [20]:
#Instantiate GridSearchCV with the required parameters
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

# Fit grid_model to the data
grid_model_result = grid_model.fit(rescaledX_train, y_train)

# Summarize results
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

# Extract the best model and evaluate it on the test set
best_model = grid_model_result.best_estimator_
print("Accuracy of logistic regression classifier: ", best_model.score(rescaledX_test,y_test))

Best: 1.000000 using {'max_iter': 100, 'tol': 0.01}
Accuracy of logistic regression classifier:  0.9999393976122659
