In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
# hyperopt imports
from hyperopt import fmin    # for minimization of object functions
from hyperopt import tpe     # for surrogate model
from hyperopt import hp      # for search space optimization
from hyperopt import Trials  # to store the trial results
from hyperopt import STATUS_OK
from hyperopt import space_eval  # to decode hp.choice indices returned by fmin

## Load the dataset

In [None]:
# Read the passenger-satisfaction CSV, discard the ID column,
# and keep only the rows that have no missing values.
df = (
    pd.read_csv("/content/drive/MyDrive/end2end_DS/DS_projects/airLinePassenger_satisfection/airline_passenger_satisfaction.csv")
    .drop(columns=['ID'])
    .dropna()
)

In [None]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,Ease of Online Booking,...,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling,Satisfaction
0,Male,48,First-time,Business,Business,821,2,5.0,3,3,...,3,5,2,5,5,5,3,5,5,Neutral or Dissatisfied
1,Female,35,Returning,Business,Business,821,26,39.0,2,2,...,5,4,5,5,3,5,2,5,5,Satisfied
2,Male,41,Returning,Business,Business,853,0,0.0,4,4,...,3,5,3,5,5,3,4,3,3,Satisfied
3,Male,50,Returning,Business,Business,1905,0,0.0,2,2,...,5,5,5,4,4,5,2,5,5,Satisfied
4,Female,49,Returning,Business,Business,3470,0,1.0,3,3,...,3,4,4,5,4,3,3,3,3,Satisfied


## Create a baseline model

In [None]:
# Split the dataset.
# y is the dependent variable (the target we predict); X holds the independent
# variables (the features). The target is selected / dropped rather than popped
# so that `df` is left untouched and this cell can be re-run without a KeyError.
y = df["Satisfaction"].map({'Satisfied': 1, 'Neutral or Dissatisfied': 0})
X = df.drop(columns=["Satisfaction"])
# Hold out 20% of the rows as a test set (shuffled, fixed seed).
X_train,X_test,y_train,y_test = train_test_split(X, y, random_state=42, train_size=0.8, shuffle= True)
# Report the shapes of the train and test splits.
print("train size X : ", X_train.shape)
print("train size y : ", y_train.shape)
print("test size  X : ", X_test.shape)
print("test size  y : ", y_test.shape)

train size X :  (103589, 22)
train size y :  (103589,)
test size  X :  (25898, 22)
test size  y :  (25898,)


In [None]:
# Let's build the pipeline.
# It has two stages: preprocessing, then the model.

# Preprocessing steps:
# - standardization of numerical columns
# - transforming age into buckets (listed as a step, but not implemented in the code below)
# - one-hot encoding of categorical columns


class getEncoded(BaseEstimator, TransformerMixin):
  """One-hot encode a DataFrame with a column layout learned in fit().

  fit() records the dummy columns that ``pd.get_dummies(X, drop_first=True)``
  produces for the training data. transform() encodes the incoming frame and
  aligns it to that recorded layout, so the output always has the same columns
  in the same order: dummy columns for categories not seen in fit() are
  discarded, and recorded columns absent from the batch are filled with 0.

  The init/fit/transform tracing prints are kept as-is.
  """
  def __init__(self):
    print('\n>>>>>>>init() called.\n')
  def fit(self, X, y = None):
    """Record the dummy-column layout of ``X``; ``y`` is ignored. Returns self."""
    print('\n>>>>>>>fit() called.\n')
    self.columns_ = pd.get_dummies(X, drop_first= True).columns
    return self
  def transform(self,  X, y = None):
    """Return ``X`` one-hot encoded and aligned to the columns recorded by fit()."""
    print('\n>>>>>>>transform() called.\n')
    # Encode every level present in this batch, then select the fitted columns.
    # Using drop_first here would drop whichever level happens to sort first
    # in *this* batch, which need not be the level dropped during fit().
    encoded = pd.get_dummies(X, drop_first= False)
    return encoded.reindex(columns= self.columns_, fill_value= 0)


  


# Column groups are chosen by dtype from the training frame.
numeric_features     = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Each group gets its own single-step sub-pipeline:
# scaling for the numeric columns, getEncoded for the object columns.
numeric_transformer     = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([('x_transformer', getEncoded())])

# Route every column group through its sub-pipeline.
preprocessor = ColumnTransformer([
    ('numeric', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features),
])


>>>>>>>init() called.



In [None]:
# Baseline: the shared preprocessor followed by a LogisticRegression with
# default settings. The classifier step is registered under the name 'regressor'.
baseline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LogisticRegression()),
]
pipeline = Pipeline(baseline_steps)

In [None]:
# Fit preprocessing + classifier on the training split, then show the fitted pipeline.
# fit() hands back the pipeline object itself, so `model` is the same object as `pipeline`.
pipeline.fit(X_train, y_train)
model = pipeline
print(model)


>>>>>>>init() called.


>>>>>>>fit() called.


>>>>>>>transform() called.

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('x_transformer',
                                                                   getEncoded())]),
                                                  Index(['Gender', 'Customer Type', 'Type of Travel', 'Class'], dtype='object'))])),
                ('regressor', LogisticRegression())])


In [None]:
# training report
# Predict once and reuse the result for both the report and the confusion matrix.
train_pred = model.predict(X_train)
print(classification_report(y_train, train_pred))
print(confusion_matrix(y_train, train_pred))
# Labels compared against themselves: a diagonal matrix holding the true
# class counts, printed as a reference for the matrix above.
print(confusion_matrix(y_train, y_train))


>>>>>>>transform() called.

              precision    recall  f1-score   support

           0       0.80      0.81      0.81     58540
           1       0.75      0.75      0.75     45049

    accuracy                           0.78    103589
   macro avg       0.78      0.78      0.78    103589
weighted avg       0.78      0.78      0.78    103589


>>>>>>>transform() called.

[[47233 11307]
 [11486 33563]]
[[58540     0]
 [    0 45049]]


In [None]:
# test report
# Predict once and reuse the result for both the report and the confusion matrix.
test_pred = model.predict(X_test)
print(classification_report(y_test, test_pred))
print(confusion_matrix(y_test, test_pred))
# Labels compared against themselves: a diagonal matrix holding the true
# class counts, printed as a reference for the matrix above.
print(confusion_matrix(y_test, y_test))


>>>>>>>transform() called.

              precision    recall  f1-score   support

           0       0.81      0.81      0.81     14685
           1       0.75      0.74      0.75     11213

    accuracy                           0.78     25898
   macro avg       0.78      0.78      0.78     25898
weighted avg       0.78      0.78      0.78     25898


>>>>>>>transform() called.

[[11846  2839]
 [ 2860  8353]]
[[14685     0]
 [    0 11213]]


## Hyperparameter tuning with Hyperopt

In [None]:
# Hyperopt search space; every key is a LogisticRegression keyword argument.
# `solver` is held fixed at 'saga' instead of being searched.
# Options that were considered but are currently disabled:
#   'fit_intercept' : hp.choice('fit_intercept', [True, False])
#   'solver'        : hp.choice('solver', ['newton-cg', 'lbfgs', 'liblinear', 'saga'])
#   'max_iter'      : hp.choice('max_iter', range(100,1000))
#   'multi_class'   : 'auto'
#   'class_weight'  : 'balanced'
space = dict(
    warm_start=hp.choice('warm_start', [True, False]),
    tol=hp.uniform('tol', 0.00001, 0.0001),
    C=hp.uniform('C', 0.001, 5),
    solver='saga',
    penalty=hp.choice('penalty', ['l1', 'l2']),
)

In [None]:
def obj(space):
  """Hyperopt objective: cross-validated micro-F1 of a LogisticRegression pipeline.

  Parameters
  ----------
  space : dict
      Keyword arguments for ``LogisticRegression`` sampled by Hyperopt.

  Returns
  -------
  dict
      ``{'loss': -score, 'status': STATUS_OK}``; the score is negated because
      ``fmin`` minimises the loss.

  The score is the mean over a 3-fold stratified cross-validation of the
  training split only. The held-out test split is deliberately not touched
  here: selecting hyperparameters on it would make the later test-set
  evaluation an optimistic estimate.
  """
  clf= LogisticRegression(**space)
  pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor', clf)
           ])

  print(clf.get_params())
  folds = StratifiedKFold(n_splits= 3, shuffle= True, random_state= 42)
  # cross_val_score fits a fresh clone of the pipeline on every fold.
  cv_scores = cross_val_score(pipeline, X_train, y_train, scoring= 'f1_micro', cv= folds)
  f1Score = cv_scores.mean()
  print("SCORE:", f1Score)
  return {'loss': -f1Score, 'status': STATUS_OK }


In [None]:
# Keeps a record of every configuration that gets evaluated.
trials = Trials()

# Minimise obj over `space` with the TPE algorithm, capped at 10 evaluations.
# No `rstate` is passed, so the search is not pinned to a fixed seed.
best_hyperparams = fmin(
    obj,
    space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
)

{'C': 1.197945052231733, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'saga', 'tol': 7.610246451297854e-05, 'verbose': 0, 'warm_start': False}

>>>>>>>init() called.


>>>>>>>fit() called.


>>>>>>>transform() called.


>>>>>>>transform() called.

SCORE:
0.8718433855896208
              precision    recall  f1-score   support

           0       0.88      0.90      0.89     14685
           1       0.87      0.83      0.85     11213

    accuracy                           0.87     25898
   macro avg       0.87      0.87      0.87     25898
weighted avg       0.87      0.87      0.87     25898

{'C': 1.885802244205855, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l1', 'random_state': None, 'solver': 'saga

In [None]:
# fmin() reports hp.choice parameters as the *index* of the chosen option
# (e.g. 'penalty': 1 means the second entry of ['l1', 'l2']), not the option
# itself. Decode against the search space so the printed values are the real
# hyperparameters.
best_config = space_eval(space, best_hyperparams)
print("The best hyperparameters are : ","\n")
print(best_config)
print(trials.best_trial['result']['loss'])


The best hyperparameters are :  

{'C': 1.197945052231733, 'penalty': 1, 'tol': 7.610246451297854e-05, 'warm_start': 1}
-0.8718433855896208


## Grid search around the Hyperopt result

In [None]:
# fmin() reports hp.choice parameters as the *index* of the chosen option
# (e.g. 'penalty': 1 is the second entry of ['l1', 'l2']), so the categorical
# settings are decoded from the search space rather than read off the raw dict.
best_config = space_eval(space, best_hyperparams)

# Grid hand-picked around the C / tol values found by the recorded Hyperopt run.
params= {
    'C': [0.8, 1.19, 1.5],
    'tol': [7.610246451297854e-07, 7.610246451297854e-05, 7.610246451297854e-01]
}
clf= LogisticRegression(penalty= best_config['penalty'], warm_start= best_config['warm_start'], solver= best_config['solver'])
# NOTE: with one label per sample, micro-averaged precision / recall / F1 all
# coincide with accuracy, so these four scorers report the same number.
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score, average = 'micro'),
           'recall': make_scorer(recall_score, average = 'micro'),
           'f1': make_scorer(f1_score, average = 'micro')}

folds = StratifiedKFold(n_splits= 3, shuffle= True, random_state= 42)
logit= GridSearchCV(estimator= clf, param_grid= params, cv= folds, return_train_score= True, scoring = scoring, refit='f1', verbose= 1000)
# The preprocessor is fitted once on the whole training split; the grid search
# then cross-validates only the classifier on the transformed features.
X_train1= preprocessor.fit_transform(X_train)
model = logit.fit(X_train1, y_train)


>>>>>>>init() called.


>>>>>>>fit() called.


>>>>>>>transform() called.

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV 1/3; 1/9] START C=0.8, tol=7.610246451297854e-07............................
[CV 1/3; 1/9] END C=0.8, tol=7.610246451297854e-07; accuracy: (train=0.874, test=0.876) f1: (train=0.874, test=0.876) precision: (train=0.874, test=0.876) recall: (train=0.874, test=0.876) total time=   2.4s
[CV 2/3; 1/9] START C=0.8, tol=7.610246451297854e-07............................
[CV 2/3; 1/9] END C=0.8, tol=7.610246451297854e-07; accuracy: (train=0.875, test=0.874) f1: (train=0.875, test=0.874) precision: (train=0.875, test=0.874) recall: (train=0.875, test=0.874) total time=   4.0s
[CV 3/3; 1/9] START C=0.8, tol=7.610246451297854e-07............................
[CV 3/3; 1/9] END C=0.8, tol=7.610246451297854e-07; accuracy: (train=0.876, test=0.873) f1: (train=0.876, test=0.873) precision: (train=0.876, test=0.873) recall: (train=0.876, test=0.873) total time=   3.

## Final model

In [None]:
# Show the parameter combination selected by the grid search.
print(f"Best parameters: {model.best_params_}")

Best parameters: {'C': 1.5, 'tol': 7.610246451297854e-07}


In [None]:
# Build the final classifier from the fitted grid search instead of copying
# numbers by hand: start from the settings of the estimator that was searched
# (penalty, solver, warm_start, ...) and overlay the winning grid values, so a
# re-run with a different search outcome stays consistent.
final_params = {**logit.estimator.get_params(), **logit.best_params_}
clf= LogisticRegression(**final_params)
pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor', clf)
           ])
# Refit preprocessing + classifier end-to-end on the raw training split.
model = pipeline.fit(X_train, y_train)


>>>>>>>init() called.


>>>>>>>fit() called.


>>>>>>>transform() called.



In [None]:
# Evaluate the final pipeline on the held-out test split.
final_pred = model.predict(X_test)
final_report = classification_report(y_test, final_pred)
print(final_report)


>>>>>>>transform() called.

              precision    recall  f1-score   support

           0       0.88      0.90      0.89     14685
           1       0.87      0.83      0.85     11213

    accuracy                           0.87     25898
   macro avg       0.87      0.87      0.87     25898
weighted avg       0.87      0.87      0.87     25898

