In [1]:
import pickle
from os import makedirs
from os.path import join

import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Read the raw survey data from the CSV file located in the working directory.
csv_path = "Airline Passenger Satisfaction.csv"
df = pd.read_csv(csv_path)

In [3]:
# Target: the "Satisfaction" column of the raw frame.
y = df["Satisfaction"]

# Features: everything except the target and the "id" column.
# `drop` without `inplace` returns a new frame and leaves `df` untouched, so
# this cell can be re-run safely (the in-place version raised KeyError on the
# second run because "Satisfaction" had already been removed).
X = df.drop(columns=["Satisfaction", "id"])

# Text (object-dtype) columns are handled by the categorical branch.
columns_cat = X.select_dtypes(include="object").columns

# Numeric columns that get imputed and scaled.
columns_num = ["Age", "Flight Distance", "Departure Delay in Minutes", "Arrival Delay in Minutes"]

# Every remaining non-object column is passed through unchanged later on.
columns_note = [col for col in X.select_dtypes(exclude="object").columns if col not in columns_num]

In [4]:
# Categorical branch: one-hot encode. Categories that were not seen during fit
# are encoded as all-zero rows instead of raising at transform time.
pipe_cat = OneHotEncoder(handle_unknown='ignore')

# Numeric branch: fill missing values with the column median, then standardise.
# Steps are given as a list, which is the documented type for `Pipeline(steps=...)`.
pipe_num = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

In [5]:
# Route each column group to its preprocessing branch:
#   - 'cat'   : columns_cat  -> pipe_cat
#   - 'num'   : columns_num  -> pipe_num
#   - 'notes' : columns_note -> passed through unchanged
col_trans = ColumnTransformer(
    [
        ('cat', pipe_cat, columns_cat),
        ('num', pipe_num, columns_num),
        ('notes', 'passthrough', columns_note)
    ]
)

# Full pipeline: preprocessing followed by the classifier. The step is named
# 'model' so that a grid search can swap or tune it via 'model' / 'model__*'.
# Steps are given as a list (the documented type), and the forest is seeded so
# the baseline score is reproducible between runs.
pipeline = Pipeline(
    [
        ('preparation', col_trans),
        ('model', RandomForestClassifier(random_state=42))
    ]
)

# Render estimators as an HTML diagram and display the pipeline.
set_config(display='diagram')
pipeline

In [6]:
# Hold out 20% of the rows for testing, keeping the class proportions of `y`
# in both parts. The fixed random_state makes the split (and therefore every
# score computed from it) reproducible after Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, stratify=y, random_state=42
)

# Baseline: fit the default pipeline and measure accuracy on the held-out set.
pipeline.fit(X_train, y_train)

prediction = pipeline.predict(X_test)

score = accuracy_score(y_test, prediction)

score

0.9580381890976286

# GridSearch

In [8]:
# Candidate models and their hyper-parameter grids. The dictionary key is also
# used as the file name of the saved artefacts. Each grid replaces the
# pipeline's 'model' step with a fresh estimator and tunes its parameters.
# Estimators that accept a random_state are seeded so results are reproducible.
list_tests = {
    'RandomForestClassifier':   [
            {
                'model':                (RandomForestClassifier(random_state=42),),
                'model__n_estimators':  [10, 50, 100, 150, 200],
                'model__criterion':     ["gini", "entropy", "log_loss"]
            }
        ],

    'KNeighborsClassifier':   [
            {
                'model':                (KNeighborsClassifier(),),
                'model__n_neighbors':   [3, 5, 7, 10],
                'model__weights':       ["uniform", "distance"]
            }
        ],

    # NOTE(review): the recorded output for this search shows the optimiser
    # stopping at the iteration limit ("TOTAL NO. of ITERATIONS REACHED
    # LIMIT") for at least one candidate; consider larger max_iter values.
    'LogisticRegression':   [
            {
                'model':                (LogisticRegression(random_state=42),),
                'model__max_iter':      [20, 50, 100, 150, 200],
                'model__solver':       ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"]
            }
        ],
}

# Currently unused: multi-metric scoring is disabled in the GridSearchCV call
# below, so the search falls back to the estimator's default score.
metrics = ["accuracy", "precision", "recall", "roc_auc"]

# Make sure the output folders exist so the dumps below cannot fail with
# FileNotFoundError on a fresh checkout.
for folder in ("models", "confusion_matrix"):
    makedirs(folder, exist_ok=True)

# Run one 5-fold grid search per candidate model, report the best CV score and
# parameters, then persist the fitted search and its test-set confusion matrix.
# After the loop, `gs`, `pred` and `cm` refer to the LAST entry of list_tests.
for name, param_gridSearch in list_tests.items():
    gs = GridSearchCV(pipeline, param_gridSearch,
                    #scoring = metrics, refit = 'recall',
                    cv = 5, n_jobs = -1, verbose = 1)

    gs.fit(X_train, y_train)

    print(gs.best_score_)
    print(gs.best_params_)

    # Predictions come from the best estimator refitted on the training set.
    pred = gs.predict(X_test)

    cm = confusion_matrix(y_test, pred)

    # Context managers guarantee the files are flushed and closed even if
    # pickling raises part-way through.
    with open(join("models", name + ".mdl"), 'wb') as model_file:
        pickle.dump(gs, model_file)
    with open(join("confusion_matrix", name + ".cm"), 'wb') as cm_file:
        pickle.dump(cm, cm_file)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
0.9587984887913319
{'model': RandomForestClassifier(criterion='log_loss', n_estimators=200), 'model__criterion': 'log_loss', 'model__n_estimators': 200}
Fitting 5 folds for each of 8 candidates, totalling 40 fits
0.928039333979237
{'model': KNeighborsClassifier(n_neighbors=10, weights='distance'), 'model__n_neighbors': 10, 'model__weights': 'distance'}
Fitting 5 folds for each of 30 candidates, totalling 150 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8348475371673587
{'model': LogisticRegression(max_iter=50), 'model__max_iter': 50, 'model__solver': 'lbfgs'}


# After the grid-search loop, `gs` is the search for the LAST model in
# list_tests (LogisticRegression), which has no `feature_importances_`.
# Reload the random-forest search that the loop saved to disk instead.
# Only unpickle files written by this notebook: unpickling untrusted data can
# execute arbitrary code.
with open(join("models", "RandomForestClassifier.mdl"), 'rb') as model_file:
    gs_forest = pickle.load(model_file)

# Best refitted pipeline: pull out the classifier and the names of the columns
# produced by the preprocessing step (same order as the importances).
forest = gs_forest.best_estimator_.named_steps['model']
feature_names = gs_forest.best_estimator_.named_steps['preparation'].get_feature_names_out()

# One importance value per transformed feature, labelled by feature name.
feature_importance = pd.Series(forest.feature_importances_, index=feature_names)

# Display the features from most to least important.
feature_importance.sort_values(ascending=False)