In [2]:
import pandas as pd
from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import StandardScaler

# Load the train and test splits; the "id" column becomes the row index of each frame
train, test = (
    pd.read_csv(csvName, index_col="id") for csvName in ("train.csv", "test.csv")
)

def flightDataPreprocess(trainData, testData, targetCol, dropFirst=True, binDistance=[0, 2400, 5000], binAge=[0, 14, 24, 44, 54, 64, 100]):

    """
    Pre-process the flight train/test dataframes into model-ready features.

    Every transformation is learned from the training data only and then
    applied unchanged to the testing data, so both outputs share the same
    scaling statistics and exactly the same columns.

    ## Meaning of parameters
    trainData: training data in dataframe (must contain targetCol)
    testData: testing data in dataframe (must contain targetCol)
    targetCol: column name to use as target
    dropFirst: drop the first level of each nominal variable when doing one-hot encoding
    binDistance: bin edges of "Flight Distance" used for discretization
    binAge: bin edges of "Age" used for discretization

    ## Return variables
    trainCopy: training data after pre-processing (target column removed)
    testCopy: testing data after pre-processing (target column removed)
    trainTarget: targets of training data
    testTarget: targets of testing data

    ## Notes
    - trainData and testData are not modified.
    - pd.cut builds right-closed intervals, so a value equal to the first bin
      edge or outside the edges becomes NaN.
    - Rating values outside 0..5 become NaN when cast to the ordered category.
    - A nominal level that appears only in testData is encoded as all zeros.
    """

    trainTarget = trainData.loc[:, targetCol]
    testTarget = testData.loc[:, targetCol]

    # drop() returns new frames, so the caller's dataframes stay untouched
    trainCopy = trainData.drop(targetCol, axis=1)
    testCopy = testData.drop(targetCol, axis=1)

    # Standardize numerical variables.
    # The scaler is fitted on the training data only; re-fitting on the test
    # data would scale the two sets with different means/variances.
    numVar = ["Departure Delay in Minutes", "Arrival Delay in Minutes"]
    scaler = StandardScaler().fit(trainCopy[numVar])
    trainCopy[numVar] = scaler.transform(trainCopy[numVar])
    testCopy[numVar] = scaler.transform(testCopy[numVar])

    # Convert categorical rating variables into ordinal
    catVar = ["Inflight wifi service", "Departure/Arrival time convenient", "Ease of Online booking",
        "Gate location", "Food and drink", "Online boarding", "Seat comfort", "Inflight entertainment",
        "On-board service", "Leg room service", "Baggage handling", "Checkin service", "Inflight service", "Cleanliness"]

    ptsType = CategoricalDtype(categories=[0, 1, 2, 3, 4, 5], ordered=True)  # Set the order of categories

    for var in catVar:
        trainCopy[var] = trainCopy[var].astype(ptsType)
        testCopy[var] = testCopy[var].astype(ptsType)

    # Discretize continuous variables with the (customizable) bin edges
    for frame in (trainCopy, testCopy):
        frame["Age"] = pd.cut(frame["Age"], binAge)
        frame["Flight Distance"] = pd.cut(frame["Flight Distance"], binDistance)

    # One-hot encoding nominal variables.
    # The levels of each variable are taken from the training data and imposed
    # on both frames; otherwise a level missing from one split would change
    # which dummy columns exist (and which level drop_first removes).
    nomiVar = ["Gender", "Customer Type", "Type of Travel", "Class"]

    for var in nomiVar:
        levelType = CategoricalDtype(categories=pd.Categorical(trainCopy[var]).categories)
        trainCopy[var] = trainCopy[var].astype(levelType)
        testCopy[var] = testCopy[var].astype(levelType)

    # columns=... replaces the nominal columns with their dummies (appended at
    # the end) and leaves every other column as it is
    trainCopy = pd.get_dummies(trainCopy, columns=nomiVar, drop_first=dropFirst)
    testCopy = pd.get_dummies(testCopy, columns=nomiVar, drop_first=dropFirst)

    return trainCopy, testCopy, trainTarget, testTarget

# Run the preprocessing on the loaded frames, using "satisfaction" as the target column
trainFil, testFil, trainTarget, testTarget = flightDataPreprocess(
    trainData=train,
    testData=test,
    targetCol="satisfaction",
)

# Check the type for each column
#trainFil.dtypes
#trainTarget.dtypes

Age                                  category
Flight Distance                      category
Inflight wifi service                category
Departure/Arrival time convenient    category
Ease of Online booking               category
Gate location                        category
Food and drink                       category
Online boarding                      category
Seat comfort                         category
Inflight entertainment               category
On-board service                     category
Leg room service                     category
Baggage handling                     category
Checkin service                      category
Inflight service                     category
Cleanliness                          category
Departure Delay in Minutes            float64
Arrival Delay in Minutes              float64
Gender_Male                             uint8
Customer Type_disloyal Customer         uint8
Type of Travel_Personal Travel          uint8
Class_Eco                         

In [None]:
# Disabled exploratory cell: the correlation-heatmap code below is wrapped in a
# triple-quoted string literal, so none of it is executed when the cell runs.
'''
import seaborn as sn
import matplotlib.pyplot as plt

# correlation of all variables
train4Corr = train.copy()
train4Corr["Satisfaction"] = trainTarget
train4Corr["Satisfaction"] = pd.Categorical(train4Corr["Satisfaction"])

# train4Corr.dtypes
corrMatrix = train4Corr.corr()
#sn.set(rc = {"figure.figsize":(20, 20)})
sn.heatmap(corrMatrix, annot=True)
plt.show()
'''