In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Load Data: read both splits the same way, with the "id" column as the row index
train, test = (
    pd.read_csv(csvName, index_col="id") for csvName in ("train.csv", "test.csv")
)

# Function for preprocessing
def flightDataPreprocess(trainData, testData, targetCol, dropFirst=True, binDistance=[0, 2400, 5000], binAge=[0, 14, 24, 44, 54, 64, 100]):

    """
    Pre-process the training and testing dataframes for modelling.

    Scaling statistics and one-hot category levels are learned from the
    training data only and then applied unchanged to the testing data, so the
    two returned feature frames share the same scale and the same columns.

    ## Meaning of parameters
    trainData: training data in dataframe (left unmodified)
    testData: testing data in dataframe (left unmodified)
    targetCol: column name to use as target; must be present in both dataframes
    dropFirst: drop first column when doing one-hot encoding
    binDistance: bin edges of Flight Distance when doing k-bins discretization
    binAge: bin edges of Age when doing k-bins discretization

    ## Return variables
    trainCopy: training data after pre-processing
    testCopy: testing data after pre-processing
    trainTarget: targets of training data
    testTarget: targets of testing data
    """

    # Separate targets from features. drop() returns new frames and the targets
    # are copied, so nothing below can write back into the caller's data.
    trainTarget = trainData.loc[:, targetCol].copy()
    testTarget = testData.loc[:, targetCol].copy()

    trainCopy = trainData.drop(targetCol, axis=1)
    testCopy = testData.drop(targetCol, axis=1)

    # Standardize numerical variables.
    # The scaler is fitted on the training data only; the testing data is
    # transformed with those same statistics. Fitting a second scaler on the
    # testing data would put the two sets on different scales.
    numVar = ["Departure Delay in Minutes", "Arrival Delay in Minutes"]
    scaler = StandardScaler().fit(trainCopy[numVar])
    trainCopy[numVar] = scaler.transform(trainCopy[numVar])
    testCopy[numVar] = scaler.transform(testCopy[numVar])

    # Convert categorical variables into ordinal
    catVar = ["Inflight wifi service", "Departure/Arrival time convenient", "Ease of Online booking",
        "Gate location", "Food and drink", "Online boarding", "Seat comfort", "Inflight entertainment",
        "On-board service", "Leg room service", "Baggage handling", "Checkin service", "Inflight service", "Cleanliness"]

    ptsType = CategoricalDtype(categories=[0, 1, 2, 3, 4, 5], ordered=True)  # Set the order of categories
    ordinalCast = {var: ptsType for var in catVar}
    trainCopy = trainCopy.astype(ordinalCast)
    testCopy = testCopy.astype(ordinalCast)

    # Discretize continuous variables with the caller-supplied bin edges.
    # pd.cut builds right-closed intervals by default; values that fall outside
    # the edges become NaN.
    trainCopy["Age"] = pd.cut(trainCopy["Age"], binAge)
    testCopy["Age"] = pd.cut(testCopy["Age"], binAge)

    trainCopy["Flight Distance"] = pd.cut(trainCopy["Flight Distance"], binDistance)
    testCopy["Flight Distance"] = pd.cut(testCopy["Flight Distance"], binDistance)

    # One-hot encoding nominal variables
    nomiVar = ["Gender", "Customer Type", "Type of Travel", "Class"]

    # Pin each nominal column to the category levels seen in the training data.
    # get_dummies then emits one column per level for both frames, even when the
    # testing data lacks a level, and dropFirst removes the same level in each.
    # Levels that only occur in the testing data become NaN (all-zero dummies).
    for var in nomiVar:
        nomiType = trainCopy[var].astype("category").dtype
        trainCopy[var] = trainCopy[var].astype(nomiType)
        testCopy[var] = testCopy[var].astype(nomiType)

    trainDummy = pd.get_dummies(trainCopy[nomiVar], drop_first=dropFirst)
    testDummy = pd.get_dummies(testCopy[nomiVar], drop_first=dropFirst)

    # Swap the raw nominal columns for their dummy columns (same row index).
    trainCopy = pd.concat([trainCopy.drop(nomiVar, axis=1), trainDummy], axis=1)
    testCopy = pd.concat([testCopy.drop(nomiVar, axis=1), testDummy], axis=1)

    return trainCopy, testCopy, trainTarget, testTarget

# trainFil, testFil, trainTarget, testTarget = flightDataPreprocess(trainData = train, testData = test, targetCol="satisfaction")


In [None]:
# Sweep the "Flight Distance" binarization threshold and, for every threshold,
# record the k-nearest-neighbours test accuracy for k = 1..50.
# Result: thresD[str(threshold)] == {"x": [k, ...], "y": [accuracy, ...]}.
#
# Expects trainFil, testFil, trainTarget and testTarget to already exist in the
# kernel; the flightDataPreprocess call that would create them is commented out
# in the first cell.
# NOTE(review): the comparison below needs a numeric "Flight Distance" column,
# while flightDataPreprocess bins that column with pd.cut - confirm which
# version of the data this sweep is meant to run on.
thresD = dict()
for d in range(500, 4000, 500):
    print(d)

    # Binarize flight distance on different threshold: 1 when the distance is
    # strictly greater than d, otherwise 0. assign() returns new frames, so
    # trainFil/testFil keep their original column for the next threshold.
    trainFilCopy = trainFil.assign(**{"Flight Distance": (trainFil["Flight Distance"] > d).astype(int)})
    testFilCopy = testFil.assign(**{"Flight Distance": (testFil["Flight Distance"] > d).astype(int)})

    # KNeighborsClassifier
    kNeighbor = []
    accTrend = []
    for n in range(1, 51):
        print(n)
        knn = KNeighborsClassifier(n_neighbors=n).fit(trainFilCopy, trainTarget)
        pred = knn.predict(testFilCopy)
        eva = classification_report(y_true=testTarget, y_pred=pred, output_dict=True)
        kNeighbor.append(n)
        accTrend.append(eva["accuracy"])

    thresD[str(d)] = {"x": kNeighbor, "y": accTrend}

In [None]:
# NOTE(review): this cell performs the same threshold sweep as the cell directly
# above it and overwrites thresD with the same kind of result; one of the two
# cells can be removed.
#
# For each "Flight Distance" threshold, binarize the column and record the
# k-nearest-neighbours test accuracy for k = 1..50 as
# thresD[str(threshold)] == {"x": [k, ...], "y": [accuracy, ...]}.
# Expects trainFil, testFil, trainTarget and testTarget to already exist in the
# kernel, with a numeric "Flight Distance" column.
thresD = dict()
for d in range(500, 4000, 500):
    print(d)

    # Binarize flight distance on different threshold: values strictly above d
    # become 1, everything else 0. New frames are built each time so the
    # source frames are never modified.
    trainFilCopy = trainFil.assign(**{"Flight Distance": (trainFil["Flight Distance"] > d).astype(int)})
    testFilCopy = testFil.assign(**{"Flight Distance": (testFil["Flight Distance"] > d).astype(int)})

    # KNeighborsClassifier
    kNeighbor = []
    accTrend = []
    for n in range(1, 51):
        print(n)
        knn = KNeighborsClassifier(n_neighbors=n).fit(trainFilCopy, trainTarget)
        pred = knn.predict(testFilCopy)
        eva = classification_report(y_true=testTarget, y_pred=pred, output_dict=True)
        kNeighbor.append(n)
        accTrend.append(eva["accuracy"])

    thresD[str(d)] = {"x": kNeighbor, "y": accTrend}

### Distance Threshold - KNeighbors - Accuracy

In [8]:
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

# Disabled experiment: the threshold / k-NN sweep below is wrapped in a bare
# triple-quoted string, so none of it executes. Because the string is the last
# expression in the cell, the notebook echoes it back as the cell's output.
# NOTE(review): the text refers to `pre.Binarizer`, but this cell imports
# `preprocessing` without a `pre` alias - it would need fixing before re-enabling.
'''
thresD = dict()
for d in range(500, 4000, 500):
    print(d)
    trainFilCopy = trainFil.copy()
    testFilCopy = testFil.copy()

    # Binarize flight distance on different threshold
    dist = trainFilCopy.loc[:, "Flight Distance"]
    binarizer = pre.Binarizer(threshold=d)
    distBi = binarizer.transform([dist]).T
    trainFilCopy.loc[:, "Flight Distance"] = distBi

    dist = testFilCopy.loc[:, "Flight Distance"]
    binarizer = pre.Binarizer(threshold=d)
    distBi = binarizer.transform([dist]).T
    testFilCopy.loc[:, "Flight Distance"] = distBi
    
    # KNeighborsClassifier
    kNeighbor = []
    accTrend = []
    for n in range(1, 51):
        print(n)
        knn = KNeighborsClassifier(n_neighbors=n).fit(trainFilCopy, trainTarget)
        pred = knn.predict(testFilCopy)
        eva = classification_report(y_true=testTarget, y_pred=pred, output_dict=True)
        kNeighbor.append(n)
        accTrend.append(eva["accuracy"])

    dictn = {"x":kNeighbor, "y":accTrend}
    thresD[str(d)] = dictn
'''

'\nthresD = dict()\nfor d in range(500, 4000, 500):\n    print(d)\n    trainFilCopy = trainFil.copy()\n    testFilCopy = testFil.copy()\n\n    # Binarize flight distance on different threshold\n    dist = trainFilCopy.loc[:, "Flight Distance"]\n    binarizer = pre.Binarizer(threshold=d)\n    distBi = binarizer.transform([dist]).T\n    trainFilCopy.loc[:, "Flight Distance"] = distBi\n\n    dist = testFilCopy.loc[:, "Flight Distance"]\n    binarizer = pre.Binarizer(threshold=d)\n    distBi = binarizer.transform([dist]).T\n    testFilCopy.loc[:, "Flight Distance"] = distBi\n    \n    # KNeighborsClassifier\n    kNeighbor = []\n    accTrend = []\n    for n in range(1, 51):\n        print(n)\n        knn = KNeighborsClassifier(n_neighbors=n).fit(trainFilCopy, trainTarget)\n        pred = knn.predict(testFilCopy)\n        eva = classification_report(y_true=testTarget, y_pred=pred, output_dict=True)\n        kNeighbor.append(n)\n        accTrend.append(eva["accuracy"])\n\n    dictn = {"x":kNe

In [9]:
# Disabled plotting code: held in a bare triple-quoted string, so nothing here
# runs; the notebook only echoes the string as the cell's output.
# The text would draw one accuracy-vs-k line per distance threshold from thresD.
# NOTE(review): figure.figsize is set after the plt.plot calls, so presumably it
# would not resize the figure those calls create - confirm before re-enabling.
'''
for d in range(500, 4000, 500):
    plt.plot(thresD[str(d)]["x"], thresD[str(d)]["y"], label = str(d))
    
plt.rcParams["figure.figsize"] = (8, 6)  
plt.xlabel("K-Neighbors")
plt.ylabel("Accuracy")
plt.legend()
plt.show()
'''

'\nfor d in range(500, 4000, 500):\n    plt.plot(thresD[str(d)]["x"], thresD[str(d)]["y"], label = str(d))\n    \nplt.rcParams["figure.figsize"] = (8, 6)  \nplt.xlabel("K-Neighbors")\nplt.ylabel("Accuracy")\nplt.legend()\nplt.show()\n'

In [11]:
import numpy as np
# Scratch check: negate every flag of a plain boolean list element-wise
a = [True, True, False]
b = ~np.asarray(a)  # bitwise NOT on a boolean array flips each value
print(b)

[False False  True]
