In [1]:
# Read the data
import pandas as pd
import glob

path = r'/home/lstefan/ProjectData/HoustonData'
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0).dropna()
    li.append(df)

df = pd.concat(li, axis=0, ignore_index=True)

In [2]:
cancelled = 0
diverted = 0
delayed = 0
onTime = 0
for i in df["LABEL"]:
    if (i == 0):
        onTime+=1
    elif (i == 1):
        delayed+=1
    elif (i == 2):
        diverted+=1
    elif (i == 3):
        cancelled+=1
print("On Time: ", onTime)
print("Delayed: ", delayed)
print("Diverted: ", diverted)
print("Cancelled: ", cancelled)

On Time:  419741
Delayed:  109000
Diverted:  1482
Cancelled:  5378


In [3]:
# Separates the X predicters and Y output and then into training and test sets
from sklearn.model_selection import train_test_split
X = df.drop(columns=['Unnamed: 0','LABEL'])
y = df['LABEL'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Uses OneHotEncoder
import category_encoders as ce

ohe = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)
X_train_ohe = ohe.fit_transform(X_train)
X_train_ohe.head()

Unnamed: 0,UNIQUE_CARRIER_CO,UNIQUE_CARRIER_XE,UNIQUE_CARRIER_EV,UNIQUE_CARRIER_UA,UNIQUE_CARRIER_US,UNIQUE_CARRIER_OO,UNIQUE_CARRIER_AS,UNIQUE_CARRIER_AA,UNIQUE_CARRIER_9E,UNIQUE_CARRIER_DL,...,DEST_STATE_ABR_PR,DEST_STATE_ABR_WV,DEST_STATE_ABR_SD,DEST_STATE_ABR_AK,DEST_STATE_ABR_WY,YEAR,MONTH,DAY_OF_WEEK,CRS_DEP_TIME,DISTANCE
370483,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2011,6,2,1750,305.0
492609,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2011,2,1,1530,668.0
162845,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2010,4,6,1535,657.0
48600,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2011,11,2,1530,305.0
386569,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2010,8,3,1645,1208.0


In [5]:
# Store X_train_ohe to csv file

#X_train_ohe.to_csv('/home/lstefan/ProjectData/HoustonData/AggregatedData/X_train_ohe.csv', index=False)

In [6]:
# Transforms the test set to numerical dummy columns
X_test_ohe = ohe.transform(X_test)
X_test_ohe.head()

Unnamed: 0,UNIQUE_CARRIER_CO,UNIQUE_CARRIER_XE,UNIQUE_CARRIER_EV,UNIQUE_CARRIER_UA,UNIQUE_CARRIER_US,UNIQUE_CARRIER_OO,UNIQUE_CARRIER_AS,UNIQUE_CARRIER_AA,UNIQUE_CARRIER_9E,UNIQUE_CARRIER_DL,...,DEST_STATE_ABR_PR,DEST_STATE_ABR_WV,DEST_STATE_ABR_SD,DEST_STATE_ABR_AK,DEST_STATE_ABR_WY,YEAR,MONTH,DAY_OF_WEEK,CRS_DEP_TIME,DISTANCE
290135,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,2012,8,4,1731,643.0
63158,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2011,7,4,855,1208.0
48209,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2011,11,4,1526,140.0
295876,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,2012,8,3,1755,1346.0
19603,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2011,5,4,1740,1635.0


In [7]:
# Store X_test_ohe to csv file

#X_test_ohe.to_csv('/home/lstefan/ProjectData/HoustonData/AggregatedData/X_test_ohe.csv', index=False)

In [35]:
# Fits the training data to the KNN classifier
from sklearn.neighbors import KNeighborsClassifier  

classifier = KNeighborsClassifier(n_neighbors=655)  
classifier.fit(X_train_ohe, y_train)  

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=655, p=2,
           weights='uniform')

In [36]:
# Gets the predicted values for y 
y_pred = classifier.predict(X_test_ohe)
cancelled = 0
diverted = 0
delayed = 0
onTime = 0
for i in y_pred:
    if (i == 0):
        onTime+=1
    elif (i == 1):
        delayed+=1
    elif (i == 2):
        diverted+=1
    elif (i == 3):
        cancelled+=1
print("On Time: ", onTime)
print("Delayed: ", delayed)
print("Diverted: ", diverted)
print("Cancelled: ", cancelled)

On Time:  107121
Delayed:  0
Diverted:  0
Cancelled:  0


In [31]:
# Outputs a confusion matrix and classification report
from sklearn.metrics import classification_report, confusion_matrix 

print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))  

[[67164 15792   206   835]
 [15445  6019    68   221]
 [  227    67     1     2]
 [  791   248     1    34]]
             precision    recall  f1-score   support

          0       0.80      0.80      0.80     83997
          1       0.27      0.28      0.27     21753
          2       0.00      0.00      0.00       297
          3       0.03      0.03      0.03      1074

avg / total       0.69      0.68      0.68    107121



In [32]:
# Check the test set score 
print("Test set score: {:.2f}".format(classifier.score(X_test_ohe, y_test)))

Test set score: 0.68
