In [30]:
import pandas as pd
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [10]:

# train_file = '../../smalldata/prostate/prostate_train.csv'
# test_file = '../../smalldata/prostate/prostate_test.csv'
# prediction_column = 'CAPSULE'

train_file = "../../smalldata/testng/airlines_train_preprocessed.csv"
test_file = "../../smalldata/testng/airlines_test_preprocessed.csv"
prediction_column = 'IsDepDelayed'

In [11]:
data_train = pd.read_csv(train_file)
data_test = pd.read_csv(test_file)

In [12]:
data_train

Unnamed: 0,IsDepDelayed,fYear,fMonth,fDayofMonth,fDayOfWeek,UniqueCarrier,Origin,Dest,Distance
0,1,1987,10,14,3,0,0,0,447
1,0,1987,10,18,7,0,0,0,447
2,1,1987,10,19,1,0,0,0,447
3,0,1987,10,21,3,0,0,0,447
4,1,1987,10,23,5,0,0,0,447
...,...,...,...,...,...,...,...,...,...
95,1,1987,10,30,5,0,4,0,337
96,1,1987,10,1,4,0,5,4,370
97,1,1987,10,3,6,0,5,4,370
98,1,1987,10,5,1,0,5,4,370


In [20]:
clf = tree.DecisionTreeClassifier(criterion="entropy").fit(data_train.drop([prediction_column], axis=1), data_train[prediction_column])

In [21]:
y_pred = clf.predict(data_test.drop([prediction_column], axis=1))

In [22]:
y_test = data_test[prediction_column] 

In [23]:
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)


In [24]:
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
print('Precision: %.3f' % precision_score(y_test, y_pred))
print('Recall: %.3f' % recall_score(y_test, y_pred))
print('F1: %.3f' % f1_score(y_test, y_pred))


Accuracy: 0.460
Precision: 0.481
Recall: 0.765
F1: 0.591


In [25]:
y_pred

array([1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1])

In [33]:
def preprocess_airlines_data(data: pd.DataFrame):
    for c in data.columns:
        # print(data[c].dtype)
        # print(data[c].value_counts())
        data.IsDepDelayed = data.IsDepDelayed.apply(lambda v: v if v in [0, 1] else (1 if v == 'YES' else 0))
        data.fYear = data.fYear.apply(lambda v: v.replace("f", ""))
        data.fMonth = data.fMonth.apply(lambda v: v.replace("f", ""))
        data.fDayofMonth = data.fDayofMonth.apply(lambda v: v.replace("f", ""))
        data.fDayOfWeek = data.fDayOfWeek.apply(lambda v: v.replace("f", ""))
        # unique_carrier = {"US": 0, "UA": 1, "HP": 2, "PS": 3, "WN": 4, "DL": 5, "PI": 6, "AA": 7, "TW": 8, "CO": 9}
        unique_carrier = list(data.UniqueCarrier.unique())
        data.UniqueCarrier = data.UniqueCarrier.apply(lambda v: unique_carrier.index(v) if v in unique_carrier else v)
        origin = list(data.Origin.unique())
        data.Origin = data.Origin.apply(lambda v: origin.index(v) if v in origin else v)
        dest = list(data.Dest.unique())
        data.Dest = data.Dest.apply(lambda v: dest.index(v) if v in dest else v)
        print(data)
    return data.sample(len(data), random_state=5)

In [34]:
preprocess_airlines_data(pd.read_csv("../../smalldata/testng/airlines_train.csv")).to_csv("../../smalldata/testng/airlines_train_preprocessed.csv", index=False)
preprocess_airlines_data(pd.read_csv("../../smalldata/testng/airlines_test.csv")).to_csv("../../smalldata/testng/airlines_test_preprocessed.csv", index=False)

       IsDepDelayed fYear fMonth fDayofMonth fDayOfWeek  UniqueCarrier  \
0                 1  1987     10          14          3              0   
1                 0  1987     10          18          7              0   
2                 1  1987     10          19          1              0   
3                 0  1987     10          21          3              0   
4                 1  1987     10          23          5              0   
...             ...   ...    ...         ...        ...            ...   
24416             1  2000      1           5          3              2   
24417             1  2000      1           6          4              2   
24418             1  2000      1           7          5              2   
24419             1  2000      1           8          6              2   
24420             1  2000      1           9          7              2   

       Origin  Dest  Distance  
0           0     0       447  
1           0     0       447  
2           0  