## Decision Trees for multiclass classification

### Import Data

In [2]:
#! pip install xgboost
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, f1_score
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

### Read all the CSV files

In [3]:
%%time
# Load the flights fact table. low_memory=False disables chunk-wise parsing,
# which avoids mixed-type inference within a column.
df = pd.read_csv(
    'CSV/flights_fact.csv',
    low_memory=False,
)

CPU times: total: 16.2 s
Wall time: 17.5 s


In [4]:
# Load the aircraft dimension and expose its 'N-Number' column as 'TAIL_NUMBER',
# the key used when merging it with the fact table.
aircraft_dim = pd.read_csv('CSV/aircraft_dim.csv').rename(
    columns={'N-Number': 'TAIL_NUMBER'}
)

In [5]:
# Load the airlines lookup and rename its 'IATA_CODE' column to 'AIRLINE'.
# NOTE(review): this frame does not take part in the merge cell and the name
# `airlines` is re-bound later in the notebook - confirm it is still needed.
airlines = pd.read_csv('CSV/airlines.csv').rename(
    columns={'IATA_CODE': 'AIRLINE'}
)

In [6]:
# Load the date dimension (merged with the fact table on 'dateKey').
date_dim = pd.read_csv(
    'CSV/date_dim.csv'
)

In [7]:
# Load the arrival-airport dimension; its 'IATA_CODE' column becomes
# 'DESTINATION_AIRPORT' so it can be merged on that key.
arrivalAirport_dim = pd.read_csv('CSV/arrivalAirport_dim.csv').rename(
    columns={'IATA_CODE': 'DESTINATION_AIRPORT'}
)

In [8]:
# Load the departure-airport dimension; its 'IATA_CODE' column becomes
# 'ORIGIN_AIRPORT' so it can be merged on that key.
departureAirport_dim = pd.read_csv('CSV/departureAirport_dim.csv').rename(
    columns={'IATA_CODE': 'ORIGIN_AIRPORT'}
)

In [9]:
# Load the departure-time dimension (merged on 'scheduledDeparture_key').
departureTime_dim = pd.read_csv(
    'CSV/departureTime_dim.csv'
)

In [10]:
# Load the arrival-time dimension (merged on 'scheduledArrival_key').
arrivalTime_dim = pd.read_csv(
    'CSV/arrivalTime_dim.csv'
)

* Merge the fact table with the dimension tables

In [11]:
%%time

# Inner-join the fact table with each dimension table in turn.
# Each pair is (dimension frame, join key); the order matches the original
# sequence so the automatic _x/_y column suffixes come out the same.
dimension_joins = [
    (aircraft_dim, 'TAIL_NUMBER'),
    (date_dim, 'dateKey'),
    (arrivalAirport_dim, 'DESTINATION_AIRPORT'),
    (departureAirport_dim, 'ORIGIN_AIRPORT'),
    (departureTime_dim, 'scheduledDeparture_key'),
    (arrivalTime_dim, 'scheduledArrival_key'),
]
for dimension, join_key in dimension_joins:
    # how='inner' keeps only fact rows that have a match in the dimension
    df = df.merge(dimension, on=join_key, how='inner')

CPU times: total: 26.1 s
Wall time: 27.5 s


* Define a helper function `get_value` that maps a row to its class label — 0 for `early`, 1 for `on_time`, 2 for `delayed`, according to which of those indicator columns equals 1 — and store the result in a new `class` column.

In [None]:
%%time 

def get_value(df):
    """Return the class label encoded by a row's indicator columns.

    Parameters
    ----------
    df : mapping-like row (e.g. the Series passed by ``DataFrame.apply(axis=1)``)
        Must provide the keys 'early', 'on_time' and 'delayed'.

    Returns
    -------
    int or None
        0 if 'early' equals 1, otherwise 1 if 'on_time' equals 1, otherwise
        2 if 'delayed' equals 1; None when none of the three equals 1.
    """
    # Position in the tuple is the label; the order is the priority order.
    for label, flag_column in enumerate(('early', 'on_time', 'delayed')):
        if df[flag_column] == 1:
            return label
    return None

# Build the 'class' target column without a Python-level call per row.
# Same rule as get_value: the first of early / on_time / delayed that equals 1
# decides the label (0 / 1 / 2); rows where none equals 1 get a missing value.
flag_columns = ['early', 'on_time', 'delayed']  # list position == class label
has_flag = df[flag_columns].eq(1)
df['class'] = (
    has_flag.astype('int8')
    .idxmax(axis=1)  # name of the first flag column that is set
    .map({name: label for label, name in enumerate(flag_columns)})
    .where(has_flag.any(axis=1))  # blank out rows with no flag set
)


### Print merged Dataframe 

* Create two helper functions that compute the scheduled departure and scheduled arrival times by subtracting the corresponding delay from the actual time, so that these columns can be used correctly by the model

In [10]:
def subtract_columns_Departure(row):
    """Return row['DEPARTURE_TIME'] minus row['DEPARTURE_DELAY'].

    NOTE(review): if DEPARTURE_TIME is stored as a clock value (e.g. HHMM) and
    the delay is in minutes, a plain subtraction does not give a valid clock
    time - confirm the units of both columns.
    """
    actual_departure = row['DEPARTURE_TIME']
    departure_delay = row['DEPARTURE_DELAY']
    return actual_departure - departure_delay

def subtract_columns_Arrival(row):
    """Return row['ARRIVAL_TIME'] minus row['ARRIVAL_DELAY'].

    NOTE(review): if ARRIVAL_TIME is stored as a clock value (e.g. HHMM) and
    the delay is in minutes, a plain subtraction does not give a valid clock
    time - confirm the units of both columns.
    """
    actual_arrival = row['ARRIVAL_TIME']
    arrival_delay = row['ARRIVAL_DELAY']
    return actual_arrival - arrival_delay

* Compute the scheduled times (actual time minus delay) and write them to the `scheduledDeparture_key` and `scheduledArrival_key` columns

In [None]:
# Overwrite the two key columns with "actual time minus delay".
# Column-wise subtraction gives the same values as applying the
# subtract_columns_* helpers row by row, without a Python call per row.
df['scheduledDeparture_key'] = df['DEPARTURE_TIME'] - df['DEPARTURE_DELAY']
df['scheduledArrival_key'] = df['ARRIVAL_TIME'] - df['ARRIVAL_DELAY']

In [None]:
# List every column of the merged frame, including the derived 'class' column
df.columns

### Remove unwanted columns

In [None]:
# Columns removed before modelling. This includes the 'early' / 'on_time' /
# 'delayed' indicators and ARRIVAL_DELAY, from which the 'class' target and
# the scheduled arrival key were derived.
# ('YEAR' appeared twice in the original list; it is listed once here.)
unwanted_columns = [
    'YEAR', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'TAXI_OUT', 'WHEELS_OFF',
    'ELAPSED_TIME', 'AIR_TIME', 'WHEELS_ON', 'TAXI_IN', 'CANCELLATION_REASON',
    'ARRIVAL_DELAY', 'early', 'on_time', 'delayed', 'new_delay', 'DIVERTED',
    'CANCELLED', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
    'WEATHER_DELAY', 'LATE_AIRCRAFT_DELAY', 'num_engines', 'num_passengers',
    'Year Mfr', 'TypeAcft', 'type', 'AircraftMFRModelCode',
    'COUNTRY_x', 'COUNTRY_y',
]
df = df.drop(columns=unwanted_columns)


In [None]:
# Display the frame after the unwanted columns have been dropped
df

In [None]:
# Number of missing values per column
df.isna().sum()

In [None]:
# Discard rows that lack any of the four airport coordinate values
coordinate_columns = ['LATITUDE_x', 'LATITUDE_y', 'LONGITUDE_x', 'LONGITUDE_y']
df = df.dropna(subset=coordinate_columns)

In [None]:
# Re-check missing values per column after dropping rows without coordinates
df.isna().sum()

In [None]:
# Summary of column dtypes, non-null counts and memory usage
df.info()

### Perform Correlation Statistic Test

In [None]:
# Correlation heatmap over numeric/boolean columns only.
# df still holds string columns at this point (e.g. AIRLINE, ORIGIN_AIRPORT),
# and DataFrame.corr() raises on those in pandas >= 2.0, where non-numeric
# columns are no longer dropped silently.
numeric_df = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr())

In [None]:
# Correlation of every numeric feature with the target.

# Select the target variable
y = df['class']

# Select the feature columns; keep numeric/boolean ones only, because
# corrwith() cannot correlate the string columns still present in df
# (pandas >= 2.0 raises instead of skipping them).
X = df.drop(columns=['class']).select_dtypes(include=['number', 'bool'])

# Correlation of each feature column with the target
corr = X.corrwith(y)

# Sort the correlations in descending order
corr_sorted = corr.sort_values(ascending=False)

# Show the sorted correlations
corr_sorted



In [None]:
# Display the frame before the categorical columns are encoded
df

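* Restrict the data to flights between the 40 most frequent origin airports and the 40 most frequent destination airports, then one-hot encode the categorical columns (`AIRLINE`, `ORIGIN_AIRPORT`, `DESTINATION_AIRPORT`) with `pd.get_dummies`.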

In [None]:
# Number of flights per airline code (this re-binds the name `airlines`)
airlines = df['AIRLINE'].value_counts()

# The 40 most frequent origin and destination airport codes;
# value_counts() sorts by descending frequency, so head(40) is the top 40.
origin_counts = df['ORIGIN_AIRPORT'].value_counts()
destination_counts = df['DESTINATION_AIRPORT'].value_counts()
top_40or = origin_counts.head(40).index.to_list()
top_40ar = destination_counts.head(40).index.to_list()

In [None]:
# Keep only flights whose origin AND destination are both in the top-40 lists
origin_in_top = df['ORIGIN_AIRPORT'].isin(top_40or)
destination_in_top = df['DESTINATION_AIRPORT'].isin(top_40ar)
ex_df = df[origin_in_top & destination_in_top]
ex_df

In [None]:
%%time
# Create dummy variables for the 'AIRLINE', 'ORIGIN_AIRPORT' and
# 'DESTINATION_AIRPORT' columns.
# Origin and destination dummies are built from the same set of airport codes,
# so without a prefix they would get identical column names and become
# indistinguishable once concatenated. Prefixing keeps every dummy column unique.
dummies = pd.get_dummies(ex_df['AIRLINE'], prefix='AIRLINE')
dummies1 = pd.get_dummies(ex_df['ORIGIN_AIRPORT'], prefix='ORIGIN')
dummies2 = pd.get_dummies(ex_df['DESTINATION_AIRPORT'], prefix='DEST')

dummies2

In [None]:
# Append the three blocks of dummy columns to ex_df, in the order
# airline -> origin -> destination, with a single column-wise concat
ex_df = pd.concat([ex_df, dummies, dummies1, dummies2], axis=1)

# The original categorical columns are now redundant
encoded_source_columns = ['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT']
ex_df = ex_df.drop(columns=encoded_source_columns)

In [None]:
# List the columns after one-hot encoding
ex_df.columns

* Info and analysis of data

In [None]:
# Count how many columns contain at least one null (True) versus none (False)
ex_df.isnull().any().value_counts()

In [None]:
# Remove the remaining name / label columns before modelling
descriptive_columns = [
    'mfr_name', 'model_name', 'fullDate', 'month_name', 'day_name',
    'AIRPORT_x', 'CITY_x', 'STATE_x',
    'AIRPORT_y', 'CITY_y', 'STATE_y',
]
ex_df = ex_df.drop(columns=descriptive_columns)

In [None]:
# List the columns that remain after dropping the descriptive ones
ex_df.columns

In [None]:
# Standardise the int64/float64 columns of ex_df; the target 'class' is
# excluded so its labels stay untouched.
# The column list is taken from ex_df itself (the frame being scaled) rather
# than from df, so the selection cannot drift from the data it is applied to.
#
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows contribute to the mean and variance used for scaling. Consider
# fitting on the training rows only.
numeric_columns = ex_df.select_dtypes(include=['int64', 'float64']).columns.drop('class')

# Store the data from the numerical columns in a NumPy array
X = ex_df[numeric_columns].values

# Create the scaler, then fit and scale in one step
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Write the scaled values back into the DataFrame
ex_df[numeric_columns] = X_scaled

In [None]:
# Display the frame after scaling the numeric columns
ex_df

### Decision Trees

* Assign to x all the features that we will use for prediction and assign to y the class feature that we want to predict

In [None]:
# Columns left out of the decision-tree feature matrix: the target itself
# plus the actual-time and calendar/clock columns listed here
tree_excluded_columns = [
    'class', 'ARRIVAL_TIME', 'DEPARTURE_TIME', 'DAY_OF_WEEK', 'DAY', 'MONTH',
    'Minute_y', 'Minute_x', 'Hour_x', 'Hour_y',
]
x = ex_df.drop(columns=tree_excluded_columns)

# Target: the 0/1/2 class label
y = ex_df['class']

In [None]:
# Feature columns passed to the decision trees
x.columns

* Split into training and testing sets with a 75:25 ratio (`test_size=0.25`)

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

* Create a DecisionTreeClassifier based on the entropy

In [None]:
%%time
# Decision tree that chooses splits by information gain (entropy criterion)
classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
)
# fit() returns the fitted estimator, so `model` and `classifier` are the same object
model = classifier.fit(x_train, y_train)

* Predicting the test set result based on the Entropy

In [None]:
# Predicted class labels for the test set from the entropy-based tree
y_pred = classifier.predict(x_test)

In [None]:
# Depth reached by the fitted entropy tree (no max_depth was set on the classifier)
print(classifier.tree_.max_depth)

* Create a DecisionTreeClassifier based on the Gini index

In [None]:
%%time
# Decision tree that chooses splits by Gini impurity
classifier1 = DecisionTreeClassifier(
    criterion='gini',
    random_state=0,
)
# fit() returns the fitted estimator, so `model1` and `classifier1` are the same object
model1 = classifier1.fit(x_train, y_train)

* Predicting the test set result based on the gini index

In [None]:
# Predicted class labels for the test set from the Gini-based tree
y_pred1 = classifier1.predict(x_test)

### Print the classification report and interpret the results

In [None]:
# Per-class precision / recall / F1 for the entropy-based tree
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

# Micro-averaged F1: computed globally over all classes rather than per class
f1 = f1_score(y_true=y_test, y_pred=y_pred, average="micro")
print("F1 score:", f1)

In [None]:
# Per-class precision / recall / F1 for the Gini-based tree
report = classification_report(y_true=y_test, y_pred=y_pred1)
print(report)

# Micro-averaged F1: computed globally over all classes rather than per class
f1 = f1_score(y_true=y_test, y_pred=y_pred1, average="micro")
print("F1 score:", f1)

### XGBOOST

In [None]:
# Keep only the first column for each column name; any later column that
# repeats an earlier name is dropped.
# NOTE(review): origin and destination dummy columns generated from the same
# airport codes share names, so this step would discard the destination
# dummies - verify that this is intended.
is_first_occurrence = ~ex_df.columns.duplicated()
ex_df = ex_df.loc[:, is_first_occurrence]

In [None]:
# Columns left out of the XGBoost feature matrix: the target, the actual
# times, and the month / hour columns listed here
xgb_excluded_columns = [
    'class', 'ARRIVAL_TIME', 'DEPARTURE_TIME', 'MONTH', 'Hour_x', 'Hour_y',
]
x = ex_df.drop(columns=xgb_excluded_columns)

# Target: the 0/1/2 class label
y = ex_df['class']

In [None]:
# Same 75/25 split and seed as for the decision trees, on the XGBoost features
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
%%time
# Gradient-boosted classifier: 100 boosting rounds, trees up to depth 20
clf = XGBClassifier(
    max_depth=20,
    n_estimators=100,
)

# Train on the training split
clf.fit(x_train, y_train)

# Predicted class labels for the held-out test split
predictions = clf.predict(x_test)


In [None]:
# Per-class precision / recall / F1 for the XGBoost model
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

# Micro-averaged F1: computed globally over all classes rather than per class
f1 = f1_score(y_true=y_test, y_pred=predictions, average="micro")
print("F1 score:", f1)

In [None]:
# Calculate the confusion matrix for the test set.
# The result is stored under its own name: assigning it to `confusion_matrix`
# would overwrite the imported function and make this cell fail when re-run.
conf_matrix = confusion_matrix(y_test, predictions)

# Print the confusion matrix
print(conf_matrix)

In [None]:
# Plot feature importance by 'weight' (how often a feature is used to split),
# limited to the 11 highest-ranked features; no columns are excluded beforehand
xgb.plot_importance(
    clf,
    importance_type='weight',
    max_num_features=11,
)

### Evaluate the model 

In [None]:
# Score of the fitted XGBoost model on held-out data and on the data it was
# trained on; a large gap between the two indicates overfitting.
test_score = clf.score(x_test, y_test)
train_score = clf.score(x_train, y_train)

# Report both scores, test first
for score_line in (
    f"Test score: {test_score:.3f}",
    f"Train score: {train_score:.3f}",
):
    print(score_line)