In [1]:
# Import relevant packages
import pandas as pd
import numpy as np
import warnings
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score, recall_score, precision_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import airportsdata
# installing airport data package
#!pip install -U airportsdata
import datetime 
import warnings
warnings.filterwarnings('ignore')

In [2]:
# introducing each dataset 
d_test='data/Test.csv'
d_sample='data/SampleSubmission.csv'
d_train='data/Train.csv'

In [3]:
df_test=pd.read_csv(d_test)
df_test.head(10)

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC
0,test_id_0,2016-05-04,TU 0700,DJE,TUN,2016-05-04 06:40:00,2016-05-04 07.30.00,ATA,TU 32AIMF
1,test_id_1,2016-05-05,TU 0395,TUN,BKO,2016-05-05 15:20:00,2016-05-05 20.05.00,ATA,TU 320IMW
2,test_id_2,2016-05-06,TU 0745,FRA,TUN,2016-05-06 10:00:00,2016-05-06 12.25.00,ATA,TU 32AIMC
3,test_id_3,2016-05-11,TU 0848,BEY,TUN,2016-05-11 09:40:00,2016-05-11 13.10.00,ATA,TU 31BIMO
4,test_id_4,2016-05-11,TU 0635,ORY,MIR,2016-05-11 09:50:00,2016-05-11 12.35.00,ATA,TU 736IOQ
5,test_id_5,2016-05-11,UG 1313,CDG,SFA,2016-05-11 11:10:00,2016-05-11 12.40.00,DEP,TU CR9ISA
6,test_id_6,2016-05-11,TU 0515,BCN,TUN,2016-05-11 10:40:00,2016-05-11 12.15.00,ATA,TU 32AIMF
7,test_id_7,2016-05-11,TU 0712,CMN,TUN,2016-05-11 10:35:00,2016-05-11 13.00.00,ATA,TU 32AIMN
8,test_id_8,2016-05-11,TU 0751,LYS,TUN,2016-05-11 10:40:00,2016-05-11 12.35.00,ATA,TU 320IMS
9,test_id_9,2016-05-11,TU 0745,FRA,TUN,2016-05-11 10:30:00,2016-05-11 12.55.00,ATA,TU 32AIMC


In [4]:
df_sample=pd.read_csv(d_sample)
df_sample.head(2)

Unnamed: 0,ID,target
0,test_id_0,2470
1,test_id_1,2944


In [5]:
df_train=pd.read_csv(d_train)
df_train.head(10)

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target
0,train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12.55.00,ATA,TU 32AIMN,260.0
1,train_id_1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16.55.00,ATA,TU 31BIMO,20.0
2,train_id_2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06.45.00,ATA,TU 32AIMN,0.0
3,train_id_3,2016-01-17,TU 0480,DJE,NTE,2016-01-17 14:10:00,2016-01-17 17.00.00,ATA,TU 736IOK,0.0
4,train_id_4,2016-01-17,TU 0338,TUN,ALG,2016-01-17 14:30:00,2016-01-17 15.50.00,ATA,TU 320IMU,22.0
5,train_id_5,2016-01-17,TU 0283,TLS,TUN,2016-01-17 16:20:00,2016-01-17 18.15.00,ATA,TU 736IOP,53.0
6,train_id_6,2016-01-18,TU 0514,TUN,BCN,2016-01-18 07:15:00,2016-01-18 09.00.00,ATA,TU 32AIMH,10.0
7,train_id_7,2016-01-18,TU 0716,TUN,ORY,2016-01-18 07:35:00,2016-01-18 09.55.00,ATA,TU 32AIMI,15.0
8,train_id_8,2016-01-18,TU 0752,TUN,FCO,2016-01-18 07:40:00,2016-01-18 09.00.00,ATA,TU 32AIMC,16.0
9,train_id_9,2016-01-18,TU 0996,TUN,NCE,2016-01-18 07:45:00,2016-01-18 09.15.00,ATA,TU 31AIMK,21.0


**Variables definition:**

* DATOP - Date of flight
* FLTID - Flight number
* DEPSTN - Departure point
* ARRSTN - Arrival point
* STD - Scheduled Time departure
* STA - Scheduled Time arrival
* STATUS - Flight status
* ETD - Expected Time departure
* ETA - Expected Time arrival
* ATD - Actual Time of Departure
* ATA - Actual Time of arrival
* DELAY1 - Delay code 1
* DUR1 - delay time 1
* DELAY2 - Delay code 2
* DUR2 - delay time 2
* DELAY3 - Delay code 3
* DUR3 - delay time 3
* DELAY4 - Delay code 4
* DUR4 - delay time 4
* AC - Aircraft Code

# Data Cleaning and Feature Engineering

### Data Cleaning

In [6]:
df=df_train.copy()

In [7]:
# Size
df.shape

(107833, 10)

In [8]:
# Some Infos about the Data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107833 entries, 0 to 107832
Data columns (total 10 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ID      107833 non-null  object 
 1   DATOP   107833 non-null  object 
 2   FLTID   107833 non-null  object 
 3   DEPSTN  107833 non-null  object 
 4   ARRSTN  107833 non-null  object 
 5   STD     107833 non-null  object 
 6   STA     107833 non-null  object 
 7   STATUS  107833 non-null  object 
 8   AC      107833 non-null  object 
 9   target  107833 non-null  float64
dtypes: float64(1), object(9)
memory usage: 8.2+ MB


In [9]:
# target describe
df.describe()

Unnamed: 0,target
count,107833.0
mean,48.733013
std,117.135562
min,0.0
25%,0.0
50%,14.0
75%,43.0
max,3451.0


In [10]:
# non Nan Values
df.isnull().sum()

ID        0
DATOP     0
FLTID     0
DEPSTN    0
ARRSTN    0
STD       0
STA       0
STATUS    0
AC        0
target    0
dtype: int64

#### Renaming Columns

In [11]:
# convert column names to lower case and omit the space before or between or after column names
df.columns= df.columns.str.lower().str.strip()
df.columns

Index(['id', 'datop', 'fltid', 'depstn', 'arrstn', 'std', 'sta', 'status',
       'ac', 'target'],
      dtype='object')

#### Finding out more about the data types of columns

In [12]:
# finding the data types of each column
df.dtypes

id         object
datop      object
fltid      object
depstn     object
arrstn     object
std        object
sta        object
status     object
ac         object
target    float64
dtype: object

In [13]:
# Number of Unique Variable 
df.nunique()

id        107833
datop       1011
fltid       1861
depstn       132
arrstn       128
std        81697
sta        85136
status         5
ac            68
target       968
dtype: int64

In [15]:
# find the categories of the categorical feature 'status'
df.status.unique()

array(['ATA', 'DEP', 'RTR', 'SCH', 'DEL'], dtype=object)

* ATA	Actual Time Of Arrival	
* DEP	Depart or Departure 
* RTR   Return to Ramp
* SCH   Scheduled change 
* DEL   


In [16]:
df.status.value_counts()

ATA    93679
SCH    13242
DEP      467
RTR      294
DEL      151
Name: status, dtype: int64

In [17]:
# change "datop" dtype to datetime with format %Y/%m/%d
df['datop'] = pd.to_datetime(df['datop'], format='%Y/%m/%d')

In [18]:
# change "std" dtype to datetime with format %Y/%m/%d
df['std'] = pd.to_datetime(df['std'], format='%Y/%m/%d %H:%M:%S')

In [19]:
# change "sta" format for example from  12.55.00 to 12:55:00
df['sta']=df['sta'].apply(lambda x: x.replace(".",":"))

In [20]:
# change "sta" dtype to datetime with format %Y/%m/%d
df['sta'] = pd.to_datetime(df['sta'], format='%Y/%m/%d %H:%M:%S')

In [21]:
df.head(1)

Unnamed: 0,id,datop,fltid,depstn,arrstn,std,sta,status,ac,target
0,train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12:55:00,ATA,TU 32AIMN,260.0


In [23]:
# some Infos about Data target Where Status == 'ATA' And Target > 0
df[(df.status == 'ATA') & (df.target > 0)].describe()

Unnamed: 0,target
count,69211.0
mean,75.330713
std,138.141402
min,1.0
25%,15.0
50%,30.0
75%,74.0
max,3451.0


Because Status Values related to the Traget make no Sense, we decided to drope the satus Column 

In [24]:
df.drop(columns='status', inplace = True)

In [25]:
df.head()

Unnamed: 0,id,datop,fltid,depstn,arrstn,std,sta,ac,target
0,train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12:55:00,TU 32AIMN,260.0
1,train_id_1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16:55:00,TU 31BIMO,20.0
2,train_id_2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06:45:00,TU 32AIMN,0.0
3,train_id_3,2016-01-17,TU 0480,DJE,NTE,2016-01-17 14:10:00,2016-01-17 17:00:00,TU 736IOK,0.0
4,train_id_4,2016-01-17,TU 0338,TUN,ALG,2016-01-17 14:30:00,2016-01-17 15:50:00,TU 320IMU,22.0


In [None]:
# knowing how many status are 'ATA' and also having more than 0 minute delay in target . 
#df[(df.status =='ATA') & (df.target != 0.0)]

### Checking for Duplicates

In [26]:
# making Copy to Check for Duplication
df_copy = df.copy()

In [27]:
# drop some Colum to analyse  Duplication
df_copy.drop(columns= ['id','target'], inplace=True )

In [28]:
df_copy.duplicated().sum()

5

In [29]:
df_copy[df_copy.duplicated()]

Unnamed: 0,datop,fltid,depstn,arrstn,std,sta,ac
23335,2016-10-13,TU 0431,LYS,LYS,2016-10-13 15:55:00,2016-10-13 18:00:00,TU 736IOR
24606,2016-10-13,TU 0752,TUN,TUN,2016-10-13 08:00:00,2016-10-13 09:20:00,TU 31BIMO
26716,2016-10-13,TU 0431,LYS,LYS,2016-10-13 15:55:00,2016-10-13 18:00:00,TU 736IOR
41876,2017-05-27,TU 0440,MIR,MIR,2017-05-27 11:50:00,2017-05-27 14:15:00,TU 320IMS
57786,2017-09-01,TU 0752,TUN,TUN,2017-09-01 08:20:00,2017-09-01 09:45:00,TU 320IMV


### now we check all these 5 Duplication and Decide if we keep it or drop it

In [30]:
# First Duplication --> different Target so we will keep it 
df[(df.datop == '2016-10-13') & (df.depstn == 'LYS') & (df.arrstn == 'LYS')]

Unnamed: 0,id,datop,fltid,depstn,arrstn,std,sta,ac,target
21099,train_id_21099,2016-10-13,TU 0431,LYS,LYS,2016-10-13 15:55:00,2016-10-13 18:00:00,TU 736IOR,35.0
23335,train_id_23335,2016-10-13,TU 0431,LYS,LYS,2016-10-13 15:55:00,2016-10-13 18:00:00,TU 736IOR,1831.0
26716,train_id_26716,2016-10-13,TU 0431,LYS,LYS,2016-10-13 15:55:00,2016-10-13 18:00:00,TU 736IOR,235.0


In [60]:
mean_1 =round(df[(df.datop == '2016-10-13') & (df.depstn == 'LYS') & (df.arrstn == 'LYS')].target.mean(),2)
mean_1

700.33

In [64]:
#df.xs('C')['x'] = 10
df[(df.datop == '2016-10-13') & (df.depstn == 'LYS') & (df.arrstn == 'LYS')].xs(21099)['target'] = mean_1

In [77]:
#df[(df.datop == '2016-10-13') & (df.depstn == 'LYS') & (df.arrstn == 'LYS')].set_value(21099,'target', mean_1)

In [73]:
df[(df.datop == '2016-10-13') & (df.depstn == 'LYS') & (df.arrstn == 'LYS')]

Unnamed: 0,id,datop,fltid,depstn,arrstn,std,sta,ac,target
21099,train_id_21099,2016-10-13,TU 0431,LYS,LYS,2016-10-13 15:55:00,2016-10-13 18:00:00,TU 736IOR,35.0
23335,train_id_23335,2016-10-13,TU 0431,LYS,LYS,2016-10-13 15:55:00,2016-10-13 18:00:00,TU 736IOR,1831.0
26716,train_id_26716,2016-10-13,TU 0431,LYS,LYS,2016-10-13 15:55:00,2016-10-13 18:00:00,TU 736IOR,235.0


In [None]:
# dropping the 2 least minute target delay rows from the duplications
#df=df.drop(index=[21099 , 26716])

In [38]:
# Second Duplication --> different Target so we will keep it 
df[(df.datop == '2016-10-13') & (df.depstn == 'TUN') & (df.arrstn == 'TUN') &(df.ac == 'TU 31BIMO')]

Unnamed: 0,id,datop,fltid,depstn,arrstn,std,sta,ac,target
23614,train_id_23614,2016-10-13,TU 0752,TUN,TUN,2016-10-13 08:00:00,2016-10-13 09:20:00,TU 31BIMO,0.0
24606,train_id_24606,2016-10-13,TU 0752,TUN,TUN,2016-10-13 08:00:00,2016-10-13 09:20:00,TU 31BIMO,42.0


In [32]:
# Third Duplication --> different Target so we will keep it 
df[(df.datop == '2017-05-27') & (df.depstn == 'MIR')& (df.arrstn == 'MIR') & (df.fltid == 'TU 0440 ')]

Unnamed: 0,id,datop,fltid,depstn,arrstn,std,sta,ac,target
39467,train_id_39467,2017-05-27,TU 0440,MIR,MIR,2017-05-27 11:50:00,2017-05-27 14:15:00,TU 320IMS,113.0
41876,train_id_41876,2017-05-27,TU 0440,MIR,MIR,2017-05-27 11:50:00,2017-05-27 14:15:00,TU 320IMS,48.0


In [33]:
# forth Duplicate --> different Target so we will keep it 
df[(df.datop == '2017-09-01') & (df.depstn == 'TUN')& (df.arrstn == 'TUN') & (df.fltid == 'TU 0752 ')]

Unnamed: 0,id,datop,fltid,depstn,arrstn,std,sta,ac,target
54682,train_id_54682,2017-09-01,TU 0752,TUN,TUN,2017-09-01 08:20:00,2017-09-01 09:45:00,TU 320IMV,148.0
57786,train_id_57786,2017-09-01,TU 0752,TUN,TUN,2017-09-01 08:20:00,2017-09-01 09:45:00,TU 320IMV,30.0


In [34]:
# The Duplication
df[df.duplicated()]

Unnamed: 0,id,datop,fltid,depstn,arrstn,std,sta,ac,target


In [35]:
df.duplicated().sum()

0

### Some Investigation

In [None]:
# Flights that starts from RTM have the Biggest delay 
df.groupby(['depstn']).describe().sort_values(by=('target', 'mean'), ascending=False).head(10)

In [None]:
# Flights that arrive from RTM have the Biggest delay 
df.groupby(['arrstn']).describe().sort_values(by=('target', 'mean'), ascending=False).head(10)

In [None]:
#
df.sort_values(by='target', ascending=False)

In [None]:
df.groupby(['arrstn']).describe()

In [None]:
# airport data information 
airports = airportsdata.load('IATA')  # key is the IATA location code
print(airports['TUN'])

In [None]:
airports['TUN']['lat']

In [None]:
df.head(10)

In [None]:
#df[(df.fltid == 'TU 0716 ')&(df.status =='RTR')][['datop','depstn','arrstn','fltid','std','sta','status','ac','target']].sort_values(by='std', ascending=True).head(40)

In [None]:
# invistigation the SXF Airoport
df[df.arrstn=='SXF']

In [None]:
# because SXF is old name of berlin airport and now it is BER we have to change all SXF in data set to BER
df['arrstn']=df['arrstn'].apply(lambda x: x.replace ('SXF','BER'))

In [None]:
# because SXF is old name of berlin airport and now it is BER we have to change all SXF in data set to BER
df['depstn']=df['depstn'].apply(lambda x: x.replace ('SXF','BER'))

In [None]:
# we made a new column as 'lat_arr' for the latitude of arrival destination ('arrstn')
df['lat_arr']=df['arrstn'].apply(lambda x: airports[x]['lat'])
# we made a new column as 'lon_arr' for the longitude of arrival destination ('arrstn')
df['lon_arr']=df['arrstn'].apply(lambda x: airports[x]['lon'])


In [None]:
# we made a new column as 'lat_dep' for the latitude of departure destination ('dpstn')
df['lat_dep']=df['depstn'].apply(lambda x: airports[x]['lat'])
# we made a new column as 'lon_dep' for the longitude of departure destination ('dpstn')
df['lon_dep']=df['depstn'].apply(lambda x: airports[x]['lon'])

In [None]:
def make_season(x):
    if  x =="03" or x =="04" or  x =="05":
        return "Spring"
    elif x =="06" or x =="07" or  x =="08":
        return "Summer"
    elif x =="09" or x =="10" or  x =="11":
        return "Autumn"
    else :
        return "Winter"

In [None]:
df.head(1)

In [None]:
df['year'] = pd.DatetimeIndex(df['datop']).year
df['month'] = df['datop'].dt.strftime('%m')
df['Season'] =  df.month.apply(lambda x: make_season(x))
df['weekday'] = pd.DatetimeIndex(df['datop']).weekday
# Arival Hour 
df['hour'] = pd.DatetimeIndex(df['std']).hour

In [None]:
# df['year_sta'] = pd.DatetimeIndex(df['sta']).year
# df['month_sta'] = df['sta'].dt.strftime('%m')
# df['Season_sta'] =  df.month_std.apply(lambda x: make_season(x))
# df['weekday_sta'] = pd.DatetimeIndex(df['sta']).weekday
# df['hour_sta'] = pd.DatetimeIndex(df['sta']).hour

In [None]:
df.head(1)

In [None]:
df.info()

In [None]:
#  std and datop muss be on The same Day
df[df['std'].dt.strftime('%Y-%m-%d') == df.datop].shape[0] == df.shape[0]

In [None]:
df.target.describe()

In [None]:
df_2=df[df.target==0 ]
df_2.head(1)

In [None]:
type(df_2.target[2])

In [None]:
# Trying the One-hot-encoding On Sex
# One_hot = pd.get_dummies(df.status)
# One_hot

# df = df.join(One_hot)
# df.head()

In [None]:
# cor = df[(df.target !=0)].corr()
# plt.figure(figsize=(15,10))
# sns.heatmap(cor, annot=True, cmap=plt.cm.Greens)
# plt.show()

In [None]:
# cor = df.corr()
# plt.figure(figsize=(15,10))
# sns.heatmap(cor, annot=True, cmap=plt.cm.Greens)
# plt.show()

In [None]:
# Plot correlation matrix 
mask = np.triu(df.corr())
plt.figure(figsize = (15,10))
ax = sns.heatmap(round(df.corr(), 1)
                 ,annot=True
                 ,mask=mask
                 ,cmap='RdBu_r')

In [None]:
df['trajectory'] = df['depstn'] + '-' + df['arrstn']
df['trajectory_duration'] = df['sta'] - df['std']

In [None]:
#df['trajectory_duration'] = pd.DatetimeIndex(df['trajectory_duration']).hour

In [None]:
df.dtypes

In [None]:
df.head()

In [None]:
le = LabelEncoder()
# enc = OneHotEncoder(sparse=False)
df['depstn'] = le.fit_transform(df['depstn'])
df['arrstn'] = le.fit_transform(df['arrstn'])
df['ac_cat'] = le.fit_transform(df['ac'])
df['fltid_cat'] = le.fit_transform(df['fltid'])
df['trajectory_cat'] = le.fit_transform(df['trajectory'])
df['season_cat'] = le.fit_transform(df['Season'])

In [None]:
df.head(1)

### Visualisation

In [None]:
#sns.pairplot(data=df_clean , vars=['id', 'datop', 'fltid', 'depstn', 'arrstn', 'std', 'sta', 'status','ac', 'target'])

In [None]:
# sns.pairplot(data=df , hue='status')

In [None]:
df.head(1)

In [None]:
df.columns

In [None]:
fig = plt.figure(1, figsize=(25,25))
df2 = df.loc[:, ['ac', 'target']]
df2 = df2.drop(df2[df2.target == 0].index)
ax = sns.stripplot(x="target", y="ac", data=df2, size = 12, linewidth = 2,  jitter=True)
plt.xlabel('Departure delay', fontsize=18, bbox={'facecolor':'midnightblue', 'pad':5},
           color='w', labelpad=30)
ax.yaxis.label.set_visible(False)
plt.tight_layout(w_pad=3) 

In [None]:
df.head()

In [None]:
# Function that define how delays are grouped
delay_type = lambda x:((0,1)[x > 5],2)[x > 45]
df['delay_level'] = df['target'].apply(delay_type)
#____________________________________________________
fig = plt.figure(1, figsize=(70,70))
ax = sns.countplot(y="ac", hue='delay_level', data=df)
#____________________________________________________________________________________
# We replace the abbreviations by the full names of the companies and set the labels
labels = df['ac'].unique().tolist()
ax.set_yticklabels(labels)
plt.setp(ax.get_xticklabels(), fontsize=24, weight = 'normal', rotation = 0);
plt.setp(ax.get_yticklabels(), fontsize=24, weight = 'bold', rotation = 0);
ax.yaxis.label.set_visible(False)
plt.xlabel('Flight count', fontsize=24, weight = 'bold', labelpad=10)
#________________
# Set the legend
L = plt.legend()
L.get_texts()[0].set_text('on time (t < 5 min)')
L.get_texts()[1].set_text('small delay (5 < t < 45 min)')
L.get_texts()[2].set_text('large delay (t > 45 min)')
plt.show()

**Relationship between the origin airport and delays**

In [None]:
print("Nb of airports: {}".format(df['depstn'].nunique()))

In [None]:
list_of_airpots = df['depstn'].unique().tolist()
list_of_ac = df['ac'].unique().tolist()
origin_nb = dict()
for carrier in list_of_ac:
    liste_origin_airport = df[df['ac'] == carrier]['depstn'].unique().tolist()
    origin_nb[carrier] = len(liste_origin_airport)


In [None]:
test_df = pd.DataFrame.from_dict(origin_nb, orient='index')
test_df.rename(columns = {0:'count'}, inplace = True)
ax = test_df.plot(kind='bar', figsize = (25,25))
labels = [x for x in list_of_ac]
ax.set_xticklabels(labels)
plt.ylabel('Number of airports visited', fontsize=14, weight = 'bold', labelpad=12)
plt.setp(ax.get_xticklabels(), fontsize=11, ha = 'right', rotation = 80)
ax.legend().set_visible(False)
plt.show()

**number of flights per year**

In [None]:
df.head(1)

In [None]:
flights_dict = dict()
ac_group = df.groupby('ac')
for x in list_of_ac:
    df_ac = ac_group.get_group(x)
    flights_dict[x] = df[df['ac'] == x]['year'].value_counts().to_dict()

In [None]:
flights_dict

In [None]:
# flights_dict
nbr_flights = pd.DataFrame.from_dict(flights_dict, orient='index')
nbr_flights.fillna(0, inplace=True)
# nbr_flights

In [None]:
nbr_flights.columns = ['flights_in_2018', 'flights_in_2017', 'flights_in_2016']

In [None]:
df.head(1)

# ML

In [None]:
df.info()

In [None]:
df.columns

In [None]:
df_log=df[['depstn', 'arrstn', 'year', 'month',
       'season_cat', 'weekday', 'hour',
       'ac_cat', 'fltid_cat', 'trajectory_cat', 'delay_level']]

In [None]:
# sns.pairplot(df_log, hue='delay_level', kind='scatter', diag_kind='hist', height=2.5 , size=None) 

In [None]:
flights_dict = dict()
ac_group = df.groupby('ac')
for x in list_of_ac:
    df_ac = ac_group.get_group(x)
    flights_dict[x] = df[df['ac'] == x]['year'].value_counts().to_dict()

In [None]:
# Plotting the target variable
plt.title('delay_level')
sns.countplot(x=df_log.delay_level)

In [None]:
# Compute correlations
correlations = df_log.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(correlations)
mask[np.triu_indices_from(mask)] = True

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(correlations, mask=mask, cmap=cmap, vmax=1, annot=True,
            linewidths=.5, cbar_kws={"shrink": .7});

## Splitting data for testing 

In [None]:
# Defining X and y
features = df_log.columns.tolist()
features.remove('delay_level')
X = df_log[features]
y = df_log.delay_level

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=150, shuffle=True)

# Check the shape of the data sets
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

In [None]:
X

In [None]:
X_train.hour.value_counts()

### Base Line Model ###

In [None]:
#Baseline Model: Every flight is started between 9-18 has a small delay
def baseline_model(base):
    y_pred = [True if x >=9 and x<=18 else False for x in base.hour]
    return y_pred

In [None]:
# Compute predictions with baseline model for test set
y_baseline_test = baseline_model(X_test)

In [None]:
# Plot confusion matrix for baseline model
cm = confusion_matrix(y_test, y_baseline_test)
sns.heatmap(cm, cmap="YlGnBu", annot=True, fmt='d');

In [None]:
accuracy_score(y_test, y_baseline_test)

In [None]:
df_log.dtypes

In [None]:
# Logistic Regression
from sklearn.metrics import confusion_matrix, classification_report, f1_score
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

y_pred_train = log_reg.predict(X_train)
y_pred = log_reg.predict(X_test)

# Print accuracy of our model
print("Accuracy on train set:", round(accuracy_score(y_train, y_pred_train), 2))
print("Accuracy on test set:", round(accuracy_score(y_test, y_pred), 2))
print("--------"*10)

# Print classification report of our model
print(classification_report(y_test, y_pred))
print("--------"*10)

# Evaluate the model with a confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, cmap='YlGnBu', annot=True, fmt='d', linewidths=.5);

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [None]:
#dropping Quakers column and unnamed
#changing one of the altitude to log and droping the original
X_train["altitude_mean_log"] = np.log(X_train["altitude_mean_meters"])
X_train.drop(['altitude_mean_meters'], axis=1, inplace=True)
X_train.drop(['Quakers'], axis=1, inplace=True)
X_train.drop(['Unnamed: 0'], axis=1, inplace=True)

In [None]:
X_train.info()

In [None]:
altitude_low_meters_mean = X_train["altitude_low_meters"].mean()
altitude_high_meters_mean = X_train["altitude_high_meters"].mean()
altitude_mean_log_mean = X_train["altitude_mean_log"].mean()

In [None]:
# fillna with mean.. 
X_train["altitude_low_meters"] = X_train["altitude_low_meters"].fillna(altitude_low_meters_mean)
X_train["altitude_high_meters"] = X_train["altitude_high_meters"].fillna(altitude_high_meters_mean)
X_train["altitude_mean_log"] = X_train["altitude_mean_log"].fillna(altitude_mean_log_mean)

In [None]:
print(f"altitude low meters mean is {altitude_low_meters_mean}")
print(f"altitude_high_meters_mean is {altitude_high_meters_mean}")
print(f"altitude_mean_log_mean is {altitude_mean_log_mean}")

## Trainining the model

In [None]:
## in order to exemplify how the predict will work.. we will save the y_train
X_test.to_csv("data/X_test.csv")
y_test.to_csv("data/y_test.csv")

In [None]:
#training the model
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error
y_train_pred = reg.predict(X_train)
mse = mean_squared_error(y_train, y_train_pred)
print(mse)

In [None]:
#dropping Quakers column and unnamed
#changing one of the altitude to log and droping the original
X_test["altitude_mean_log"] = np.log(X_test["altitude_mean_meters"])
X_test.drop(['altitude_mean_meters'], axis=1, inplace=True)
X_test.drop(['Quakers'], axis=1, inplace=True)
X_test.drop(['Unnamed: 0'], axis=1, inplace=True)
# fillna with mean.. 
X_test["altitude_low_meters"] = X_test["altitude_low_meters"].fillna(altitude_low_meters_mean)
X_test["altitude_high_meters"] = X_test["altitude_high_meters"].fillna(altitude_high_meters_mean)
X_test["altitude_mean_log"] = X_test["altitude_mean_log"].fillna(altitude_mean_log_mean)

In [None]:
y_test_pred = reg.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
print(mse)