<a href="https://colab.research.google.com/github/denistoo749/Flight-delays/blob/main/flight_delays.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Flight delays
**1. Problem**
- Predict whether a flight will be delayed for more than 15 minutes.

**2. Data**
- flight_delays_train.csv – training set
-flight_delays_test.csv – test set
-sample_submission.csv

>https://www.kaggle.com/competitions/flight-delays-fall-2018/data

**3. Evaluation**
- Target metric is ROC AUC.

**4. Features:**
```
Month, DayofMonth, DayOfWeek
DepTime – departure time
UniqueCarrier – code of a company-career
Origin – flight origin
Dest – flight destination
Distance, distance between Origin and Dest airports
dep_delayed_15min – target
```

In [1]:
# Unzip file
#!unzip '/content/drive/MyDrive/Flight delays/flight-delays-fall-2018.zip' -d '/content/drive/MyDrive/Flight delays/'

In [2]:
# Unzip file
#!unzip '/content/drive/MyDrive/Flight delays/flight_delays_train.csv.zip' -d '/content/drive/MyDrive/Flight delays/data/'

# Setup

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [4]:
# Read the train dataset
train_df = pd.read_csv('/content/drive/MyDrive/Flight delays/data/flight_delays_train.csv')
train_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [5]:
train_df['dep_delayed_15min'].unique()

array(['N', 'Y'], dtype=object)

In [6]:
train_df.isna().sum()

Month                0
DayofMonth           0
DayOfWeek            0
DepTime              0
UniqueCarrier        0
Origin               0
Dest                 0
Distance             0
dep_delayed_15min    0
dtype: int64

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Month              100000 non-null  object
 1   DayofMonth         100000 non-null  object
 2   DayOfWeek          100000 non-null  object
 3   DepTime            100000 non-null  int64 
 4   UniqueCarrier      100000 non-null  object
 5   Origin             100000 non-null  object
 6   Dest               100000 non-null  object
 7   Distance           100000 non-null  int64 
 8   dep_delayed_15min  100000 non-null  object
dtypes: int64(2), object(7)
memory usage: 6.9+ MB


In [8]:
x = train_df.drop('dep_delayed_15min', axis=1)
y = train_df['dep_delayed_15min']

In [9]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['Month',	'DayofMonth',	'DayOfWeek',	'DepTime',	'UniqueCarrier',	'Origin',	'Dest',	'Distance']
one_hot = OneHotEncoder(handle_unknown='ignore')
transformer = ColumnTransformer([('one_hot',
                                 one_hot,
                                 categorical_features)],
                               remainder='passthrough')
transformed_x = transformer.fit_transform(x)
transformed_x

<100000x3260 sparse matrix of type '<class 'numpy.float64'>'
	with 800000 stored elements in Compressed Sparse Row format>

In [10]:
pd.DataFrame(transformed_x)

Unnamed: 0,0
0,"(0, 10)\t1.0\n (0, 25)\t1.0\n (0, 49)\t1.0..."
1,"(0, 6)\t1.0\n (0, 24)\t1.0\n (0, 45)\t1.0\..."
2,"(0, 11)\t1.0\n (0, 23)\t1.0\n (0, 47)\t1.0..."
3,"(0, 2)\t1.0\n (0, 29)\t1.0\n (0, 48)\t1.0\..."
4,"(0, 1)\t1.0\n (0, 40)\t1.0\n (0, 48)\t1.0\..."
...,...
99995,"(0, 7)\t1.0\n (0, 37)\t1.0\n (0, 45)\t1.0\..."
99996,"(0, 0)\t1.0\n (0, 21)\t1.0\n (0, 45)\t1.0\..."
99997,"(0, 0)\t1.0\n (0, 28)\t1.0\n (0, 44)\t1.0\..."
99998,"(0, 6)\t1.0\n (0, 31)\t1.0\n (0, 46)\t1.0\..."


In [11]:
# Split the data
x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.2, random_state=42)

In [12]:
# Fit the model
np.random.seed(42)

model = RandomForestClassifier()
model.fit(x_train, y_train)

In [13]:
model.score(x_test, y_test)

0.81125

In [14]:
# Unzip test file
#!unzip '/content/drive/MyDrive/Flight delays/data/flight_delays_test.csv.zip' -d '/content/drive/MyDrive/Flight delays/data/'

In [15]:
test_df = pd.read_csv('/content/drive/MyDrive/Flight delays/data/flight_delays_test.csv')
test_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


In [16]:
# Transform the test dataset using the same transformer
transformed_test_x = transformer.transform(test_df)

In [17]:
y_probs = model.predict_proba(transformed_test_x)
y_probs_positive = y_probs[:, 1]

In [18]:
sub = pd.DataFrame({'id': range(len(test_df)), 'dep_delayed_15min': y_probs_positive})
sub.to_csv('/content/drive/MyDrive/Flight delays/data/submission.csv', index=False)

In [19]:
submission = pd.read_csv('/content/drive/MyDrive/Flight delays/data/submission.csv')
submission.head()

Unnamed: 0,id,dep_delayed_15min
0,0,0.05
1,1,0.08
2,2,0.21
3,3,0.3
4,4,0.21


# Hyperparameter Tuning with RandomizedSearchCV

In [28]:
grid = {'n_estimators': [100, 200, 500, 1200],
       'max_depth': [None, 10, 20, 30],
       'min_samples_split': [2, 4, 6],
       'min_samples_leaf': [1, 2, 4]}

# Setup RandomizedSearchCV
rs_model = RandomizedSearchCV(estimator=model,
                           param_distributions=grid,
                           n_iter=10, # number of models to try
                           cv=5,
                           verbose=True)

# Fit the model
rs_model.fit(x_train, y_train)

best_params = rs_model.best_params_
best_score = rs_model.best_score_

print(f'Best parameters: {best_params}')
print(f'Best score: {best_score}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters: {'n_estimators': 500, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_depth': None}
Best score: 0.8091750000000001


In [29]:
# Prediction probabilities
y_probs = rs_model.predict_proba(transformed_test_x)
y_probs_positive = y_probs[:, 1]

In [30]:
sub = pd.DataFrame({'id': range(len(test_df)), 'dep_delayed_15min': y_probs_positive})
sub.to_csv('/content/drive/MyDrive/Flight delays/data/submission.csv', index=False)

In [31]:
submission = pd.read_csv('/content/drive/MyDrive/Flight delays/data/submission.csv')
submission.head()

Unnamed: 0,id,dep_delayed_15min
0,0,0.156036
1,1,0.151478
2,2,0.181222
3,3,0.204899
4,4,0.257624


# Hyperparameter Tuning with GridSearchCV