In [1]:
import zipfile
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

import utils 
    
    
train = pd.read_csv("flight_delays_train.csv")
train

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y
...,...,...,...,...,...,...,...,...,...
99995,c-5,c-4,c-3,1618,OO,SFO,RDD,199,N
99996,c-1,c-18,c-3,804,CO,EWR,DAB,884,N
99997,c-1,c-24,c-2,1901,NW,DTW,IAH,1076,N
99998,c-4,c-27,c-4,1515,MQ,DFW,GGG,140,N


In [2]:
utils.clean_alpha(train)
utils.encode(train)

train

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,8,21,7,1934,0,18,78,732,N
1,4,20,3,1548,18,217,171,834,N
2,9,2,5,1422,20,228,59,416,N
3,11,25,6,1015,15,78,175,872,N
4,10,7,6,1828,19,174,199,423,Y
...,...,...,...,...,...,...,...,...,...
99995,5,4,3,1618,15,246,224,199,N
99996,1,18,3,804,4,92,72,884,N
99997,1,24,2,1901,13,85,131,1076,N
99998,4,27,4,1515,12,79,107,140,N


In [3]:
X = train.iloc[:, :-1]
y = train.iloc[:, -1]

X.shape, y.shape

((100000, 8), (100000,))

In [4]:
cols = X.columns.to_list()
X[cols] = X[cols].astype('int64')
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype
---  ------         --------------   -----
 0   Month          100000 non-null  int64
 1   DayofMonth     100000 non-null  int64
 2   DayOfWeek      100000 non-null  int64
 3   DepTime        100000 non-null  int64
 4   UniqueCarrier  100000 non-null  int64
 5   Origin         100000 non-null  int64
 6   Dest           100000 non-null  int64
 7   Distance       100000 non-null  int64
dtypes: int64(8)
memory usage: 6.1 MB


In [5]:
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()

X = scaler.fit_transform(X)

X[0], type(X)

(array([ 0.42841367,  0.6022536 ,  1.53049018,  1.24371514, -1.83438959,
        -1.61742586, -0.82102413,  0.00452972]),
 numpy.ndarray)

In [6]:
y = y.map({'N':0, 'Y':1})
y = y.to_numpy(dtype=int)
y

array([0, 0, 0, ..., 0, 0, 0])

### Lightning

In [7]:
import pytorch_lightning as pl
import torchvision
import torch
from torch.utils.data import DataLoader
import torchmetrics
import torch.nn as nn
import torch.optim as optim


In [8]:
# convert into PyTorch tensors
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32).reshape(-1, 1)
 
# train-test split for evaluation of the model
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

(torch.Size([70000, 8]),
 torch.Size([30000, 8]),
 torch.Size([70000, 1]),
 torch.Size([30000, 1]))

In [9]:
train_loader = DataLoader(list(zip(X_train,y_train)), shuffle=True, batch_size=64,)
#test_loader = DataLoader(list(zip(X_test,y_test)), shuffle=True, batch_size=16)
test_loader = DataLoader(X_test, shuffle=True, batch_size=16)

In [10]:


class Model(pl.LightningModule):
  def __init__(self):

    super(Model,self).__init__()
    
    self.input_layer = nn.Linear(8, 16)
    self.hidden_layer = nn.Linear(16, 8)
    self.output_layer = nn.Linear(8,1)
    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()
    self.loss = nn.MSELoss()
    self.accuracy = torchmetrics.Accuracy(task="binary")

  def forward(self, input):
    x = self.input_layer(input)
    x = self.relu(x)
    x = self.hidden_layer(x)
    x = self.relu(x)
    x = self.sigmoid(x)
    output = self.output_layer(x)
    return output

  def configure_optimizers(self):
    params = self.parameters()
    optimizer = optim.Adam(params=params, lr = 0.01)
    return optimizer

  def training_step(self, batch, batch_idx):
    x, y = batch
    outputs = self(x) 
    loss = self.loss(outputs, y)
    train_accuracy = self.accuracy(outputs, y)
    self.log('train_accuracy', train_accuracy, prog_bar=True)
    self.log('train_loss', loss, prog_bar=True)
    return {"loss":loss, "train_accuracy":train_accuracy}
  
  def predict_step(self, batch, batch_idx, dataloader_idx=0):
        return self(batch)

In [11]:
model = Model()

trainer = pl.Trainer(max_epochs=1)

trainer.fit(model, train_dataloaders=train_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type           | Params
------------------------------------------------
0 | input_layer  | Linear         | 144   
1 | hidden_layer | Linear         | 136   
2 | output_layer | Linear         | 9     
3 | relu         | ReLU           | 0     
4 | sigmoid      | Sigmoid        | 0     
5 | loss         | MSELoss        | 0     
6 | accuracy     | BinaryAccuracy | 0     
------------------------------------------------
289       Trainable params
0         Non-trainable params
289       Total params
0.001     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


In [12]:
test = pd.read_csv("flight_delays_test.csv")
utils.clean_alpha(test)
utils.encode(test)
test

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,7,25,3,615,19,194,217,598
1,4,17,2,739,17,155,129,1235
2,12,2,7,651,11,121,208,577
3,3,25,7,1614,17,49,183,377
4,6,6,3,1505,15,209,270,258
...,...,...,...,...,...,...,...,...
99995,6,5,2,852,17,70,129,187
99996,11,24,6,1446,15,209,154,1515
99997,1,30,2,1509,14,209,254,438
99998,1,5,5,804,6,162,19,761


In [13]:
cols = test.columns.to_list()
test[cols] = test[cols].astype('int64')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype
---  ------         --------------   -----
 0   Month          100000 non-null  int64
 1   DayofMonth     100000 non-null  int64
 2   DayOfWeek      100000 non-null  int64
 3   DepTime        100000 non-null  int64
 4   UniqueCarrier  100000 non-null  int64
 5   Origin         100000 non-null  int64
 6   Dest           100000 non-null  int64
 7   Distance       100000 non-null  int64
dtypes: int64(8)
memory usage: 6.1 MB


In [18]:
test = scaler.fit_transform(test)
test.shape, type(test)

((100000, 8), numpy.ndarray)

In [19]:
test = torch.tensor(test, dtype=torch.float32)
test.shape, type(test)

(torch.Size([100000, 8]), torch.Tensor)

In [20]:
test_loader = DataLoader(test)

In [27]:
y_predicted = trainer.predict(model, test_loader)
y_predicted

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

[tensor([[0.0755]]),
 tensor([[0.0785]]),
 tensor([[0.0749]]),
 tensor([[0.2717]]),
 tensor([[0.1915]]),
 tensor([[0.0935]]),
 tensor([[0.0771]]),
 tensor([[0.2581]]),
 tensor([[0.1752]]),
 tensor([[0.2383]]),
 tensor([[0.2903]]),
 tensor([[0.2024]]),
 tensor([[0.1325]]),
 tensor([[0.2793]]),
 tensor([[0.0925]]),
 tensor([[0.2894]]),
 tensor([[0.0923]]),
 tensor([[0.3712]]),
 tensor([[0.4376]]),
 tensor([[0.0726]]),
 tensor([[0.1883]]),
 tensor([[0.0717]]),
 tensor([[0.2314]]),
 tensor([[0.1873]]),
 tensor([[0.3419]]),
 tensor([[0.3094]]),
 tensor([[0.0735]]),
 tensor([[0.0741]]),
 tensor([[0.3895]]),
 tensor([[0.2130]]),
 tensor([[0.1671]]),
 tensor([[0.2726]]),
 tensor([[0.0775]]),
 tensor([[0.3456]]),
 tensor([[0.4012]]),
 tensor([[0.2607]]),
 tensor([[0.2799]]),
 tensor([[0.1883]]),
 tensor([[0.1540]]),
 tensor([[0.2801]]),
 tensor([[0.1737]]),
 tensor([[0.2549]]),
 tensor([[0.2673]]),
 tensor([[0.3082]]),
 tensor([[0.2290]]),
 tensor([[0.2562]]),
 tensor([[0.0702]]),
 tensor([[0.1

In [28]:
import numpy as np

y_tensor= y_predicted.copy()



In [31]:
y_predicted

[tensor([[0.0755]]),
 tensor([[0.0785]]),
 tensor([[0.0749]]),
 tensor([[0.2717]]),
 tensor([[0.1915]]),
 tensor([[0.0935]]),
 tensor([[0.0771]]),
 tensor([[0.2581]]),
 tensor([[0.1752]]),
 tensor([[0.2383]]),
 tensor([[0.2903]]),
 tensor([[0.2024]]),
 tensor([[0.1325]]),
 tensor([[0.2793]]),
 tensor([[0.0925]]),
 tensor([[0.2894]]),
 tensor([[0.0923]]),
 tensor([[0.3712]]),
 tensor([[0.4376]]),
 tensor([[0.0726]]),
 tensor([[0.1883]]),
 tensor([[0.0717]]),
 tensor([[0.2314]]),
 tensor([[0.1873]]),
 tensor([[0.3419]]),
 tensor([[0.3094]]),
 tensor([[0.0735]]),
 tensor([[0.0741]]),
 tensor([[0.3895]]),
 tensor([[0.2130]]),
 tensor([[0.1671]]),
 tensor([[0.2726]]),
 tensor([[0.0775]]),
 tensor([[0.3456]]),
 tensor([[0.4012]]),
 tensor([[0.2607]]),
 tensor([[0.2799]]),
 tensor([[0.1883]]),
 tensor([[0.1540]]),
 tensor([[0.2801]]),
 tensor([[0.1737]]),
 tensor([[0.2549]]),
 tensor([[0.2673]]),
 tensor([[0.3082]]),
 tensor([[0.2290]]),
 tensor([[0.2562]]),
 tensor([[0.0702]]),
 tensor([[0.1

In [37]:
stacked_0 = torch.stack(y_predicted, dim=0)
b = stacked_0.numpy()
c = np.squeeze(b)
c

array([0.07554395, 0.07854979, 0.07489923, ..., 0.23425172, 0.08586088,
       0.097095  ], dtype=float32)

In [38]:
sample = pd.read_csv('sample_submission.csv')
submission = pd.DataFrame({'id': sample['id'], "dep_delayed_15min":c})
submission

Unnamed: 0,id,dep_delayed_15min
0,0,0.075544
1,1,0.078550
2,2,0.074899
3,3,0.271680
4,4,0.191496
...,...,...
99995,99995,0.091353
99996,99996,0.256069
99997,99997,0.234252
99998,99998,0.085861


In [41]:
submission.to_csv('submission.csv', index=False)