In [17]:
import zipfile
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

import utils 
    
    
train = pd.read_csv("flight_delays_train.csv")
train

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y
...,...,...,...,...,...,...,...,...,...
99995,c-5,c-4,c-3,1618,OO,SFO,RDD,199,N
99996,c-1,c-18,c-3,804,CO,EWR,DAB,884,N
99997,c-1,c-24,c-2,1901,NW,DTW,IAH,1076,N
99998,c-4,c-27,c-4,1515,MQ,DFW,GGG,140,N


In [18]:
utils.clean_alpha(train)
utils.encode(train)

train

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,8,21,7,1934,0,18,78,732,N
1,4,20,3,1548,18,217,171,834,N
2,9,2,5,1422,20,228,59,416,N
3,11,25,6,1015,15,78,175,872,N
4,10,7,6,1828,19,174,199,423,Y
...,...,...,...,...,...,...,...,...,...
99995,5,4,3,1618,15,246,224,199,N
99996,1,18,3,804,4,92,72,884,N
99997,1,24,2,1901,13,85,131,1076,N
99998,4,27,4,1515,12,79,107,140,N


In [19]:
X = train.iloc[:, :-1]
y = train.iloc[:, -1]

X.shape, y.shape

((100000, 8), (100000,))

In [20]:
cols = X.columns.to_list()
X[cols] = X[cols].astype('int64')
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype
---  ------         --------------   -----
 0   Month          100000 non-null  int64
 1   DayofMonth     100000 non-null  int64
 2   DayOfWeek      100000 non-null  int64
 3   DepTime        100000 non-null  int64
 4   UniqueCarrier  100000 non-null  int64
 5   Origin         100000 non-null  int64
 6   Dest           100000 non-null  int64
 7   Distance       100000 non-null  int64
dtypes: int64(8)
memory usage: 6.1 MB


In [21]:
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()

X = scaler.fit_transform(X)

X[0], type(X)

(array([ 0.42841367,  0.6022536 ,  1.53049018,  1.24371514, -1.83438959,
        -1.61742586, -0.82102413,  0.00452972]),
 numpy.ndarray)

In [22]:
y = y.map({'N':0, 'Y':1})
y = y.to_numpy(dtype=int)
y

array([0, 0, 0, ..., 0, 0, 0])

### Lightning

In [27]:
import pytorch_lightning as pl
import torchvision
import torch
from torch.utils.data import DataLoader
import torchmetrics
import torch.nn as nn
import torch.optim as optim


In [25]:
# convert into PyTorch tensors
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32).reshape(-1, 1)
 
# train-test split for evaluation of the model
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

  X = torch.tensor(X, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.float32).reshape(-1, 1)


(torch.Size([70000, 8]),
 torch.Size([30000, 8]),
 torch.Size([70000, 1]),
 torch.Size([30000, 1]))

In [36]:
train_loader = DataLoader(list(zip(X_train,y_train)), shuffle=True, batch_size=16,)
#test_loader = DataLoader(list(zip(X_test,y_test)), shuffle=True, batch_size=16)
test_loader = DataLoader(X_test, shuffle=True, batch_size=16)

In [33]:


class Model(pl.LightningModule):
  def __init__(self):

    super(Model,self).__init__()
    
    self.input_layer = nn.Linear(8, 16)
    self.hidden_layer = nn.Linear(16, 8)
    self.output_layer = nn.Linear(8,1)
    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()
    self.loss = nn.MSELoss()
    self.accuracy = torchmetrics.Accuracy(task="binary")

  def forward(self, input):
    x = self.input_layer(input)
    x = self.relu(x)
    x = self.hidden_layer(x)
    x = self.relu(x)
    x = self.sigmoid(x)
    output = self.output_layer(x)
    return output

  def configure_optimizers(self):
    params = self.parameters()
    optimizer = optim.Adam(params=params, lr = 0.01)
    return optimizer

  def training_step(self, batch, batch_idx):
    x, y = batch
    outputs = self(x) 
    loss = self.loss(outputs, y)
    train_accuracy = self.accuracy(outputs, y)
    self.log('train_accuracy', train_accuracy, prog_bar=True)
    self.log('train_loss', loss, prog_bar=True)
    return {"loss":loss, "train_accuracy":train_accuracy}
  
  def predict_step(self, batch, batch_idx, dataloader_idx=0):
        return self(batch)

In [34]:
model = Model()

trainer = pl.Trainer(max_epochs=1)

trainer.fit(model, train_dataloaders=train_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type           | Params
------------------------------------------------
0 | input_layer  | Linear         | 144   
1 | hidden_layer | Linear         | 136   
2 | output_layer | Linear         | 9     
3 | relu         | ReLU           | 0     
4 | sigmoid      | Sigmoid        | 0     
5 | loss         | MSELoss        | 0     
6 | accuracy     | BinaryAccuracy | 0     
------------------------------------------------
289       Trainable params
0         Non-trainable params
289       Total params
0.001     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


In [37]:
trainer.predict(model, test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

[tensor([[0.0904],
         [0.0959],
         [0.3386],
         [0.4407],
         [0.1251],
         [0.0930],
         [0.1291],
         [0.4554],
         [0.2309],
         [0.1607],
         [0.4605],
         [0.1966],
         [0.1910],
         [0.1058],
         [0.0984],
         [0.0902]]),
 tensor([[0.2288],
         [0.2718],
         [0.3787],
         [0.4148],
         [0.2241],
         [0.3980],
         [0.0876],
         [0.2288],
         [0.3303],
         [0.2288],
         [0.3463],
         [0.1562],
         [0.0903],
         [0.2793],
         [0.0959],
         [0.0922]]),
 tensor([[0.2384],
         [0.1431],
         [0.1051],
         [0.4705],
         [0.2288],
         [0.0884],
         [0.1110],
         [0.0931],
         [0.2054],
         [0.1697],
         [0.4479],
         [0.4543],
         [0.2571],
         [0.0896],
         [0.1593],
         [0.4433]]),
 tensor([[0.2288],
         [0.2109],
         [0.4679],
         [0.1053],
      