## Dependencies

In [93]:
# !pip install opendatasets

In [94]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from opendatasets import download

## Dataset

In [95]:
# download('https://www.kaggle.com/datasets/viveksharmar/flight-price-data', force=True)

In [96]:
df = pd.read_csv('flight-price-data/flight_dataset.csv')

## Data Engineering

In [97]:
#Date Change
df['Date'] = pd.to_datetime(df['Date'])
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
df['Day'] = df['Date'].dt.day
df = df.drop(['Date'], axis=1)
df.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,Month,Year,Dep_hours,Dep_min,Arrival_hours,Arrival_min,Duration_hours,Duration_min,Day
0,IndiGo,Banglore,New Delhi,0,3897,1,1970,22,20,1,10,2,50,1
1,Air India,Kolkata,Banglore,2,7662,1,1970,5,50,13,15,7,25,1
2,Jet Airways,Delhi,Cochin,2,13882,1,1970,9,25,4,25,19,0,1
3,IndiGo,Kolkata,Banglore,1,6218,1,1970,18,5,23,30,5,25,1
4,IndiGo,Banglore,New Delhi,1,13302,1,1970,16,50,21,35,4,45,1


In [98]:
#Time Conversion
department_min = df['Dep_hours'] * 60 + df["Dep_min"]
arrival_min = df['Arrival_hours'] * 60 + df["Arrival_min"]
duration_min = df['Duration_hours'] * 60 + df["Duration_min"]  
print(department_min[0], arrival_min[0])
df = df.drop(['Arrival_hours', 'Dep_hours', 'Duration_hours'], axis=1)
df['Dep_min'] = department_min
df['Arrival_min'] = arrival_min
df['Duration_min'] = duration_min
df.head()

1340 70


Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,Month,Year,Dep_min,Arrival_min,Duration_min,Day
0,IndiGo,Banglore,New Delhi,0,3897,1,1970,1340,70,170,1
1,Air India,Kolkata,Banglore,2,7662,1,1970,350,795,445,1
2,Jet Airways,Delhi,Cochin,2,13882,1,1970,565,265,1140,1
3,IndiGo,Kolkata,Banglore,1,6218,1,1970,1085,1410,325,1
4,IndiGo,Banglore,New Delhi,1,13302,1,1970,1010,1295,285,1


In [99]:
def cyclical_encoding(df, col):
    df[f'{col}_sin'] = np.sin(2 * np.pi * df[col] / 1440)
    df[f'{col}_cos'] = np.cos(2 * np.pi * df[col] / 1440)

cyclical_encoding(df, 'Dep_min')
cyclical_encoding(df, 'Arrival_min')
df = df.drop(['Arrival_min', 'Dep_min', 'Duration_min'], axis=1)
df.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,Month,Year,Day,Dep_min_sin,Dep_min_cos,Arrival_min_sin,Arrival_min_cos
0,IndiGo,Banglore,New Delhi,0,3897,1,1970,1,-0.422618,0.906308,0.300706,0.953717
1,Air India,Kolkata,Banglore,2,7662,1,1970,1,0.999048,0.043619,-0.321439,-0.94693
2,Jet Airways,Delhi,Cochin,2,13882,1,1970,1,0.625923,-0.779884,0.915311,0.402747
3,IndiGo,Kolkata,Banglore,1,6218,1,1970,1,-0.999762,0.021815,-0.130526,0.991445
4,IndiGo,Banglore,New Delhi,1,13302,1,1970,1,-0.953717,-0.300706,-0.59131,0.806445


In [100]:
#Encode categorical vars
le = LabelEncoder()
df['Airline'] = le.fit_transform(df['Airline'])
df['Source'] = le.fit_transform(df['Source'])
df['Destination'] = le.fit_transform(df['Destination'])
df.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,Month,Year,Day,Dep_min_sin,Dep_min_cos,Arrival_min_sin,Arrival_min_cos
0,3,0,5,0,3897,1,1970,1,-0.422618,0.906308,0.300706,0.953717
1,1,3,0,2,7662,1,1970,1,0.999048,0.043619,-0.321439,-0.94693
2,4,2,1,2,13882,1,1970,1,0.625923,-0.779884,0.915311,0.402747
3,3,3,0,1,6218,1,1970,1,-0.999762,0.021815,-0.130526,0.991445
4,3,0,5,1,13302,1,1970,1,-0.953717,-0.300706,-0.59131,0.806445


## Train and Test

In [101]:
# Data and label split
X = df.drop('Price', axis=1)
y = df['Price']

In [102]:
#Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Month,Year,Day,Dep_min_sin,Dep_min_cos,Arrival_min_sin,Arrival_min_cos
8990,4,4,3,2,1,1970,1,0.991445,-0.130526,-0.932008,-0.362438
3684,4,2,1,1,1,1970,1,0.130526,-0.991445,-0.152123,-0.988362
1034,8,2,1,1,1,1970,1,-0.83147,-0.55557,-0.480989,0.876727
3909,6,2,1,1,1,1970,1,-0.21644,-0.976296,0.402747,0.915311
3088,1,2,1,2,1,1970,1,-0.980785,-0.19509,-0.94693,0.321439


In [103]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled[0]

array([ 0.01381155,  1.73659143,  1.05085563,  1.74432762,  0.        ,
        0.        ,  0.        ,  1.28399473,  0.11490272, -1.13172602,
       -0.56216383])

## Dataset and Dataloader Setup

In [104]:
class FlightDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.FloatTensor(X)
        self.y = torch.FloatTensor(y.values).reshape(-1, 1)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [105]:
train_dataset = FlightDataset(X_train_scaled, y_train)
test_dataset = FlightDataset(X_test_scaled, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

## NN model

In [106]:
class FlightPriceModel(nn.Module):
    def __init__(self, input_dim):
        super(FlightPriceModel, self).__init__()
        self.layer1 = nn.Linear(input_dim, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.layer2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.layer3 = nn.Linear(64, 32)
        self.bn3 = nn.BatchNorm1d(32)
        self.layer4 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.bn1(self.layer1(x)))
        x = self.relu(self.bn2(self.layer2(x)))
        x = self.relu(self.bn3(self.layer3(x)))
        x = self.layer4(x)
        return x

model = FlightPriceModel(X_train_scaled.shape[1])

## Training

In [107]:
import tqdm

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=True)

num_epochs = 300
best_val_loss = float('inf')
patience = 20
no_improve_epochs = 0

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    train_loss /= len(train_loader)
    
    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            outputs = model(batch_X)
            val_loss += criterion(outputs, batch_y).item()
    
    val_loss /= len(test_loader)
    
    # Learning rate scheduling
    scheduler.step(val_loss)
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    
    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        no_improve_epochs = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        no_improve_epochs += 1
    
    if no_improve_epochs >= patience:
        print(f'Early stopping triggered after {epoch + 1} epochs')
        break

# Load the best model
model.load_state_dict(torch.load('best_model.pth'))



Epoch [10/300], Train Loss: 100201237.4925, Val Loss: 100511425.8824
Epoch [20/300], Train Loss: 91556536.1791, Val Loss: 92211380.9412
Epoch [30/300], Train Loss: 80130229.1642, Val Loss: 80116008.0000
Epoch [40/300], Train Loss: 67171647.9701, Val Loss: 66971320.4706
Epoch [50/300], Train Loss: 53959685.8806, Val Loss: 54261596.4706
Epoch [60/300], Train Loss: 41425590.9104, Val Loss: 40024311.6471
Epoch [70/300], Train Loss: 30524389.0149, Val Loss: 30523015.4706
Epoch [80/300], Train Loss: 21610120.8731, Val Loss: 22496355.4118
Epoch [90/300], Train Loss: 15707778.4701, Val Loss: 16307275.6471
Epoch [100/300], Train Loss: 12075474.5410, Val Loss: 12716449.4118
Epoch [110/300], Train Loss: 11086536.8993, Val Loss: 11580123.5735
Epoch [120/300], Train Loss: 8852139.4235, Val Loss: 9436943.5588
Epoch [130/300], Train Loss: 7589912.2948, Val Loss: 8497903.0294
Epoch [140/300], Train Loss: 7771325.7463, Val Loss: 8558014.2574
Epoch [150/300], Train Loss: 7438318.9160, Val Loss: 8381787.

<All keys matched successfully>

## Evaluation of Model

In [108]:
# Make predictions
model.eval()
with torch.no_grad():
    y_pred = model(torch.FloatTensor(X_test_scaled))
    test_loss = criterion(y_pred, torch.FloatTensor(y_test.values).reshape(-1, 1))
    print(f'Test Loss: {test_loss.item():.4f}')

Test Loss: 8314479.5000


## Sample Predictions

In [109]:
sample_input = torch.FloatTensor(X_test_scaled[:5])
with torch.no_grad():
    sample_predictions = model(sample_input)
    print("Sample predictions:")
    print(sample_predictions.numpy())
    print("Actual prices:")
    print(y_test[:5].values)

Sample predictions:
[[13235.783]
 [ 7065.712]
 [11937.446]
 [ 4268.538]
 [11437.169]]
Actual prices:
[10844  4959 14781  3858 12898]
