In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import neural_network_library as nnl

dataset = np.load('nyc_taxi_data.npy', allow_pickle=True).item()
X_train, y_train, X_test, y_test = dataset['X_train'], dataset['y_train'], dataset['X_test'], dataset['y_test']

## Inspecting Data and Converting to Pandas DataFrame

In [2]:
print(X_train.keys())

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag'],
      dtype='object')


In [3]:
print(X_train.head())

                id  vendor_id      pickup_datetime     dropoff_datetime  \
879655   id2425795          1  2016-01-08 23:55:11  2016-01-09 00:04:32   
646838   id0767831          2  2016-03-05 09:52:06  2016-03-05 10:00:12   
1138713  id0449104          1  2016-04-09 16:03:53  2016-04-09 16:21:22   
864716   id3030157          1  2016-01-06 11:12:44  2016-01-06 11:19:49   
434927   id1584885          1  2016-06-26 09:10:56  2016-06-26 09:17:44   

         passenger_count  pickup_longitude  pickup_latitude  \
879655                 1        -73.955551        40.773346   
646838                 1        -73.962181        40.763599   
1138713                1        -73.977486        40.751842   
864716                 1        -73.970001        40.762363   
434927                 1        -73.950348        40.771561   

         dropoff_longitude  dropoff_latitude store_and_fwd_flag  
879655          -73.973640         40.763500                  N  
646838          -73.980377         40.

In [4]:
print(X_train.describe())

          vendor_id  passenger_count  pickup_longitude  pickup_latitude  \
count  1.312779e+06     1.312779e+06      1.312779e+06     1.312779e+06   
mean   1.534878e+00     1.664126e+00     -7.397350e+01     4.075093e+01   
std    4.987823e-01     1.313950e+00      7.351224e-02     3.291198e-02   
min    1.000000e+00     0.000000e+00     -1.219333e+02     3.435970e+01   
25%    1.000000e+00     1.000000e+00     -7.399187e+01     4.073735e+01   
50%    2.000000e+00     1.000000e+00     -7.398174e+01     4.075410e+01   
75%    2.000000e+00     2.000000e+00     -7.396734e+01     4.076835e+01   
max    2.000000e+00     9.000000e+00     -6.133553e+01     5.188108e+01   

       dropoff_longitude  dropoff_latitude  
count       1.312779e+06      1.312779e+06  
mean       -7.397342e+01      4.075181e+01  
std         7.316118e-02      3.579324e-02  
min        -1.219333e+02      3.218114e+01  
25%        -7.399133e+01      4.073589e+01  
50%        -7.397975e+01      4.075453e+01  
75%      

In [5]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1312779 entries, 879655 to 121958
Data columns (total 10 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   id                  1312779 non-null  object 
 1   vendor_id           1312779 non-null  int64  
 2   pickup_datetime     1312779 non-null  object 
 3   dropoff_datetime    1312779 non-null  object 
 4   passenger_count     1312779 non-null  int64  
 5   pickup_longitude    1312779 non-null  float64
 6   pickup_latitude     1312779 non-null  float64
 7   dropoff_longitude   1312779 non-null  float64
 8   dropoff_latitude    1312779 non-null  float64
 9   store_and_fwd_flag  1312779 non-null  object 
dtypes: float64(4), int64(2), object(4)
memory usage: 110.2+ MB


## Handle Missing Values

In [6]:
print('Missing values in training feature:\n', X_train.isnull().sum())

Missing values in training feature:
 id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
dtype: int64


## Feature Engineering

In [7]:
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in km
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    a = np.sin(dlat/2)**2 + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return R * c  # Distance in km

In [8]:
X_train['distance_km'] = haversine(X_train['pickup_latitude'], X_train['pickup_longitude'], X_train['dropoff_latitude'], X_train['dropoff_longitude'])
X_test['distance_km'] = haversine(X_test['pickup_latitude'], X_test['pickup_longitude'], X_test['dropoff_latitude'], X_test['dropoff_longitude'])

In [9]:
X_train['pickup_datetime'] = pd.to_datetime(X_train['pickup_datetime'])
X_train['pickup_hour'] = X_train['pickup_datetime'].dt.hour
X_train['pickup_dow'] = X_train['pickup_datetime'].dt.dayofweek
X_train['pickup_month'] = X_train['pickup_datetime'].dt.month


X_train["pickup_hour_sin"] = np.sin(2 * np.pi * X_train["pickup_hour"] / 24)
X_train["pickup_hour_cos"] = np.cos(2 * np.pi * X_train["pickup_hour"] / 24)
X_train["pickup_dow_sin"] = np.sin(2 * np.pi * X_train["pickup_dow"] / 7)
X_train["pickup_dow_cos"] = np.cos(2 * np.pi * X_train["pickup_dow"] / 7)
X_train['is_weekend'] = X_train['pickup_dow'].isin([5, 6]).astype(int)

X_test['pickup_datetime'] = pd.to_datetime(X_test['pickup_datetime'])
X_test['pickup_hour'] = X_test['pickup_datetime'].dt.hour
X_test['pickup_dow'] = X_test['pickup_datetime'].dt.dayofweek
X_test['pickup_month'] = X_test['pickup_datetime'].dt.month

X_test["pickup_hour_sin"] = np.sin(2 * np.pi * X_test["pickup_hour"] / 24)
X_test["pickup_hour_cos"] = np.cos(2 * np.pi * X_test["pickup_hour"] / 24)
X_test["pickup_dow_sin"] = np.sin(2 * np.pi * X_test["pickup_dow"] / 7)
X_test["pickup_dow_cos"] = np.cos(2 * np.pi * X_test["pickup_dow"] / 7)
X_test['is_weekend'] = X_test['pickup_dow'].isin([5, 6]).astype(int)


In [10]:
X_train = X_train.drop(['id', 'dropoff_datetime', 'pickup_datetime'], axis=1)
X_test = X_test.drop(['id', 'dropoff_datetime', 'pickup_datetime'], axis=1)

In [11]:
print(X_train['vendor_id'])

879655     1
646838     2
1138713    1
864716     1
434927     1
          ..
259178     2
1414414    1
131932     2
671155     1
121958     2
Name: vendor_id, Length: 1312779, dtype: int64


In [12]:
X_train["vendor_id"] = X_train["vendor_id"].astype(int)
X_train["store_and_fwd_flag"] = (X_train["store_and_fwd_flag"] == "Y").astype(int)

X_test["vendor_id"] = X_test["vendor_id"].astype(int)
X_test["store_and_fwd_flag"] = (X_test["store_and_fwd_flag"] == "Y").astype(int)

In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
num_features = ['passenger_count', 'distance_km', 'pickup_month']

X_train[num_features] = scaler.fit_transform(X_train[num_features])

X_test[num_features] = scaler.transform(X_test[num_features])

In [14]:
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

In [15]:
final_features = [
    "vendor_id",
    "passenger_count",
    "store_and_fwd_flag",
    "distance_km",
    "pickup_hour_sin",
    "pickup_hour_cos",
    "pickup_dow_sin",
    "pickup_dow_cos",
    "is_weekend",
    "pickup_month",
    "pickup_latitude",
    "pickup_longitude",
]

X_train_final = X_train[final_features]
X_test_final = X_test[final_features]

In [16]:
from sklearn.model_selection import train_test_split

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_final, y_train_log, test_size=0.2, random_state=42
)

print("Training set shape:", X_train_split.shape)
print("Validation set shape:", X_val_split.shape)
print("Test set shape:", X_test_final.shape)

Training set shape: (1050223, 12)
Validation set shape: (262556, 12)
Test set shape: (145865, 12)


In [17]:
train_losses = []
val_losses = []

best_val_loss = np.inf
patience_counter = 0

model = nnl.Sequential(
    layers = [
    nnl.Linear(12, 64),
    nnl.ReLU(),
    nnl.Linear(64, 64),
    nnl.ReLU(),
    nnl.Linear(64, 1),
    nnl.Sigmoid()]
)

loss = nnl.MseLoss()
learning_rate = 0.01
n_epochs = 10
batch_size = 64
indices = np.arange(X_train_split.shape[0])

train_losses = []
val_losses = []

best_val_loss = np.inf
patience_counter = 0
patience = 3

num_samples = X_train_split.shape[0]
indices = np.arange(num_samples)

for epoch in range(n_epochs):
    # Shuffle training data each epoch
    np.random.shuffle(indices)

    # Mini-batch loop
    batch_losses = []
    for start in range(0, num_samples, batch_size):
        end = start + batch_size
        batch_idx = indices[start:end]
        X_batch = X_train_split.iloc[batch_idx]
        y_batch = y_train_split.iloc[batch_idx]

        y_pred = model.forward(X_batch)
        loss_value = loss.forward(y_pred, y_batch)
        batch_losses.append(loss_value)

        grad_loss = loss.backward()
        model.backward(grad_loss)

        for layer in model.layers:
            if isinstance(layer, nnl.Linear):
                layer.weights -= learning_rate * layer.grad_weights
                layer.bias -= learning_rate * layer.grad_bias

    train_loss_epoch = np.mean(batch_losses)
    train_losses.append(train_loss_epoch)

    y_val_split = y_val_split
    y_val_pred = model.forward(X_val_split)
    val_loss = loss.forward(y_val_pred, y_val_split)
    val_losses.append(val_loss)

    print(f'epcoh: {epoch}, train_loss: {train_loss_epoch}, val_loss: {val_loss}')

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print('Early stopping')
            exit()

AttributeError: 'Series' object has no attribute 'reshape'

In [None]:
print('Training finished')
print('Best validation loss:', best_val_loss)


In [None]:
print('losses graph')
plt.plot(train_losses, label='train loss')
plt.plot(val_losses, label='validation loss')
plt.legend()
