## Step 1: 

In [50]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

file_path = "nytaxi2022.csv"
output_path = "final_normalized.npy"

num_lines = sum(1 for _ in open(file_path, mode="rb")) - 1
target_cols = ["passenger_count","trip_distance","RatecodeID",
    "PULocationID","DOLocationID","payment_type","extra"]   # numeric cols to normalize
start_col = "tpep_pickup_datetime"
end_col = "tpep_dropoff_datetime"
TARGET = "total_amount"
remove_outliers_cols = ["trip_distance", "duration"]
chunksize = 100000

# ----------------------------------------------------------
# HELPER: Outlier filter
# ----------------------------------------------------------
def remove_outliers(df):
    return df[(df["total_amount"] > 0) &
        (df["total_amount"] < 300) &
        (df["trip_distance"] > 0) &
        (df["duration"] > 0) &
        (df["duration"] <= 180)]

# ---------- PASS 1: compute mean & std for target + duration ----------
all_numeric = target_cols + ["duration"]

count = 0
mean = np.zeros(len(all_numeric))
M2 = np.zeros(len(all_numeric))

for chunk in tqdm(pd.read_csv(
    file_path, 
    chunksize=chunksize, 
    usecols=target_cols + [start_col, end_col, TARGET], 
    parse_dates=[start_col, end_col],
    date_format="%m/%d/%Y %I:%M:%S %p",
), total=np.ceil(num_lines/chunksize), desc = "Computing mean and variance"):
    # duration in hours
    chunk["duration"] = (chunk[end_col] - chunk[start_col]).dt.total_seconds() / 3600.0
    
    # drop NaNs + outliers
    data = chunk.dropna(subset=all_numeric)
    data = remove_outliers(data)
    if data.empty:
        continue
    
    values = data[all_numeric].to_numpy()
    for row in values:
        count += 1
        delta = row - mean
        mean += delta / count
        delta2 = row - mean
        M2 += delta * delta2
    

variance = M2 / (count - 1)
std = np.sqrt(variance)

print("Valid rows after outlier removal:", count)
print("Mean:", mean)
print("Std:", std)

# ---------- PASS 2: normalize + cyclic + save ----------
n_features = len(all_numeric) + 6 # 6 cyclic features
shape = (count, n_features + 1) # +1 target value
fp = np.memmap(output_path, dtype="float32", mode="w+", shape=shape)

row_start = 0
for chunk in tqdm(pd.read_csv(
    file_path, 
    chunksize=chunksize, 
    usecols=target_cols + [start_col, end_col, TARGET], 
    parse_dates=[start_col, end_col],
    date_format="%m/%d/%Y %I:%M:%S %p",
), total=np.ceil(num_lines/chunksize), desc = f"Preprocessing and saving to {output_path}"):
    chunk["duration"] = (chunk[end_col] - chunk[start_col]).dt.total_seconds() / 3600.0
    
    # drop NaNs + outliers
    data = chunk.dropna(subset=all_numeric)
    data = remove_outliers(data)
    if data.empty:
        continue
    
    # cyclic features from start_time
    dt = data[start_col]
    dow, month, hour = dt.dt.dayofweek, dt.dt.month, dt.dt.hour

    cyclic_features = np.vstack([
        np.sin(2*np.pi*dow/7),  np.cos(2*np.pi*dow/7),
        np.sin(2*np.pi*(month-1)/12), np.cos(2*np.pi*(month-1)/12),
        np.sin(2*np.pi*hour/24), np.cos(2*np.pi*hour/24)
    ]).T
    
    # normalize numeric cols
    normed = ((data[all_numeric].to_numpy() - mean) / std).astype("float32") 
    
    # stack
    arr = np.hstack([normed, 
                     cyclic_features.astype("float32"), 
                     data[TARGET].values.reshape((-1,1))])
    
    n = len(arr)
    fp[row_start:row_start+n, :] = arr
    row_start += n

fp.flush()
print(shape)

Computing mean and variance:   0%|          | 0/397.0 [00:00<?, ?it/s]

Valid rows after outlier removal: 37560523
Mean: [  1.40264048   3.55686178   1.35107429 165.01859841 162.83035814
   1.21437838   1.05755672   0.28915475]
Std: [ 0.965228   53.85523056  5.30225853 64.84880916 70.09427412  0.43384368
  1.26935216  0.81894926]


Preprocessing and saving to final_normalized.npy:   0%|          | 0/397.0 [00:00<?, ?it/s]

(37560523, 15)


In [28]:
my_memmap_array = np.memmap("final_normalized.npy", dtype="float32", mode='r')
new = my_memmap_array.reshape((-1,13))
new.shape

(21730890, 13)