# LSTM-Ready Feature Engineering

Single-cell notebook. Edit `DATA_PATH` at the top to point to your CSV file (e.g., `data/working_copy.csv`).

In [1]:

# ---------------------------
# LSTM-ready feature engineering (single-cell)
# ---------------------------
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import joblib
from datetime import timedelta

# Optional: PyTorch dataset (only if you use PyTorch)
try:
    import torch
    from torch.utils.data import Dataset
    TORCH_AVAILABLE = True
except Exception:
    TORCH_AVAILABLE = False

# ---------------------------
# PARAMETERS (edit as needed)
# ---------------------------
DATA_PATH = "C:/Users/Asus/Documents/Developers_Arena_Internship_Files/Month_6_Internship_files/traffic_flow_project/traffic_flow_project/data/synthetic_traffic_dataset.csv"
TIMESTAMP_COL = "timestamp"
SENSOR_COL = "sensor_id"              # if multiple sensors present
TARGET_COL = "vehicle_count"

# windowing
LOOKBACK = 12        # number of historical timesteps used as input (e.g., 12 * 5min = 60min if 5-min freq)
HORIZON = 1          # how many steps ahead to predict (1 for next-step)
STEP = 1             # stride while sliding window
RESAMPLE_RULE = "5T" # resample to 5-minute intervals; change to "1H" etc. if your data differs

SCALER_PATH = "model/scaler.joblib"

# Ensure model dir exists
os.makedirs("model", exist_ok=True)

# ---------------------------
# 1) Load + basic preprocessing
# ---------------------------
print("Loading data from:", DATA_PATH)
df = pd.read_csv(DATA_PATH, parse_dates=[TIMESTAMP_COL])
df = df.sort_values([SENSOR_COL, TIMESTAMP_COL]).reset_index(drop=True)
print("Loaded:", df.shape)
display(df.head())

# If timestamps are not regular, resample per sensor to a regular frequency (recommended for LSTM)
def resample_sensor(sensor_df, rule=RESAMPLE_RULE, ts_col=TIMESTAMP_COL):
    sensor_df = sensor_df.set_index(ts_col).sort_index()
    # keep numerical columns and forward/backfill small gaps
    numeric_cols = sensor_df.select_dtypes(include="number").columns.tolist()
    res = sensor_df[numeric_cols].resample(rule).mean()
    # small gap filling: forward-fill then backward-fill (tune as needed)
    res = res.ffill().bfill()
    res = res.reset_index()
    return res

# Apply per sensor (if you have only one sensor, this is still fine)
sensor_groups = []
for s, g in df.groupby(SENSOR_COL):
    g_res = resample_sensor(g, rule=RESAMPLE_RULE)
    g_res[SENSOR_COL] = s
    sensor_groups.append(g_res)
df_reg = pd.concat(sensor_groups, ignore_index=True).sort_values([SENSOR_COL, TIMESTAMP_COL])
print("After resampling:", df_reg.shape)
display(df_reg.head())

# ---------------------------
# 2) Time features (cyclical) + basic features
# ---------------------------
def add_time_features(df, ts_col=TIMESTAMP_COL):
    df["hour"] = df[ts_col].dt.hour
    df["minute"] = df[ts_col].dt.minute
    df["dayofweek"] = df[ts_col].dt.dayofweek  # Monday=0
    # cyclical encoding
    df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
    df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
    df["dow_sin"] = np.sin(2 * np.pi * df["dayofweek"] / 7)
    df["dow_cos"] = np.cos(2 * np.pi * df["dayofweek"] / 7)
    return df

df_reg = add_time_features(df_reg)
display(df_reg.head())

# ---------------------------
# 3) Lag & rolling features
# ---------------------------
def create_lag_features(df, numeric_cols, lags=[1,2,3,4,6,12]):
    for lag in lags:
        for col in numeric_cols:
            df[f"{col}_lag{lag}"] = df.groupby(SENSOR_COL)[col].shift(lag)
    return df

def create_rolling_features(df, numeric_cols, windows=[3,6,12]):
    for w in windows:
        for col in numeric_cols:
            df[f"{col}_rmean_{w}"] = df.groupby(SENSOR_COL)[col].shift(1).rolling(w).mean().reset_index(level=0, drop=True)
            df[f"{col}_rstd_{w}"]  = df.groupby(SENSOR_COL)[col].shift(1).rolling(w).std().reset_index(level=0, drop=True)
    return df

numeric_cols = ["avg_speed", "occupancy", TARGET_COL]
df_reg = create_lag_features(df_reg, numeric_cols=numeric_cols, lags=[1,2,3,6,12])
df_reg = create_rolling_features(df_reg, numeric_cols=numeric_cols, windows=[3,6,12])

# ---------------------------
# 4) Drop rows with NaNs created by lagging
# ---------------------------
pre_drop = len(df_reg)
df_reg = df_reg.dropna().reset_index(drop=True)
post_drop = len(df_reg)
print(f"Dropped {pre_drop - post_drop} rows due to lag/rolling NaNs. Remaining: {post_drop}")

# ---------------------------
# 5) Feature selection & scaling
# ---------------------------
# Choose features for model input
feature_cols = [
    # time cyclical
    "hour_sin", "hour_cos", "dow_sin", "dow_cos",
    # numeric base features
    "avg_speed", "occupancy",
    # lag features
    "avg_speed_lag1","avg_speed_lag2","avg_speed_lag3",
    "occupancy_lag1","occupancy_lag2","occupancy_lag3",
    "vehicle_count_lag1","vehicle_count_lag2","vehicle_count_lag3",
    # rolling
    "vehicle_count_rmean_3", "vehicle_count_rstd_3"
]

# Ensure chosen columns exist (some may not if you changed lag windows)
feature_cols = [c for c in feature_cols if c in df_reg.columns]
print("Using feature cols:", feature_cols)

# Prepare target
df_reg["target"] = df_reg.groupby(SENSOR_COL)[TARGET_COL].shift(-HORIZON+1)  # for one-step ahead this is shift(0) but keeping pattern

# Drop any remaining NaNs
df_reg = df_reg.dropna(subset=feature_cols + ["target"]).reset_index(drop=True)
print("After final drop:", df_reg.shape)
display(df_reg.head())

# ---------------------------
# 6) Train/Val/Test split (time-based)
# ---------------------------
def train_val_test_split_time(df, test_days=7, val_days=7, ts_col=TIMESTAMP_COL):
    # Splits data by time (global) — good for time-series
    last_ts = df[ts_col].max()
    test_start = last_ts - pd.Timedelta(days=test_days)
    val_start = test_start - pd.Timedelta(days=val_days)
    train_df = df[df[ts_col] < val_start]
    val_df = df[(df[ts_col] >= val_start) & (df[ts_col] < test_start)]
    test_df = df[df[ts_col] >= test_start]
    return train_df, val_df, test_df

train_df, val_df, test_df = train_val_test_split_time(df_reg, test_days=7, val_days=7)
print("Train/Val/Test shapes:", train_df.shape, val_df.shape, test_df.shape)

# Fit scaler on train features only

# ---------------------------
# Robust Train/Val/Test split + Scaler (fallbacks)
# ---------------------------
from sklearn.preprocessing import StandardScaler
import joblib

# Parameters (you can keep or change)
TEST_DAYS = 7
VAL_DAYS = 7
SCALER_PATH = "model/scaler.joblib"

# Diagnostics
print("TOTAL rows:", len(df_reg))
print("Timestamp range:", df_reg[TIMESTAMP_COL].min(), "to", df_reg[TIMESTAMP_COL].max())
print("Unique sensors:", df_reg[SENSOR_COL].nunique())
print("Rows per sensor:\n", df_reg[SENSOR_COL].value_counts())

# 1) Try deterministic day-based split (original)
def split_by_days(df, test_days=TEST_DAYS, val_days=VAL_DAYS, ts_col=TIMESTAMP_COL):
    last_ts = df[ts_col].max()
    test_start = last_ts - pd.Timedelta(days=test_days)
    val_start = test_start - pd.Timedelta(days=val_days)
    train_df = df[df[ts_col] < val_start]
    val_df = df[(df[ts_col] >= val_start) & (df[ts_col] < test_start)]
    test_df = df[df[ts_col] >= test_start]
    return train_df, val_df, test_df

train_df, val_df, test_df = split_by_days(df_reg, TEST_DAYS, VAL_DAYS)
print("After day-split -> Train/Val/Test sizes:", len(train_df), len(val_df), len(test_df))

# 2) If TRAIN is empty (or too small), try a quantile time split fallback
if len(train_df) < 1:
    print("Day-split produced empty/too-small train set. Falling back to time-quantile split.")
    t1 = df_reg[TIMESTAMP_COL].quantile(0.70)  # train until 70% time
    t2 = df_reg[TIMESTAMP_COL].quantile(0.85)  # val until 85% time
    train_df = df_reg[df_reg[TIMESTAMP_COL] <= t1]
    val_df = df_reg[(df_reg[TIMESTAMP_COL] > t1) & (df_reg[TIMESTAMP_COL] <= t2)]
    test_df = df_reg[df_reg[TIMESTAMP_COL] > t2]
    print("After quantile-split -> Train/Val/Test sizes:", len(train_df), len(val_df), len(test_df))

# 3) If still small/empty, fallback to proportional index split (70/15/15)
if len(train_df) < 1 or len(val_df) < 1 or len(test_df) < 1:
    print("Quantile-split insufficient. Using proportional index-based split (70/15/15).")
    df_sorted = df_reg.sort_values(TIMESTAMP_COL).reset_index(drop=True)
    n = len(df_sorted)
    if n < 3:
        raise ValueError("Dataset too small for splitting. Need at least 3 rows.")
    i1 = int(n * 0.70)
    i2 = int(n * 0.85)
    train_df = df_sorted.iloc[:i1].copy()
    val_df = df_sorted.iloc[i1:i2].copy()
    test_df = df_sorted.iloc[i2:].copy()
    print("After proportional-split -> Train/Val/Test sizes:", len(train_df), len(val_df), len(test_df))

# Final safety check: ensure feature columns exist and we have samples
if len(train_df) == 0:
    raise ValueError("Training set is empty after all fallback splits. Check your timestamps and data size.")

# Make sure feature columns exist in data
present_features = [c for c in feature_cols if c in train_df.columns]
if len(present_features) == 0:
    raise ValueError(f"No feature columns present for scaling. Available cols in train: {train_df.columns.tolist()}")

print("Using feature columns for scaler:", present_features)

# Fit scaler on train features only
scaler = StandardScaler()
scaler.fit(train_df[present_features])
joblib.dump(scaler, SCALER_PATH)
print("Saved scaler to:", SCALER_PATH)

# Apply scaler to train/val/test for present features
for dname, d in [("train", train_df), ("val", val_df), ("test", test_df)]:
    if len(d) > 0:
        d[present_features] = scaler.transform(d[present_features])
        print(f"Scaled {dname} set: {len(d)} rows")
    else:
        print(f"{dname} set empty; skipping scaling.")

# ---------------------------
# 7) Sequence creation (sliding window)
# ---------------------------
def create_sequences_from_df(df, feature_columns, lookback=LOOKBACK, horizon=HORIZON, step=STEP):
    """
    Returns X: (n_samples, lookback, n_features), y: (n_samples, horizon)
    This function assumes df is sorted by [sensor, timestamp].
    We slide within each sensor independently to avoid mixing sensors in a window.
    """
    Xs, ys = [], []
    sensors = df[SENSOR_COL].unique()
    for s in sensors:
        s_df = df[df[SENSOR_COL]==s].sort_values(TIMESTAMP_COL)
        feature_vals = s_df[feature_columns].values
        target_vals = s_df["target"].values
        n = len(s_df)
        # sliding window
        for start in range(0, n - lookback - horizon + 1, step):
            end = start + lookback
            X = feature_vals[start:end]
            y = target_vals[end:end+horizon]  # shape (horizon,)
            Xs.append(X)
            ys.append(y)
    Xs = np.array(Xs)
    ys = np.array(ys)
    return Xs, ys

X_train, y_train = create_sequences_from_df(train_df, feature_cols, lookback=LOOKBACK, horizon=HORIZON, step=STEP)
X_val, y_val     = create_sequences_from_df(val_df, feature_cols, lookback=LOOKBACK, horizon=HORIZON, step=STEP)
X_test, y_test   = create_sequences_from_df(test_df, feature_cols, lookback=LOOKBACK, horizon=HORIZON, step=STEP)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("X_test shape:", X_test.shape)

# If horizon==1, reshape y to (n_samples,) for convenience
if y_train.ndim == 2 and y_train.shape[1] == 1:
    y_train = y_train.ravel()
    y_val = y_val.ravel()
    y_test = y_test.ravel()
    print("Reshaped targets to 1D arrays")

# Save arrays for training reuse
np.save("model/X_train.npy", X_train)
np.save("model/y_train.npy", y_train)
np.save("model/X_val.npy", X_val)
np.save("model/y_val.npy", y_val)
np.save("model/X_test.npy", X_test)
np.save("model/y_test.npy", y_test)
print("Saved numpy arrays to model/")

# ---------------------------
# 8) Example Keras-ready shapes and quick model (TensorFlow/Keras)
# ---------------------------
n_timesteps = X_train.shape[1] if X_train.size else LOOKBACK
n_features  = X_train.shape[2] if X_train.size else len(feature_cols)
print("Timesteps, features:", n_timesteps, n_features)

keras_example = """ 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

model = Sequential([
    LSTM(64, input_shape=(n_timesteps, n_features), return_sequences=False),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1)  # for horizon==1; for multi-step, Dense(horizon)
])
model.compile(optimizer=Adam(1e-3), loss='mse', metrics=['mae'])
model.summary()
# model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=64)
"""
print("Keras example (commented):", keras_example)

# ---------------------------
# 9) Optional: PyTorch Dataset wrapper
# ---------------------------
if TORCH_AVAILABLE:
    class TimeSeriesDataset(Dataset):
        def __init__(self, X, y, device=None):
            self.X = torch.tensor(X, dtype=torch.float32)
            self.y = torch.tensor(y, dtype=torch.float32)
            self.device = device
        def __len__(self):
            return len(self.X)
        def __getitem__(self, idx):
            return self.X[idx], self.y[idx]
    # Example usage:
    # train_ds = TimeSeriesDataset(X_train, y_train)
    # from torch.utils.data import DataLoader
    # train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
    print("PyTorch is available — TimeSeriesDataset class created.")
else:
    print("PyTorch not available in this environment; skip PyTorch dataset creation.")

# ---------------------------
# DONE
# ---------------------------
print("Feature engineering complete. Ready for LSTM training.")

Loading data from: C:/Users/Asus/Documents/Developers_Arena_Internship_Files/Month_6_Internship_files/traffic_flow_project/traffic_flow_project/data/synthetic_traffic_dataset.csv
Loaded: (6048, 5)


Unnamed: 0,timestamp,sensor_id,vehicle_count,avg_speed,occupancy
0,2023-01-01 00:00:00,1,2,62.91,0.02
1,2023-01-01 00:05:00,1,13,62.51,0.13
2,2023-01-01 00:10:00,1,11,59.9,0.11
3,2023-01-01 00:15:00,1,9,60.32,0.09
4,2023-01-01 00:20:00,1,6,60.81,0.06


After resampling: (6048, 5)


  res = sensor_df[numeric_cols].resample(rule).mean()
  res = sensor_df[numeric_cols].resample(rule).mean()
  res = sensor_df[numeric_cols].resample(rule).mean()


Unnamed: 0,timestamp,sensor_id,vehicle_count,avg_speed,occupancy
0,2023-01-01 00:00:00,1,2.0,62.91,0.02
1,2023-01-01 00:05:00,1,13.0,62.51,0.13
2,2023-01-01 00:10:00,1,11.0,59.9,0.11
3,2023-01-01 00:15:00,1,9.0,60.32,0.09
4,2023-01-01 00:20:00,1,6.0,60.81,0.06


Unnamed: 0,timestamp,sensor_id,vehicle_count,avg_speed,occupancy,hour,minute,dayofweek,hour_sin,hour_cos,dow_sin,dow_cos
0,2023-01-01 00:00:00,1,2.0,62.91,0.02,0,0,6,0.0,1.0,-0.781831,0.62349
1,2023-01-01 00:05:00,1,13.0,62.51,0.13,0,5,6,0.0,1.0,-0.781831,0.62349
2,2023-01-01 00:10:00,1,11.0,59.9,0.11,0,10,6,0.0,1.0,-0.781831,0.62349
3,2023-01-01 00:15:00,1,9.0,60.32,0.09,0,15,6,0.0,1.0,-0.781831,0.62349
4,2023-01-01 00:20:00,1,6.0,60.81,0.06,0,20,6,0.0,1.0,-0.781831,0.62349


Dropped 36 rows due to lag/rolling NaNs. Remaining: 6012
Using feature cols: ['hour_sin', 'hour_cos', 'dow_sin', 'dow_cos', 'avg_speed', 'occupancy', 'avg_speed_lag1', 'avg_speed_lag2', 'avg_speed_lag3', 'occupancy_lag1', 'occupancy_lag2', 'occupancy_lag3', 'vehicle_count_lag1', 'vehicle_count_lag2', 'vehicle_count_lag3', 'vehicle_count_rmean_3', 'vehicle_count_rstd_3']
After final drop: (6012, 46)


Unnamed: 0,timestamp,sensor_id,vehicle_count,avg_speed,occupancy,hour,minute,dayofweek,hour_sin,hour_cos,...,occupancy_rstd_6,vehicle_count_rmean_6,vehicle_count_rstd_6,avg_speed_rmean_12,avg_speed_rstd_12,occupancy_rmean_12,occupancy_rstd_12,vehicle_count_rmean_12,vehicle_count_rstd_12,target
0,2023-01-01 01:00:00,1,14.0,65.24,0.14,1,0,6,0.258819,0.965926,...,0.022286,9.166667,2.228602,61.99,1.704145,0.0875,0.030488,8.75,3.048845,14.0
1,2023-01-01 01:05:00,1,9.0,60.85,0.09,1,5,6,0.258819,0.965926,...,0.028577,10.166667,2.857738,62.184167,1.935527,0.0975,0.025628,9.75,2.562846,9.0
2,2023-01-01 01:10:00,1,14.0,65.18,0.14,1,10,6,0.258819,0.965926,...,0.027325,10.333333,2.73252,62.045833,1.969151,0.094167,0.023533,9.416667,2.35327,14.0
3,2023-01-01 01:15:00,1,7.0,62.22,0.07,1,15,6,0.258819,0.965926,...,0.028048,11.333333,2.804758,62.485833,2.034884,0.096667,0.026742,9.666667,2.674232,7.0
4,2023-01-01 01:20:00,1,5.0,61.25,0.05,1,20,6,0.258819,0.965926,...,0.028048,11.333333,2.804758,62.644167,1.921819,0.095,0.027798,9.5,2.779797,5.0


Train/Val/Test shapes: (0, 46) (0, 46) (6012, 46)
TOTAL rows: 6012
Timestamp range: 2023-01-01 01:00:00 to 2023-01-07 23:55:00
Unique sensors: 3
Rows per sensor:
 sensor_id
1    2004
2    2004
3    2004
Name: count, dtype: int64
After day-split -> Train/Val/Test sizes: 0 0 6012
Day-split produced empty/too-small train set. Falling back to time-quantile split.
After quantile-split -> Train/Val/Test sizes: 4209 903 900
Using feature columns for scaler: ['hour_sin', 'hour_cos', 'dow_sin', 'dow_cos', 'avg_speed', 'occupancy', 'avg_speed_lag1', 'avg_speed_lag2', 'avg_speed_lag3', 'occupancy_lag1', 'occupancy_lag2', 'occupancy_lag3', 'vehicle_count_lag1', 'vehicle_count_lag2', 'vehicle_count_lag3', 'vehicle_count_rmean_3', 'vehicle_count_rstd_3']
Saved scaler to: model/scaler.joblib
Scaled train set: 4209 rows
Scaled val set: 903 rows
Scaled test set: 900 rows
X_train shape: (4173, 12, 17)
y_train shape: (4173, 1)
X_val shape: (867, 12, 17)
X_test shape: (864, 12, 17)
Reshaped targets to 1D 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d[present_features] = scaler.transform(d[present_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d[present_features] = scaler.transform(d[present_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d[present_features] = scaler.transform(d[present_features])


# Modeling

Feature engineering and training example (RandomForest + LSTM outline).

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


df = pd.read_csv(DATA_PATH, parse_dates=['timestamp'])
df['hour'] = df['timestamp'].dt.hour
X = df[['sensor_id','hour','avg_speed','occupancy']].fillna(0)
y = df['vehicle_count']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model = RandomForestRegressor(n_estimators=50, random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print('MAE:', mean_absolute_error(y_test, pred))
# Save model
import joblib
import os
os.makedirs('model', exist_ok=True)
joblib.dump(model, 'model/joblib_model.joblib')
print('Saved model to model/joblib_model.joblib')

MAE: 0.0022975206611570305
Saved model to model/joblib_model.joblib
