In [1]:
# ============================================================
# FULL MULTIVARIATE REGRESSION PREPROCESSING PIPELINE
# Load → Parse Dates → Merge → Time Features → Impute Missing →
# Remove Outliers → Encode → Scale → Save Output Dataset + Scaler
# ============================================================

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
import joblib

# ============================================================
# 1. Load Raw Data
# ============================================================

# Read the four raw CSV inputs (paths are relative to the working directory).
train, test, weather, kiwo = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "wetter.csv", "kiwo.csv")
)

# Print each frame's (rows, columns) as a quick sanity check.
print("Loaded datasets:")
for label, frame in (
    ("Train:", train),
    ("Test:", test),
    ("Weather:", weather),
    ("KiWo:", kiwo),
):
    print(label, frame.shape)

# ============================================================
# 2. Fix Date Formats
# ============================================================

# Convert the "Datum" column of every frame to datetime so the frames share
# one key type for merging and the .dt accessor is available afterwards.
for frame in (train, test, weather, kiwo):
    frame["Datum"] = pd.to_datetime(frame["Datum"])

# ============================================================
# 3. Merge Datasets for EDA + Regression Preparation
# ============================================================

# Attach the weather and Kieler Woche columns to each sales row by date.
# how="left" keeps every training row even when a date has no match in the
# lookup table; the NaNs this produces are filled in the missing-value step.
# validate="many_to_one" makes pandas raise MergeError if a lookup table holds
# the same date twice, which would otherwise silently duplicate training rows.
df = train.merge(weather, on="Datum", how="left", validate="many_to_one")
df = df.merge(kiwo, on="Datum", how="left", validate="many_to_one")

print("Merged Shape:", df.shape)

# ============================================================
# 4. Basic Feature Engineering (Time Features)
# ============================================================

# Derive calendar features from the parsed date column.
df = df.assign(
    Weekday=df["Datum"].dt.dayofweek,  # 0 = Monday
    Month=df["Datum"].dt.month,
    Year=df["Datum"].dt.year,
)

# ============================================================
# 5. Missing Value Handling
# ============================================================

# Rows without a target value cannot be used, so they are discarded first.
df = df.dropna(subset=["Umsatz"])

# Replacement value per column:
#   - numeric weather measurements -> column median
#   - Wettercode (treated as categorical) -> most frequent value
#   - KielerWoche -> 0 (no festival)
num_cols = ["Temperatur", "Bewoelkung", "Windgeschwindigkeit"]
fill_values = {name: df[name].median() for name in num_cols}
fill_values["Wettercode"] = df["Wettercode"].mode()[0]
fill_values["KielerWoche"] = 0
df = df.fillna(value=fill_values)

print("Missing values after cleaning:")
print(df.isna().sum())

# ============================================================
# 6. Outlier Removal (Z-score filtering)
# ============================================================

# Keep only the rows whose absolute z-score is below 3 in all three columns.
# Each column's mean and standard deviation are taken over the whole frame.
# NOTE(review): "Umsatz" is scored across all Warengruppe values together, so
# this may discard genuine peak-sales days of high-volume groups — confirm
# that this is intended.
zscores = stats.zscore(df[["Umsatz", "Temperatur", "Windgeschwindigkeit"]])
within_limits = (np.abs(zscores) < 3).all(axis=1)
df = df[within_limits]

print("Shape after outlier removal:", df.shape)

# ============================================================
# 7. Drop Non-Predictive Columns
# ============================================================

# Remove the row identifier, the raw date and the year; the script treats
# these as non-predictive. The result is a new frame, `df` itself is unchanged.
df_model = df.drop(["id", "Datum", "Year"], axis="columns")

# ============================================================
# 8. Categorical Encoding
# ============================================================

# One-hot encode "Warengruppe". drop_first=True omits the indicator of the
# first category, giving k-1 indicator columns for k categories.
df_model = pd.get_dummies(
    data=df_model,
    columns=["Warengruppe"],
    drop_first=True,
)

# ============================================================
# 9. Feature Scaling
# ============================================================

# Standardise the three weather measurements. The scaler is fitted on every
# row of df_model and stays bound to `scaler` so it can be saved later on.
scale_cols = ["Temperatur", "Bewoelkung", "Windgeschwindigkeit"]
scaler = StandardScaler()
scaler.fit(df_model[scale_cols])
df_model[scale_cols] = scaler.transform(df_model[scale_cols])

# ============================================================
# 10. Create Output Folder
# ============================================================

# Folder (relative to the working directory) that receives the saved artifacts.
output_dir = "preprocessed_dataset"

# exist_ok=True lets the script be re-run when the folder is already there.
os.makedirs(name=output_dir, exist_ok=True)

# ============================================================
# 11. Save Preprocessed Dataset
# ============================================================

# Write the encoded and scaled frame as CSV; index=False leaves the pandas row
# index out of the file.
output_file = os.path.join(output_dir, "multivariate_regression_data.csv")
df_model.to_csv(path_or_buf=output_file, index=False)

print("=================================================")
print("Preprocessed dataset saved to:", output_file)

# ============================================================
# 12. Save Scaler for Future Modeling
# ============================================================

# Persist the fitted scaler next to the dataset so that the identical
# transformation can be re-applied to other data later.
scaler_path = os.path.join(output_dir, "feature_scaler.pkl")
joblib.dump(value=scaler, filename=scaler_path)

print("Feature scaler saved to:", scaler_path)
print("=================================================")

# ============================================================
# END OF SCRIPT
# ============================================================


Loaded datasets:
Train: (9334, 4)
Test: (1830, 3)
Weather: (2601, 5)
KiWo: (72, 2)
Merged Shape: (9334, 9)
Missing values after cleaning:
id                     0
Datum                  0
Warengruppe            0
Umsatz                 0
Bewoelkung             0
Temperatur             0
Windgeschwindigkeit    0
Wettercode             0
KielerWoche            0
Weekday                0
Month                  0
Year                   0
dtype: int64
Shape after outlier removal: (9113, 12)
Preprocessed dataset saved to: preprocessed_dataset/multivariate_regression_data.csv
Feature scaler saved to: preprocessed_dataset/feature_scaler.pkl
