In [83]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier

#test_url  = "https://raw.githubusercontent.com/ck2911/ML-Mini-Project/main/test.csv"
#test  = pd.read_csv(test_url, sep=";", low_memory=False)

In [84]:
# Load the training set: a semicolon-delimited CSV fetched over HTTP.
train_url = "https://raw.githubusercontent.com/ck2911/ML-Mini-Project/main/train.csv"
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns.
df = pd.read_csv(train_url, delimiter=";", low_memory=False)

# Feature matrix: every column except the label and the "customernumber" column.
X = df.drop(labels=["target90", "customernumber"], axis="columns")
# Label vector: the "target90" column cast to plain integers.
y = df["target90"].astype(int)

In [68]:
# Feature engineering on X: derive date, order-quality, product-mix, weight and
# advertising features, then drop the raw columns they were built from.
# NOTE(review): the last statement removes the raw date columns from X, so
# running this cell a second time without re-running the loading cell fails
# on the missing columns.

# Rebind X to a copy so the column assignments below write to a fresh frame.
X = X.copy()

# ---------------------------------------------------------------
# Date features
# ---------------------------------------------------------------

date_cols = ["date", "datecreated", "deliverydatepromised", "deliverydatereal"]

# Parse the raw date columns; anything that cannot be parsed becomes NaT.
for c in date_cols:
    X[c] = pd.to_datetime(X[c], errors="coerce")

# Whole days between account creation ("datecreated") and "date".
X["account_age_days"] = X["date"].sub(X["datecreated"]).dt.days

# Whole days between promised and actual delivery (negative = delivered early).
X["delivery_delay_days"] = (
    X["deliverydatereal"].sub(X["deliverydatepromised"]).dt.days
)

# 1 when the delivery arrived after the promised date, else 0.
# Rows whose delay is missing compare as False and therefore also get 0.
X["delivery_late"] = X["delivery_delay_days"].gt(0).astype(int)

# ---------------------------------------------------------------
# Order quality
# ---------------------------------------------------------------

# Item count with zeros turned into NaN, so the per-item ratios below come
# out as NaN rather than dividing by zero. "numberitems" is not modified in
# this cell, so one shared denominator serves all three ratios.
items_or_nan = X["numberitems"].replace(0, np.nan)

# Items left after subtracting cancelled ("cancel") and returned ("remi") ones.
X["items_effective"] = X["numberitems"].sub(X["cancel"].add(X["remi"]))

# Fraction of ordered items that were cancelled.
X["cancel_ratio"] = X["cancel"].div(items_or_nan)

# Fraction of ordered items counted in "remi".
X["return_ratio"] = X["remi"].div(items_or_nan)

# ---------------------------------------------------------------
# Product mix
# ---------------------------------------------------------------

# Columns w0-w10 hold the per-category product columns.
w_cols = [f"w{i}" for i in range(11)]

# How many of the w-columns are positive for each row.
X["num_categories"] = X[w_cols].gt(0).sum(axis=1)

# 1 when w3 + w5 is positive; the original author's note describes these two
# columns as ebooks/downloads -- confirm against the data dictionary.
X["has_download"] = X["w3"].add(X["w5"]).gt(0).astype(int)

# ---------------------------------------------------------------
# Weight
# ---------------------------------------------------------------

# Shipment weight divided by the number of items (NaN when there are none).
X["avg_item_weight"] = X["weight"].div(items_or_nan)

# ---------------------------------------------------------------
# Advertising
# ---------------------------------------------------------------

# 1 when an advertising code is present (non-null), else 0.
X["has_advertising"] = (~X["advertisingdatacode"].isna()).astype(int)

# ---------------------------------------------------------------
# Cleanup
# ---------------------------------------------------------------

# The raw date columns and the raw advertising code are dropped now that the
# derived features exist.
X = X.drop(columns=[*date_cols, "advertisingdatacode"])

In [69]:
# Spot-check the engineered account-age feature; pandas abbreviates the
# printed Series to its first and last rows.
print(X.loc[:, "account_age_days"])



0        0
1        0
2        0
3        0
4        0
        ..
32423    0
32424    0
32425    0
32426    0
32427    0
Name: account_age_days, Length: 32428, dtype: int64
