In [1]:
import pandas as pd
from datetime import timedelta
import holidays
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, TargetEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import TomekLinks
import numpy as np

In [2]:
# Load the raw sales records; the CSV's "Index" column becomes the row index.
data = pd.read_csv(
    filepath_or_buffer="sales_assortments.csv",
    index_col="Index",
)
# Preview five random rows (the last expression of the cell is rendered).
data.sample(n=5)

Unnamed: 0_level_0,Date,Sales,Assortment
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
691,2018 aug 29,155425.926411,drink
842,2018 mar 02,6982.005766,savory_snacks
500,2017 mar 20,23749.154835,savory_snacks
1193,2019 mar 18,7795.688284,savory_snacks
888,2018 may 17,10404.772239,candy


In [3]:
# Parse date strings in "<year> <month abbreviation> <day>" form into timestamps.
data["Date"] = pd.to_datetime(data["Date"], format="%Y %b %d")

# Build a US holiday calendar covering every calendar year present in the data.
min_year, max_year = data["Date"].min().year, data["Date"].max().year
year_range = range(min_year, max_year + 1)
us_holidays = holidays.US(years=year_range)


def check_next_week_holiday(date, horizon_days=7, calendar=None):
    """Flag whether a holiday falls within the days following ``date``.

    Args:
        date: A date-like object that supports ``date + timedelta(...)``.
        horizon_days: Number of days after ``date`` to inspect. The window is
            ``date + 1 day`` through ``date + horizon_days`` inclusive; ``date``
            itself is never checked. Defaults to 7 (the following week).
        calendar: Container supporting ``in`` for the shifted dates. When
            omitted, the notebook-level ``us_holidays`` calendar is used.

    Returns:
        1 if at least one day in the window is in ``calendar``, otherwise 0.
    """
    if calendar is None:
        # Resolved at call time so the function keeps working with the
        # notebook's global calendar when called with a single argument.
        calendar = us_holidays
    return int(
        any(
            (date + timedelta(days=offset)) in calendar
            for offset in range(1, horizon_days + 1)
        )
    )

In [4]:
# Calendar-derived features.
data["DayOfWeek"] = data["Date"].dt.dayofweek  # Monday=0 ... Sunday=6
data["IsWeekend"] = data["DayOfWeek"].isin((5, 6)).astype(int)  # Saturday/Sunday
data["Month"] = data["Date"].dt.month

# Holiday flags: the date itself, and any holiday in the following days.
data["IsRealHoliday"] = data["Date"].apply(lambda day: int(day in us_holidays))
data["IsHolidayNextWeek"] = data["Date"].apply(check_next_week_holiday)

In [5]:
# Spot-check five random rows after adding the calendar/holiday features.
data.sample(n=5)

Unnamed: 0_level_0,Date,Sales,Assortment,DayOfWeek,IsWeekend,Month,IsRealHoliday,IsHolidayNextWeek
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1324,2019-09-20,5206.130645,drink,4,0,9,0,0
916,2018-11-14,62272.783771,savory_snacks,2,0,11,0,0
387,2017-02-25,3747.709538,savory_snacks,5,1,2,0,0
501,2017-03-21,225577.474303,savory_snacks,1,0,3,0,0
818,2018-06-07,6987.707146,candy,3,0,6,0,0


In [6]:
# Discretise the continuous "Sales" value into three ordered classes.
labels = ["low", "medium", "high"]
n_bins = 3

try:
    # Preferred: quantile-based bins (q=n_bins).
    data["SalesClass"] = pd.qcut(x=data["Sales"], q=n_bins, labels=labels)
except ValueError:
    # Fallback when qcut raises ValueError: equal-width bins over the value range.
    data["SalesClass"] = pd.cut(x=data["Sales"], bins=n_bins, labels=labels)

# Integer code of each category; this is what the classifiers are trained on.
data["SalesClassCode"] = data["SalesClass"].astype("category").cat.codes

# Class sizes in category order, followed by a random sample of the new columns.
print(data["SalesClass"].value_counts().sort_index())
data.loc[:, ["Sales", "SalesClass", "SalesClassCode"]].sample(n=5)

SalesClass
low       567
medium    566
high      567
Name: count, dtype: int64


Unnamed: 0_level_0,Sales,SalesClass,SalesClassCode
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
886,14829.822384,medium,1
725,73436.480838,high,2
585,9040.340778,medium,1
656,29981.607276,medium,1
1300,35747.56564,medium,1


In [7]:
# Spot-check five random rows of the full frame, including the class columns.
data.sample(n=5)

Unnamed: 0_level_0,Date,Sales,Assortment,DayOfWeek,IsWeekend,Month,IsRealHoliday,IsHolidayNextWeek,SalesClass,SalesClassCode
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1234,2019-11-13,226870.709217,savory_snacks,2,0,11,0,0,high,2
976,2018-09-13,173257.8342,drink,3,0,9,0,0,high,2
430,2017-07-09,3849.863711,candy,6,1,7,0,0,low,0
1343,2020-04-09,75926.281935,candy,3,0,4,0,0,high,2
1572,2020-03-26,41775.942946,candy,3,0,3,0,0,medium,1


In [8]:
# Features: every column except the raw date, the raw sales value and the
# two representations of the target.
X = data.drop(columns=["Date", "Sales", "SalesClass", "SalesClassCode"])
# Target: integer sales-class code.
y = data["SalesClassCode"]

# Hold out 20% of the rows for testing; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Confirm the encoder input (2-D, one column) and the target have matching row counts.
(X_train[["Assortment"]].shape, y_train.shape)

((1360, 1), (1360,))

In [10]:
# Target-encode the categorical "Assortment" column using the training labels.
# fit_transform uses an internal cross-fitting scheme whose folds are shuffled
# by default, so the seed is pinned to keep the encoded feature reproducible
# across reruns, like every other seeded step in this notebook.
# NOTE(review): y_train holds three integer class codes. Depending on the
# installed scikit-learn version, TargetEncoder may reject such a target or
# treat it as multiclass and emit one column per class -- confirm the output
# is a single column before relying on this assignment.
te = TargetEncoder(random_state=42)
X_train["Assortment"] = te.fit_transform(X_train[["Assortment"]], y_train)
# The test split is encoded with the statistics learned from the training split only.
X_test["Assortment"] = te.transform(X_test[["Assortment"]])

In [11]:
# Standardise every numeric feature using statistics from the training split only.
numeric_cols = list(X_train.select_dtypes(include=["number"]).columns)

scaler = StandardScaler()
scaler.fit(X_train[numeric_cols])

# Apply the same fitted transformation to both splits.
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])

In [12]:
# Baseline: gradient boosting trained on the untouched training split.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
clf.fit(X_train, y_train)

# Score both splits so the train/test gap is visible.
y_pred_train, y_pred_test = clf.predict(X_train), clf.predict(X_test)
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)

print(f"Train accuracy: {acc_train:.4f}")
print(f"Test accuracy:  {acc_test:.4f}")
print("\nClassification report (test):")
# target_names follows the order of `labels`, i.e. class codes 0, 1, 2.
print(classification_report(y_test, y_pred_test, target_names=labels))

Train accuracy: 0.6199
Test accuracy:  0.5618

Classification report (test):
              precision    recall  f1-score   support

         low       0.59      0.56      0.57       100
      medium       0.48      0.40      0.43       121
        high       0.60      0.73      0.66       119

    accuracy                           0.56       340
   macro avg       0.56      0.56      0.56       340
weighted avg       0.55      0.56      0.55       340



In [23]:
# Artificially unbalance the training set by shrinking one class to half the
# size of the largest class.
train_counts = y_train.value_counts()
print("Train class counts before manual downsample:", train_counts.to_dict())

classes_sorted = train_counts.sort_values(ascending=False)
max_count = train_counts.max()
# Reduce the second most frequent class, or the only class if there is just one.
class_to_reduce = classes_sorted.index[1 if len(classes_sorted) > 1 else 0]

target_count = max(1, int(0.5 * max_count))
belongs_to_reduced = y_train == class_to_reduce
idx_of_class = y_train[belongs_to_reduced].index

# Seeded draw (without replacement) of the rows of that class to keep.
np.random.seed(42)
reduced_idx = np.random.choice(
    idx_of_class,
    size=min(target_count, len(idx_of_class)),
    replace=False,
)

# All rows of the other classes plus the sampled rows of the reduced class.
keep_idx = y_train[~belongs_to_reduced].index.union(reduced_idx)
X_train_manual = X_train.loc[keep_idx].copy()
y_train_manual = y_train.loc[keep_idx].copy()

print(f"Reduced class {class_to_reduce} to {y_train_manual.value_counts().get(class_to_reduce,0)} (target {target_count})")
print("Train class counts after manual downsample:", y_train_manual.value_counts().to_dict())

Train class counts before manual downsample: {0: 467, 2: 448, 1: 445}
Reduced class 2 to 233 (target 233)
Train class counts after manual downsample: {0: 467, 1: 445, 2: 233}


In [24]:
# Same model configuration as the baseline, trained on the downsampled split
# and scored on the unchanged test split.
clf_manual = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
clf_manual.fit(X_train_manual, y_train_manual)

y_pred_manual = clf_manual.predict(X_test)
acc_manual = accuracy_score(y_test, y_pred_manual)
print(f"Accuracy (manual downsample train new clf): {acc_manual}")

Accuracy (manual downsample train new clf): 0.5


In [25]:
# Test accuracy per method name; filled in by the cells below.
results = dict()

# Resampling strategies to compare, in evaluation order.
methods = dict(
    [
        ("RandomOverSampler", RandomOverSampler(random_state=42)),
        ("SMOTE", SMOTE(random_state=42)),
        ("ADASYN", ADASYN(random_state=42)),
        ("TomekLinks", TomekLinks()),
    ]
)

In [26]:
# Resample the training split with each strategy, retrain the same model
# configuration, and record its accuracy on the untouched test split.
for name, sampler in methods.items():
    try:
        X_res, y_res = sampler.fit_resample(X_train, y_train)
    except ValueError as err:
        # A sampler that raises ValueError is reported and recorded as None
        # rather than aborting the comparison.
        print(f"{name} skipped: {err}")
        results[name] = None
    else:
        print(f"{name}: resampled counts {Counter(y_res)}")

        clf_res = GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            random_state=42,
        )
        clf_res.fit(X_res, y_res)

        y_pred = clf_res.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        results[name] = acc
        print(f"{name} accuracy: {acc:.4f}")

RandomOverSampler: resampled counts Counter({2: 467, 1: 467, 0: 467})
RandomOverSampler accuracy: 0.5706
SMOTE: resampled counts Counter({2: 467, 1: 467, 0: 467})
SMOTE accuracy: 0.5500
ADASYN skipped: No samples will be generated with the provided ratio settings.
TomekLinks: resampled counts Counter({1: 445, 0: 445, 2: 428})
TomekLinks accuracy: 0.5618


In [27]:
# Add the baseline and manual-downsample test accuracies, then list every method.
results.update(original_train=acc_test, manual_downsample=acc_manual)

print("\nSummary of accuracies:")
for method_name, accuracy in results.items():
    print(f"{method_name}: {accuracy}")


Summary of accuracies:
RandomOverSampler: 0.5705882352941176
SMOTE: 0.55
ADASYN: None
TomekLinks: 0.5617647058823529
original_train: 0.5617647058823529
manual_downsample: 0.5
