<a href="https://colab.research.google.com/github/dannsb/pattern-recognition/blob/main/forecasting-retail-purchases.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install catboost



In [None]:
!pip install scikit-learn==1.5.2



### Import libraries & load data

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

In [None]:
def load_data(train_path, test_path):
    """Read the train and test CSV files and parse their ``time`` column.

    Parameters
    ----------
    train_path, test_path
        Locations passed unchanged to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(train, test)`` DataFrames whose ``time`` column has been converted
        with ``pd.to_datetime``.
    """
    # Both files are read before any conversion happens.
    frames = [pd.read_csv(path) for path in (train_path, test_path)]
    for frame in frames:
        frame['time'] = pd.to_datetime(frame['time'])

    train_frame, test_frame = frames
    return train_frame, test_frame

### Feature extraction

In [None]:
def feature_engineering(data):
    """Add calendar and recency features derived from the ``time`` column.

    The frame is modified in place and also returned, so both
    ``df = feature_engineering(df)`` and a bare call work.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a datetime ``time`` column and a ``user`` column.

    Returns
    -------
    pd.DataFrame
        The same object, with these columns added:

        * ``day_of_week`` - 0 (Monday) to 6 (Sunday)
        * ``month``, ``day`` - calendar month and day of month
        * ``is_weekend`` - 1 for Saturday/Sunday, else 0
        * ``is_first_half_month`` - 1 when ``day <= 15``, else 0
        * ``days_since_last_purchase`` - whole days between a row and the
          same user's chronologically previous row; ``-1`` for a user's
          earliest row. Despite the name, the gap is measured to the
          previous *record*, not the previous purchase: no purchase label
          is consulted.
    """
    timestamps = data['time']

    data['day_of_week'] = timestamps.dt.dayofweek
    data['month'] = timestamps.dt.month
    data['day'] = timestamps.dt.day
    data['is_weekend'] = data['day_of_week'].isin([5, 6]).astype(int)
    data['is_first_half_month'] = (data['day'] <= 15).astype(int)

    # A plain groupby().diff() uses whatever order the rows arrive in, so
    # unsorted input would yield negative gaps (which can also collide with
    # the -1 sentinel). Compute the gap on a chronological ordering instead
    # and write it back by position, which also works with a non-unique index.
    positions = np.argsort(timestamps.to_numpy(), kind='stable')
    chronological = data[['user', 'time']].iloc[positions]
    gap_days = (
        chronological.groupby('user')['time']
        .diff()
        .dt.days
        .fillna(-1)
        .to_numpy()
    )
    restored = np.empty(len(data), dtype=float)
    restored[positions] = gap_days
    data['days_since_last_purchase'] = restored

    return data

### Define models

In [None]:
def define_models(random_state=42, scoring='f1'):
    """Build the tuned base learners used by the stacking ensemble.

    Each base learner is wrapped in a ``RandomizedSearchCV`` that samples
    four candidates from its grid and evaluates them with 3-fold CV.

    Parameters
    ----------
    random_state : int, default 42
        Seed given to every classifier and to every randomized search, so
        the sampled hyper-parameter candidates are the same on each run.
    scoring : str or callable, default 'f1'
        Metric the searches optimise. F1 is the metric reported when the
        models are evaluated, so tuning targets the same quantity; pass
        ``None`` to fall back to each estimator's default score.

    Returns
    -------
    list of (str, RandomizedSearchCV)
        ``[('XGB', ...), ('RF', ...), ('CatBoost', ...)]``, in the form
        expected by ``StackingClassifier(estimators=...)``.
    """
    xgb_params = {
        'n_estimators': [100, 500],
        'max_depth': [6, 12],
        'learning_rate': [0.01, 0.1],
        'subsample': [0.6, 0.8]
    }

    rf_params = {
        'n_estimators': [100, 200],
        'max_depth': [6, 10]
    }

    catboost_params = {
        'iterations': [500, 1000],
        'depth': [6, 10],
        'learning_rate': [0.01, 0.1]
    }

    def _tuned(estimator, param_grid):
        # One shared search configuration for all base learners.
        return RandomizedSearchCV(
            estimator,
            param_grid,
            n_iter=4,
            cv=3,
            n_jobs=-1,
            scoring=scoring,
            random_state=random_state,
        )

    return [
        ('XGB', _tuned(
            XGBClassifier(random_state=random_state, eval_metric='logloss'),
            xgb_params,
        )),
        ('RF', _tuned(
            RandomForestClassifier(random_state=random_state),
            rf_params,
        )),
        ('CatBoost', _tuned(
            CatBoostClassifier(random_state=random_state, verbose=0),
            catboost_params,
        )),
    ]

### Define the train-and-evaluate function

In [None]:
def train_and_evaluate(train, test, user_ids):
    """Fit one stacking ensemble per user, score it, and predict test rows.

    For every user the training rows from the first purchase onwards are
    split 80/20, a ``StackingClassifier`` built from ``define_models()`` is
    fitted on the larger part, and its F1 score on the held-out part is
    printed. The fitted model then predicts that user's rows in ``test``.

    Parameters
    ----------
    train : pd.DataFrame
        Needs ``user``, ``time`` and ``bought`` columns; every other column
        is used as a feature.
    test : pd.DataFrame
        Needs ``user``, ``time`` and every feature column present in
        ``train``. Additional columns are ignored.
    user_ids : iterable
        Users to process, compared against the ``user`` column.

    Returns
    -------
    list of dict
        One ``{'ID': '<user>_<YYYY-MM-DD>', 'bought': prediction}`` entry
        per predicted test row. Users that cannot be modelled (no purchase
        in ``train``, or only one class after filtering) are skipped with a
        printed notice and contribute no entries.
    """
    scores = []
    results = []

    for user in user_ids:
        user_data = train[train['user'] == user]
        # Drop the rows that precede the user's first purchase.
        user_data = user_data[user_data['bought'].cumsum() > 0]

        X = user_data.drop(columns=['bought', 'user', 'time'])
        y = user_data['bought']

        # An empty or single-class target cannot be split and fitted by the
        # classifiers; skip the user instead of aborting the whole run.
        if y.nunique() < 2:
            print(f"User {user} - skipped: training data has fewer than two classes")
            continue

        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

        estimators = define_models()
        final_model = StackingClassifier(estimators=estimators, final_estimator=XGBClassifier(random_state=42))
        final_model.fit(X_train, y_train)

        y_pred = final_model.predict(X_val)
        # zero_division=0 keeps the 0.0 result for folds without positives
        # but silences the undefined-metric warning.
        score = f1_score(y_val, y_pred, zero_division=0)
        scores.append(score)

        print(f"User {user} - F1 Score: {score:.4f}")

        user_test = test[test['user'] == user]
        if not user_test.empty:
            # Select exactly the training features, in training order, so
            # extra or reordered test columns cannot break prediction.
            X_test = user_test[X.columns]

            predictions = final_model.predict(X_test)

            results.extend(
                {'ID': f"{user}_{timestamp.date()}", 'bought': pred}
                for timestamp, pred in zip(user_test['time'], predictions)
            )

    if scores:
        print(f"Average F1 Score: {np.mean(scores):.4f}")
    else:
        print("Average F1 Score: n/a (no user could be evaluated)")
    return results

### Report scores and save the submission

In [None]:

# Input CSVs are read from the notebook's working directory.
train_path = './train.csv'
test_path = './test.csv'

# Load both frames, then add the engineered features to each.
train, test = load_data(train_path, test_path)
train = feature_engineering(train)
test = feature_engineering(test)

train_users = train['user'].unique()
test_users = test['user'].unique()  # not used below; kept for inspection

# One model per user seen in training; returns a list of prediction records.
submission = train_and_evaluate(train, test, train_users)

# Naming the columns explicitly keeps this working when `submission` is
# empty: a column-less frame would make drop_duplicates(subset=['ID'])
# raise KeyError. Only the first row per ID is kept.
submission_df = (
    pd.DataFrame(submission, columns=['ID', 'bought'])
    .drop_duplicates(subset=['ID'], keep='first')
)
submission_df.to_csv('submission.csv', index=False)


User 0 - F1 Score: 1.0000
User 1 - F1 Score: 0.3571
User 2 - F1 Score: 0.6471
User 3 - F1 Score: 0.0000
User 4 - F1 Score: 1.0000
User 5 - F1 Score: 0.7692
User 6 - F1 Score: 0.0000
User 7 - F1 Score: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


User 8 - F1 Score: 0.0000
User 9 - F1 Score: 0.8852
User 10 - F1 Score: 0.0000
User 11 - F1 Score: 0.0000
User 12 - F1 Score: 0.6154
User 13 - F1 Score: 0.3750
User 14 - F1 Score: 0.6667
User 15 - F1 Score: 0.0000
User 16 - F1 Score: 0.8000
User 17 - F1 Score: 0.1818
User 18 - F1 Score: 0.5000
User 19 - F1 Score: 0.4444
User 20 - F1 Score: 0.1600
User 21 - F1 Score: 0.0000
User 22 - F1 Score: 0.0000
User 23 - F1 Score: 0.8276
User 24 - F1 Score: 0.9333
User 25 - F1 Score: 0.6875
User 26 - F1 Score: 0.0000
User 27 - F1 Score: 0.0000
User 28 - F1 Score: 0.7727
User 29 - F1 Score: 0.0000
User 30 - F1 Score: 0.0000
User 31 - F1 Score: 0.4167
User 32 - F1 Score: 0.0000
Average F1 Score: 0.3648
