In [5]:
import os
import pandas as pd
import numpy as np
import datetime

In [6]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import ttest_ind

In [7]:
from config import input_data_xlsx_filepath, input_data_csv_filepath, date_column, date_prefix_list, trained_models_folderpath, \
date_column, ad_platform_list, target_dict, train_size, feature_list, model_dict, save_val_end_points, n_trials_optuna, all_predictions_csv_filepath, \
all_validation_metrics_filepath, all_ranked_products_filepath, all_actual_products_selected_filepath, all_pred_products_selected_filepath, all_ab_test_results_filepath
from data_load import load_data
from data_preprocessing import preprocess_data
from feature_engineering import engineer_features
from model_training import set_train_test_val_size, train_model
from metrics_prediction import predict_metric
from model_revalidation import compute_metrics_by_category
from product_scoring import rank_products
from AB_testing import ab_test_selection

In [8]:
load_data(input_data_xlsx_filepath, input_data_csv_filepath)
input_df = pd.read_csv(input_data_csv_filepath)
input_df.set_index('product_id', inplace=True)
print('Preprocessing data...')
preprocessed_df = preprocess_data(input_df, date_column)
print('Feature engineering...')
featured_df = engineer_features(preprocessed_df, date_column, date_prefix_list, ad_platform_list)
val_first_day, val_last_day, val_last_day_str = save_val_end_points(featured_df.copy())

dummy_columns = [col for col in featured_df.columns if any(col.startswith(prefix) for prefix in date_prefix_list)]
features = feature_list + dummy_columns

predictions_df_list = []
for ad_platform in ad_platform_list:
    for target in target_dict[ad_platform]:            
        print('Model training...')
        orig_X_val, orig_y_val = set_train_test_val_size(featured_df, val_first_day, val_last_day, train_size, target, features, 
                                                         date_column, date_prefix_list, 
                                                         dummy_columns, encoding=False)        
        X_train, X_test, X_val, y_train, y_test, y_val, encoders, scalers = set_train_test_val_size(featured_df, val_first_day, 
                                                                                                    val_last_day, train_size, 
                                                                                                    target, features, 
                                                                                                    date_column, date_prefix_list, 
                                                                                                    dummy_columns, encoding=True)
        train_model(featured_df, ad_platform, target, features, val_last_day_str, 
                    trained_models_folderpath, X_train, X_test, y_train, y_test, X_val, y_val, encoders, scalers, model_dict, 
                    n_trials_optuna)
        print('Metrics prediction...')
        prediction_df = predict_metric(trained_models_folderpath, val_last_day_str, X_val, y_val, orig_X_val, target, ad_platform, model_dict)
        predictions_df_list.append(prediction_df)

all_predictions_df = pd.concat(predictions_df_list)
all_predictions_df.to_csv(all_predictions_csv_filepath, index=True)

print('Model Validation...')
metrics_df = compute_metrics_by_category(all_predictions_df, all_validation_metrics_filepath)

print('Product Scoring...')
all_ranked_products_df = rank_products(all_predictions_df, metrics_df, all_ranked_products_filepath)

print('AB testing')
all_actual_products_selected_df, all_pred_products_selected_df, all_ab_test_results_df = ab_test_selection(all_ranked_products_df, all_predictions_df,\
                                                                                                           all_actual_products_selected_filepath, \
                                                                                                           all_pred_products_selected_filepath, \
                                                                                                           all_ab_test_results_filepath)

Preprocessing data...
Feature engineering...
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
Model training...
<class 'pandas.core.frame.DataFrame'>
Index: 23798 entries, 000d864b-c092-4160-8eda-e9efe2f6f639 to fff05380-fae8-44b2-b2ea-f62969ac5706
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   category                       23798 non-null  object        
 1   sale_price                     23798 non-null  float64       
 2   discount_rate                  23798 non-null  float64       
 3   product_age                    23798 non-null  int64         
 4   product_status                 23798 non-null  object        
 5   pct_product_variants_in_stock  23798 non-null  float64       
 6   date                           23798 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(1), object(2)
memory usage: 1.5+ MB
None


[I 2025-06-24 10:36:30,393] A new study created in memory with name: no-name-71d90a38-832f-41e8-9b16-96d0ca555298


Optimizing LinearRegression...


[I 2025-06-24 10:36:30,647] Trial 0 finished with value: 5.73082728421788e-09 and parameters: {}. Best is trial 0 with value: 5.73082728421788e-09.
[I 2025-06-24 10:36:30,824] A new study created in memory with name: no-name-62f628d6-5efb-479c-9934-d667fab03c3c


LinearRegression - Best MSE: 5.73082728421788e-09, R2: 1.0, Best Params: {}
Optimizing DecisionTree...


[I 2025-06-24 10:36:31,521] Trial 0 finished with value: 1.6981569375800987e+20 and parameters: {'max_depth': 10}. Best is trial 0 with value: 1.6981569375800987e+20.
[I 2025-06-24 10:36:32,224] A new study created in memory with name: no-name-c503a014-0276-41fc-bf19-5fa6207ec4a0


DecisionTree - Best MSE: 1.700314168861919e+20, R2: 0.8505718839015095, Best Params: {'max_depth': 10}
Optimizing KNeighbors...


[I 2025-06-24 10:36:45,130] Trial 0 finished with value: 1.3893309484819317e+19 and parameters: {'n_neighbors': 4}. Best is trial 0 with value: 1.3893309484819317e+19.


KNeighbors - Best MSE: 1.3893309484819317e+19, R2: 0.9877901913616387, Best Params: {'n_neighbors': 4}

Top 3 Models:
LinearRegression: MSE = 5.73082728421788e-09, R2 = 1.0
KNeighbors: MSE = 1.3893309484819317e+19, R2 = 0.9877901913616387
DecisionTree: MSE = 1.700314168861919e+20, R2 = 0.8505718839015095

Ensemble Weights:
LinearRegression: 0.35231586861844916
KNeighbors: 0.34801415928235985
DecisionTree: 0.299669972099191
Metrics prediction...
Model training...
<class 'pandas.core.frame.DataFrame'>
Index: 23798 entries, 000d864b-c092-4160-8eda-e9efe2f6f639 to fff05380-fae8-44b2-b2ea-f62969ac5706
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   category                       23798 non-null  object        
 1   sale_price                     23798 non-null  float64       
 2   discount_rate                  23798 non-null  float64       
 3   product_age             

[I 2025-06-24 10:37:25,168] A new study created in memory with name: no-name-a1e98f2d-1961-482b-899c-0b65da0abcb3
[I 2025-06-24 10:37:25,329] Trial 0 finished with value: 8.859090715106326e+21 and parameters: {}. Best is trial 0 with value: 8.859090715106326e+21.


Optimizing LinearRegression...


[I 2025-06-24 10:37:25,550] A new study created in memory with name: no-name-27705441-9a0d-4b08-85ca-b7ede67763d9


LinearRegression - Best MSE: 8.859090715106326e+21, R2: 0.022229087677207215, Best Params: {}
Optimizing DecisionTree...


[I 2025-06-24 10:37:26,402] Trial 0 finished with value: 9.782602005407278e+21 and parameters: {'max_depth': 10}. Best is trial 0 with value: 9.782602005407278e+21.
[I 2025-06-24 10:37:27,160] A new study created in memory with name: no-name-9d082edc-1eb7-4475-b20a-48500a40ac4b


DecisionTree - Best MSE: 9.965056547700913e+21, R2: -0.09983549613949294, Best Params: {'max_depth': 10}
Optimizing KNeighbors...


[I 2025-06-24 10:37:39,566] Trial 0 finished with value: 1.0220575392340912e+22 and parameters: {'n_neighbors': 4}. Best is trial 0 with value: 1.0220575392340912e+22.


KNeighbors - Best MSE: 1.0220575392340912e+22, R2: -0.12803691114626092, Best Params: {'n_neighbors': 4}

Top 3 Models:
LinearRegression: MSE = 8.859090715106326e+21, R2 = 0.022229087677207215
DecisionTree: MSE = 9.965056547700913e+21, R2 = -0.09983549613949294
KNeighbors: MSE = 1.0220575392340912e+22, R2 = -0.12803691114626092

Ensemble Weights:
LinearRegression: -0.10809535519812413
DecisionTree: 0.485478917231716
KNeighbors: 0.6226164379664081
Metrics prediction...
Model Validation...


TypeError: compute_metrics_by_category() takes 1 positional argument but 2 were given

In [None]:
# metrics_df = compute_metrics_by_category(all_predictions_df, all_validation_metrics_filepath)

In [None]:
# metrics_df

In [None]:
# all_ranked_products_df = rank_products(all_predictions_df, metrics_df, all_ranked_products_filepath)

In [None]:
# all_actual_products_selected_df, all_pred_products_selected_df, all_ab_test_results_df = ab_test_selection(all_ranked_products_df, all_predictions_df,\
#                                                                                                            all_actual_products_selected_filepath, \
#                                                                                                            all_pred_products_selected_filepath, \
#                                                                                                            all_ab_test_results_filepath)