In [1]:
import os
import sys
import warnings

# Suppress RuntimeWarning output for the rest of the notebook session.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# The parent of the current working directory is treated as the project root
# and is put at the front of sys.path (once) so that the `src` package can be
# imported from the cells below.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Absolute path of the dependency manifest at the project root.
requirements_path = os.path.join(project_root, 'requirements.txt')

In [2]:
import os
import torch
import numpy as np
import pandas as pd

import src.data_handling as data_handling
import src.model.torch_model as t


# Model artefact locations.
# NOTE(review): these paths are relative (not joined with project_root) and are
# not referenced by the cells visible in this notebook — confirm they are
# still needed and resolve correctly from the notebook's working directory.
PRODUCTION_MODEL_FOLDER_PATH = 'models/production'
DFN_FILE_PATH, GBM_FILE_PATH, EN_FILE_PATH = (
    os.path.join(PRODUCTION_MODEL_FOLDER_PATH, model_file)
    for model_file in ('dfn_best.pth', 'gbm_best.pth', 'en_best.pth')
)

PREPROCESSOR_PATH = 'preprocessors/column_transformer.pkl'

# Read the raw CSV from <project_root>/data/raw and pass it through the
# project's column-name sanitiser before previewing the first rows.
file_name = 'online_retail.csv'
file_path = os.path.join(project_root, 'data', 'raw', file_name)
df = data_handling.scripts.sanitize_column_names(df=pd.read_csv(file_path))
df.head()


In [3]:
# Run the project's missing-value structuring step first, then feed its result
# straight into the feature-engineering step; `df` is rebound to the outcome.
df = data_handling.scripts.handle_feature_engineering(
    df=data_handling.scripts.structure_missing_values(df=df)
)
df.head()

In [4]:
# Display the column labels of `df` after the feature-engineering cell.
df.columns

In [5]:
from sklearn.model_selection import train_test_split

# Ask the project helper which columns are numeric and which are categorical,
# excluding the prediction target.
target_col = 'quantity'
num_cols, cat_cols = data_handling.scripts.categorize_num_cat_cols(df=df, target_col=target_col)

# Cast every categorical column to pandas' string dtype; an empty or missing
# list of categorical columns means there is nothing to cast.
for col in cat_cols or []:
    df[col] = df[col].astype('string')


# Target vector and feature matrix (everything except the target column).
y = df[target_col]
X = df.copy().drop(columns=target_col)
X.info()


In [6]:
test_size = 50000
random_state = 42

# Both splits share the same settings. An integer test_size is an absolute row
# count, so the test set and the validation set each receive `test_size` rows
# and the remainder is used for training.
split_kwargs = dict(test_size=test_size, random_state=random_state, shuffle=True)
X_tv, X_test, y_tv, y_test = train_test_split(X, y, **split_kwargs)
X_train, X_val, y_train, y_val = train_test_split(X_tv, y_tv, **split_kwargs)
X_train.info()


In [7]:
# Run the three splits through the project's input transformation. The call
# returns the transformed splits together with the `preprocessor` object, whose
# .transform() is reused in later cells on new inputs.
# NOTE(review): presumably the preprocessor is fitted on X_train only — confirm
# inside transform_input.
X_train, X_val, X_test, preprocessor = data_handling.scripts.transform_input(X_train, X_val, X_test, num_cols=num_cols, cat_cols=cat_cols)


In [8]:
import src.model.torch_model as t

# Load the model stored at <project_root>/models/production/dfn_best.pth.
# input_dim is the number of columns of the transformed training matrix.
file_path = os.path.join(project_root, 'models', 'production', 'dfn_best.pth')
model = t.scripts.load_model(input_dim=X_train.shape[1], file_path=file_path)


In [9]:

# Stock code analysed by the remaining cells.
stockcode = '85123A'

# Re-read and sanitise the raw CSV: `df` was overwritten with the engineered
# frame by the earlier cells, and the subset below starts from raw rows.
file_name = 'online_retail.csv'
file_path = os.path.join(project_root, 'data', 'raw', file_name)
df = pd.read_csv(file_path)
df = data_handling.scripts.sanitize_column_names(df=df)

# .copy() detaches the subset from `df`, so the helper functions below can add
# or overwrite columns without pandas emitting SettingWithCopyWarning.
df_stockcode = df[df['stockcode'] == stockcode].copy()

# Fail here with a clear message instead of letting an empty frame surface as
# an obscure error in the later split/training cells.
if df_stockcode.empty:
    raise ValueError(f"no rows found for stockcode {stockcode!r} in {file_path}")

# Distinct target values before the helpers run ...
print(df_stockcode['quantity'].unique())

df_stockcode = data_handling.scripts.structure_missing_values(df=df_stockcode)
df_stockcode = data_handling.scripts.handle_feature_engineering(df=df_stockcode)

# ... and after, so any change the helpers make to `quantity` is visible.
print(df_stockcode['quantity'].unique())


In [10]:

# Features / target for the single stock code (independent copies of the subset).
y = df_stockcode[target_col].copy()
X = df_stockcode.drop(columns=target_col).copy()

# Hold out 1000 rows of the stock-code subset; this rebinds the X_train /
# X_test / y_train / y_test names created by the full-dataset split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1000, random_state=random_state, shuffle=True
)

# Reuse the already-built preprocessor on both partitions.
X_train, X_test = (preprocessor.transform(part) for part in (X_train, X_test))

batch_size = 32
train_data_loader = t.scripts.create_torch_data_loader(X=X_train, y=y_train, batch_size=batch_size)
# NOTE(review): X_val / y_val still come from the full-dataset split, while the
# stock-code hold-out (X_test / y_test) is not used in this cell — confirm that
# validating against the global set is intended.
val_data_loader = t.scripts.create_torch_data_loader(X=X_val, y=y_val, batch_size=batch_size)

# Continue training the loaded model on the stock-code rows.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.MSELoss()
model, _ = t.scripts.train_model(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    num_epochs=50,
    min_delta=0.00001,
    patience=10,
    train_data_loader=train_data_loader,
    val_data_loader=val_data_loader,
    device_type='cpu',
)

In [11]:
from src._utils import main_logger
import pandas as pd
import datetime

import src.model.torch_model as t

# Load the model stored at <project_root>/models/production/dfn_best.pth.
# NOTE(review): this rebinds `model`, replacing the in-memory model returned by
# the preceding training cell; unless train_model persisted its result to this
# path, the fine-tuned weights are not the ones used below — confirm.
file_path = os.path.join(project_root, 'models', 'production', 'dfn_best.pth')
model = t.scripts.load_model(input_dim=X_train.shape[1], file_path=file_path)

# Candidate unit prices: NUM_PRICE_BINS evenly spaced values in
# [min_price, max_price]. The grid size is taken from NUM_PRICE_BINS so that it
# always matches the length of the other columns built below.
min_price = 2
max_price = 6
NUM_PRICE_BINS = 1000
price_range = np.linspace(min_price, max_price, num=NUM_PRICE_BINS)
print(len(price_range))

# One synthetic row per candidate price; every field except the price is held
# constant. `quantity` is a placeholder (it is the prediction target and is
# dropped before inference).
new_data = {
    'invoicedate': [np.datetime64(datetime.datetime.now())] * NUM_PRICE_BINS,
    'invoiceno': [np.nan] * NUM_PRICE_BINS,
    'stockcode': [stockcode] * NUM_PRICE_BINS,
    'quantity': [0] * NUM_PRICE_BINS,
    'customerid': [np.nan] * NUM_PRICE_BINS,
    'country': ['United Kingdom'] * NUM_PRICE_BINS,
    'unitprice': price_range
}
new_df = pd.DataFrame(new_data)
new_df = data_handling.scripts.structure_missing_values(df=new_df)
new_df = data_handling.scripts.handle_feature_engineering(df=new_df)


# Feature matrix for inference. Row order must stay identical to `new_df`:
# the predictions are later written back onto `new_df` by position, so
# shuffling here would pair each predicted quantity with the wrong unit price.
target_col = 'quantity'
X = new_df.copy().drop(target_col, axis=1).reset_index(drop=True)
X.head()


In [12]:

# Encode the price grid with the preprocessor when one is available.
if preprocessor:
    X = preprocessor.transform(X)

# Inference only: switch the network to eval mode and run without autograd.
model.eval()
input_tensor = torch.tensor(X, dtype=torch.float32)
epsilon = 1e-10
with torch.inference_mode():
    y_pred = model(input_tensor).cpu().numpy().flatten()

# NOTE(review): exponentiating presumably undoes a log transform applied to the
# target during feature engineering — confirm, including the role of epsilon.
y_pred_actual = np.exp(y_pred + epsilon)
main_logger.info(f"primary model's prediction for stockcode {stockcode} - actual quantity (units) {y_pred_actual}")



In [13]:
# Factor applied to every reported sales figure.
# NOTE(review): presumably scales a per-period estimate to a 30-period (e.g.
# 30-day) horizon — confirm the intended meaning.
SALES_MULTIPLIER = 30

# Attach the predicted quantities to the candidate prices (by position) and
# derive the predicted sales value of each candidate.
df_ = new_df.copy()
df_['quantity'] = y_pred_actual
df_['predicted_sales'] = df_['quantity'] * df_['unitprice']
df_ = df_.sort_values(by='unitprice')

# The optimal price is the candidate with the highest predicted sales
# (quantity x unit price). Selecting by highest quantity alone would make
# "max_predicted_sales" smaller than other entries' "predicted_sales".
optimal_row = df_.loc[df_['predicted_sales'].idxmax()]

optimal_price = optimal_row['unitprice']
best_sales = optimal_row['predicted_sales']

# One record per candidate price, in ascending price order; the optimum is
# repeated on every record.
all_outputs = [
    {
        "stockcode": stockcode,
        "unit_price": float(unit_price),
        "predicted_sales": float(sales) * SALES_MULTIPLIER,
        "optimal_unit_price": float(optimal_price),  # type: ignore
        "max_predicted_sales": float(best_sales) * SALES_MULTIPLIER,  # type: ignore
    }
    for unit_price, sales in zip(df_['unitprice'], df_['predicted_sales'])
]

print(optimal_price)