# Import

In [None]:
# Standard Imports
import os
import pandas as pd
import numpy as np
import time
import sys
import gc
from pathlib import Path
import pickle
import warnings
warnings.filterwarnings("ignore")

# Modelling
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
#%pip install lightgbm
import lightgbm as lgb
from sklearn.model_selection import (train_test_split, GridSearchCV)
import statsmodels.api as sm

#basic tools 
import os
import numpy as np
import pandas as pd
import warnings

#tuning hyperparameters
from bayes_opt import BayesianOptimization
from skopt  import BayesSearchCV 
#%pip install hgboost
from hgboost import hgboost
from hyperopt import hp
from skopt import gp_minimize

# Visualisation
import plotly.express as px
import plotly.graph_objects as go

# Src modules -- Update the * to get only what I need
sys.path.append(os.path.dirname(os.getcwd())) # Add the parent directory to the Python path so we can import src modules
from src.data_setup import *
from src.decision_tree import *
from src.model_evaluation import model_eval_pipeline,calc_root_mean_squared_error,calc_root_mean_squared_log_error,rmsle_func,rmsle_lgbm
from src.visualisation import *
from src.model_utils import *



# Load Data

In [None]:
# Load the cached, pre-processed frames from DATA_PATH/'processed'.
#
# The commented-out lines below write to the same three pickle paths and
# appear to be how the caches were generated; they are kept for reference.
#train, test, stores, transactions = get_data()

#data = get_oil_holiday_data()
#df_data=Transform_Data_For_DT(data,60,True)
#df_data_feats=DT_features(df_data,False)
#df_data.to_pickle(DATA_PATH/'processed/DT.pkl')
#df_data_feats.to_pickle(DATA_PATH/'processed/DT_Features.pkl')
#data.to_pickle(DATA_PATH/'processed/Data.pkl')


def _load_pickle(path):
    """Return the object unpickled from ``path``.

    The file is opened in a ``with`` block so the handle is closed even if
    unpickling fails.
    """
    # NOTE: unpickling can execute arbitrary code. Only load files that were
    # produced by this project.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


df_data = _load_pickle(DATA_PATH/'processed/DT.pkl')
df_data_feats = _load_pickle(DATA_PATH/'processed/DT_Features.pkl')
data = _load_pickle(DATA_PATH/'processed/Data.pkl')

# Data Processing

#### Create main dataframes

In [None]:
# Split every combined frame into its train / test partition using the
# `is_test` flag, then drop the flag (it is constant within each partition).
train = data[data['is_test'] == False].drop(columns=['is_test'])
test = data[data['is_test'] == True].drop(columns=['is_test'])

# `target` is cast to float before the feature frame is partitioned.
df_data_feats = df_data_feats.astype({'target': 'float'})

df_feats = df_data_feats[df_data_feats['is_test'] == False].drop(columns=['is_test'])
df_train = df_data[df_data['is_test'] == False].drop(columns=['is_test'])
df_test_feats = df_data_feats[df_data_feats['is_test'] == True].drop(columns=['is_test'])
df_test = df_data[df_data['is_test'] == True].drop(columns=['is_test'])

# Build a lookup table between the raw category values found in `train` and
# the integer codes a fresh LabelEncoder assigns to them. Each categorical
# column contributes two columns (`<col>` and `<col>_le`); rows that belong to
# another column hold NaN there, and the per-column row index is kept as is.
cat_list = ['family', 'city', 'state', 'type']
encoded_frames = []
for col in cat_list:
    categories = pd.DataFrame(train[col].unique(), columns=[col])
    categories[col + '_le'] = LabelEncoder().fit_transform(categories[col])
    encoded_frames.append(categories)
# A single concat replaces repeated DataFrame.append (removed in pandas 2.x).
df_le = pd.concat(encoded_frames)

####  Create Train and Val

In [None]:
# Parse the date columns so they can be used as real timestamps downstream.
df_train['date'] = pd.to_datetime(df_train['date'])
df_feats['date'] = pd.to_datetime(df_feats['date'])

# Drop the 'Unnamed: 0' column when it exists. errors='ignore' makes the
# "column absent" case a no-op without hiding any other failure.
df_train = df_train.drop(columns=['Unnamed: 0'], errors='ignore')
df_feats = df_feats.drop(columns=['Unnamed: 0'], errors='ignore')

# Create the train / validation split.
# NOTE(review): the meaning of the second argument (2) is defined by
# train_val_split in the src modules — confirm against its signature.
df_training, df_validation = train_val_split(df_train, 2)
df_feats_use, df_feats_validation = train_val_split(df_feats, 2)

# Model inputs: every feature column except identifiers, the date and the
# label columns. `sales` is used as the label.
excluded_cols = ['date', 'id', "sales", "day", 'target']
cols = [col for col in df_feats.columns if col not in excluded_cols]
Y_train = df_feats_use['sales']
X_train = df_feats_use[cols]
Y_val = df_feats_validation['sales']
X_val = df_feats_validation[cols]

# Data Exploration

In [None]:
# Monthly mean of sales ('MS' = month-start bins), decomposed additively.
# resample() needs a datetime-like index.
# NOTE(review): assumes train['date'] already holds datetimes — confirm.
train_plot = train.set_index('date')
y = train_plot['sales'].resample('MS').mean()

result = sm.tsa.seasonal_decompose(y, model='additive')
# result.plot() creates and returns its own figure, so no figure is created
# beforehand (doing so only left an extra empty figure in the cell output).
fig = result.plot()
fig.set_size_inches(8, 6)

In [None]:
# Presumably draws an interactive treemap of `train` over the month / family /
# store_nbr columns using the `sales` column.
# NOTE(review): the meaning of the positional arguments (10, the repeated
# 'sales', 3 and 'Blues') is defined by generate_interactive_treemap in the
# src modules — confirm against its signature.
generate_interactive_treemap(train,10, [ 'month', 'family','store_nbr'],'sales','sales',3,'Blues')

# Bayesian Optimisation - fine-tuning hyperparameters

In [None]:
def lgbm_evaluation(params):
    """Objective for ``gp_minimize``: fit LightGBM and return the validation RMSLE.

    Parameters
    ----------
    params : sequence of 7 numbers
        ``(num_leaves, learning_rate, max_depth, feature_fraction,
        bagging_fraction, bagging_freq, min_data_in_leaf)``, in that order.
        The integer-valued entries are cast with ``int``.

    Returns
    -------
    The value of ``rmsle_func(Y_val_temp, predictions)`` (lower is better).

    Notes
    -----
    The training and validation data are read from the module-level names
    ``X_train_temp``, ``Y_train_temp``, ``X_val_temp`` and ``Y_val_temp``;
    they must be bound before this function is called.
    """
    (num_leaves, learning_rate, max_depth, feature_fraction,
     bagging_fraction, bagging_freq, min_data_in_leaf) = params

    # The tuned values are passed under the scikit-learn parameter names
    # (colsample_bytree = feature_fraction, subsample = bagging_fraction,
    # subsample_freq = bagging_freq, min_child_samples = min_data_in_leaf).
    # Passing the LightGBM-native aliases next to the estimator's own defaults
    # creates conflicting pairs; resolving those by a random pick discarded
    # the tuned value about half of the time and made the objective noisy.
    model = lgb.LGBMRegressor(
        boosting_type='gbdt',
        objective='regression',
        metric='None',  # no built-in metric: the score is computed below
        num_leaves=int(num_leaves),
        learning_rate=learning_rate,
        max_depth=int(max_depth),
        colsample_bytree=feature_fraction,
        subsample=bagging_fraction,
        subsample_freq=int(bagging_freq),
        min_child_samples=int(min_data_in_leaf),
        verbose=-100,
    )

    # Logging is already silenced by verbose=-100 above, so the `verbose`
    # fit keyword (deprecated, later removed) is not passed.
    model.fit(X_train_temp, Y_train_temp)
    y_pred = model.predict(X_val_temp)

    # RMSLE on the validation data is the value gp_minimize minimises.
    return rmsle_func(Y_val_temp, y_pred)

In [None]:
# lgbm_evaluation reads its data from these module-level names. Bind them to
# the all-family split created earlier so this cell also runs on a fresh
# kernel (they are otherwise only assigned by the per-family training loop).
# NOTE(review): assumes every column of X_train is usable by LightGBM as is
# — confirm.
X_train_temp, Y_train_temp = X_train, Y_train
X_val_temp, Y_val_temp = X_val, Y_val

# Search bounds, in the order lgbm_evaluation unpacks them.
space = [
    (10, 400),  # num_leaves
    (0.01, 0.5),  # learning_rate
    (5, 200),  # max_depth
    (0.1, 0.9),  # feature_fraction
    (0.1, 0.9),  # bagging_fraction
    (0, 100),  # bagging_freq
    (20, 200)  # min_data_in_leaf
]
# Perform Bayesian Optimization to find the best hyperparameters.
# Each of the n_calls evaluations fits one model, so this cell is slow.
result = gp_minimize(lgbm_evaluation, space, n_calls=1000, random_state=42)

# Extract the best hyperparameters and their corresponding score
best_params = {
    'num_leaves': int(result.x[0]),
    'learning_rate': result.x[1],
    'max_depth': int(result.x[2]),
    'feature_fraction': result.x[3],
    'bagging_fraction': result.x[4],
    'bagging_freq': int(result.x[5]),
    'min_data_in_leaf': int(result.x[6])
}

# lgbm_evaluation returns the RMSLE itself (no sign flip), so the minimum
# found by gp_minimize is already the best score.
best_score = result.fun

# Print the best hyperparameters and the corresponding score
print("Best Hyperparameters:")
print(best_params)
print("Best RMSLE Score:")
print(best_score)

# Train Model

#### Train and validate model

In [None]:
# Parameters shared by every per-family model. The tuned values found by the
# search below are merged with these before training.
FIXED_PARAMS={'objective': 'regression',
              'metric': 'custom',
              'boosting':'gbdt',
              'min_gain_to_split':0.01,
              'num_boost_round':500,
              'early_stopping_rounds':50}

# Feature columns: `family` is excluded because one model is fitted per family.
cols_no_fam = [col for col in df_feats.columns if col not in ['date', 'id', "sales", "day",'target','family']]

# Search bounds, in the order lgbm_evaluation unpacks them. They do not depend
# on the family, so they are defined once outside the loop.
space = [
    (10, 200),  # num_leaves
    (0.01, 0.5),  # learning_rate
    (5, 100),  # max_depth
    (0.1, 0.9),  # feature_fraction
    (0.1, 0.9),  # bagging_fraction
    (0, 10),  # bagging_freq
    (20, 200)  # min_data_in_leaf
]

for fam in df_feats_use['family'].unique():
    temp = df_feats_use[df_feats_use['family'] == fam]
    temp_val = df_feats_validation[df_feats_validation['family'] == fam]
    # These four names are module-level on purpose: lgbm_evaluation reads them.
    Y_train_temp = temp['sales']
    X_train_temp = temp[cols_no_fam]
    Y_val_temp = temp_val['sales']
    X_val_temp = temp_val[cols_no_fam]

    # Perform Bayesian Optimization to find the best hyperparameters
    result = gp_minimize(lgbm_evaluation, space, n_calls=100, random_state=42)

    # Extract the best hyperparameters and their corresponding score
    best_params = {
        'num_leaves': int(result.x[0]),
        'learning_rate': result.x[1],
        'max_depth': int(result.x[2]),
        'feature_fraction': result.x[3],
        'bagging_fraction': result.x[4],
        'bagging_freq': int(result.x[5]),
        'min_data_in_leaf': int(result.x[6])
    }

    param = dict(best_params,**FIXED_PARAMS)

    # Train with early stopping, evaluated with the custom RMSLE function
    # on both the training and the validation set.
    lgbtrain = lgb.Dataset(data=X_train_temp, label=Y_train_temp, feature_name=cols_no_fam)
    lgbval = lgb.Dataset(data=X_val_temp, label=Y_val_temp, reference=lgbtrain, feature_name=cols_no_fam)
    model = lgb.train(params=param,
                    train_set=lgbtrain,
                    valid_sets=[lgbtrain, lgbval],
                    num_boost_round=FIXED_PARAMS['num_boost_round'],
                    early_stopping_rounds=FIXED_PARAMS['early_stopping_rounds'],
                    feval=rmsle_lgbm,
                    verbose_eval=50)
    # One validation model is stored per family.
    save_model(DECISIONTREE_PATH,model,f'lgbm_valid_{fam}.pkl')

In [None]:
#save_model(DECISIONTREE_PATH,model,'lgbm_valid.pkl')

##### Model information

In [None]:
# `model` is assigned inside the per-family training loop, so after that loop
# it refers to the model of the last family only; this plot therefore covers
# a single family, not all of them.
# NOTE(review): 30 is presumably the number of features to display — confirm
# against plot_lgb_importances in the src modules.
plot_lgb_importances(model,30)

### Train model on full train data

In [None]:
# Retrain one model per family on the full training data, reusing the
# parameters and the best iteration count of that family's validation model.

# Parameter names LightGBM accepts for the number of boosting rounds and for
# early stopping. If any of them is left in `params`, it takes precedence over
# the num_boost_round argument, so they are removed before retraining.
ITERATION_PARAM_ALIASES = ('num_iterations', 'num_iteration', 'n_iter', 'num_tree',
                           'num_trees', 'num_round', 'num_rounds', 'nrounds',
                           'num_boost_round', 'n_estimators', 'max_iter')
EARLY_STOPPING_PARAM_ALIASES = ('early_stopping_round', 'early_stopping_rounds',
                                'early_stopping', 'n_iter_no_change')
_control_params = set(ITERATION_PARAM_ALIASES) | set(EARLY_STOPPING_PARAM_ALIASES)

cols_no_fam = [col for col in df_feats.columns if col not in ['date', 'id', "sales", "day",'target','family']]
for fam in df_feats['family'].unique():
    temp = df_feats[df_feats['family'] == fam]
    Y_train_temp = temp['sales']
    X_train_temp = temp[cols_no_fam]

    model = load_model(DECISIONTREE_PATH, f'lgbm_valid_{fam}.pkl')

    # Copy the validation model's parameters (the loaded model is not
    # mutated) without the round-count / early-stopping controls: there is no
    # validation set here, and the round count is passed explicitly below.
    param = {key: value for key, value in model.params.items()
             if key not in _control_params}

    # Fall back to the number of trained rounds if no best iteration was recorded.
    n_rounds = model.best_iteration
    if not n_rounds or n_rounds <= 0:
        n_rounds = model.current_iteration()

    lgbtrain = lgb.Dataset(data=X_train_temp, label=Y_train_temp, feature_name=cols_no_fam)
    final_model = lgb.train(params=param,
                  train_set=lgbtrain,
                  num_boost_round=n_rounds,
                  feval=rmsle_lgbm)
    # Persist the model trained on the full data (not the validation model).
    save_model(DECISIONTREE_PATH, final_model, f'lgbm_all_{fam}.pkl')

# Create Submission

##### Prediction

In [None]:
# Predict the test rows store by store with each family's full-data model and
# collect the results in `submission` (columns: id, sales).
submission_parts = []
cols_no_fam = [col for col in df_feats.columns if col not in ['date', 'id', "sales", "day",'target','family']]
store_numbers = df_feats.store_nbr.unique()
for fam in df_feats['family'].unique():
    print(fam)
    # Both values depend on the family only, so they are resolved once per family.
    final_model = load_model(DECISIONTREE_PATH, f'lgbm_all_{fam}.pkl')
    # The feature frames hold the encoded family, `test` holds the raw value;
    # df_le maps one to the other (.item() requires exactly one match).
    family_name = df_le[df_le['family_le'] == fam].family.item()

    for sto in store_numbers:
        temp = df_test_feats[(df_test_feats['family'] == fam) & (df_test_feats['store_nbr'] == sto)]
        temp_test = test[(test['family'] == family_name) & (test['store_nbr'] == sto)]
        # Only rows without a known sales value need a prediction.
        temp_test = temp_test.loc[temp_test.sales.isna()]
        X_test = temp[cols_no_fam]
        if X_test.empty and temp_test.empty:
            # Nothing to predict for this family / store combination.
            continue
        train_temp = df_feats[(df_feats['family'] == fam) & (df_feats['store_nbr'] == sto)]

        test_preds = final_model.predict(X_test, num_iteration=final_model.best_iteration)
        # Sales cannot be negative.
        test_preds = np.clip(test_preds, 0, None)
        # If the most recent 21 training rows of this series sold nothing,
        # predict zero. tail() on the date-sorted frame takes the last rows by
        # position; .loc[-21:] is a label slice and does not.
        # NOTE(review): assumes one row per day for each family / store — confirm.
        if train_temp.sort_values('date').tail(21).sales.sum() == 0:
            test_preds[:] = 0

        # NOTE(review): assumes `temp` and `temp_test` list the same rows in
        # the same order; a length mismatch raises on the assignment below.
        submission_df = temp_test.loc[:, ['id', 'sales']].copy()
        submission_df['sales'] = test_preds
        submission_df['id'] = submission_df.id.astype(int)
        submission_parts.append(submission_df)

# A single concat replaces repeated DataFrame.append (removed in pandas 2.x).
if submission_parts:
    submission = pd.concat(submission_parts)
else:
    submission = pd.DataFrame(columns=['id', 'sales'])


##### Output

In [None]:
#Sort Values
submission=submission.sort_values('id')

#Saving the submission file !!!Update the name of the file!!!
submission.to_csv(SUBMISSION_PATH/'submission_lgbm_11.csv', index=False)