In [53]:
#importing Libraries
import os
import sys
from datetime import datetime

import dvc.api
import mlflow
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [52]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [54]:
# Import project-local helper modules.
#from scripts.ML_modelling_utils import *
# Both entries are relative paths, so they resolve against the current working
# directory at the time this cell runs.
sys.path.append(os.path.abspath(os.path.join('data')))
sys.path.insert(0,'../scripts/')
from Results import result_picker
from data_manipulation import DataManipulator
from data_information import DataInfo
from cleaner import DataCleaner
# (mlflow is already imported in the first cell; this repeat is harmless.)
import mlflow
# NOTE(review): `sns` is not imported in any visible cell, so this call raises
# NameError on a fresh kernel. Either add `import seaborn as sns` to the imports
# cell or drop the call (no plots are drawn in the visible cells).
sns.set()

# Container used near the end of the notebook to record the model's input columns.
results = result_picker()

# Loading Data

In [55]:
# Read the merged train/store table, then separate the features from the target.
merged_data = pd.read_csv('../data2/train_store.csv')
x_values = merged_data.drop(columns=['Sales'])
y_values = merged_data['Sales']

# Splitting Data

In [67]:
# Two-stage split: hold out 20% as the test set, then take 20% of what is left
# as the validation set. Both stages use the same fixed seed.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    x_values, y_values, test_size=0.2, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42
)
print("split was successful.")

split was successful.


In [69]:
# Baseline pipeline: a single random-forest regression step with a fixed seed.
random_forest_step = RandomForestRegressor(max_depth=10, random_state=2)
model_pipeline = Pipeline(steps=[('random_forest', random_forest_step)])

In [70]:
def eval_metrics(actual, pred):
    """Compute, print and return regression metrics for a set of predictions.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    pred : array-like
        Predicted values, aligned with ``actual``.

    Returns
    -------
    dict
        ``{'RMSE Score': rmse, 'R2_Squared': r2, 'MAE Score': mae}``.
    """
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    # RMSE and MAE are expressed in the units of the target, not as ratios, so
    # they are printed as plain numbers. The '%' format spec multiplies by 100
    # and appends a percent sign, which only makes sense for R2.
    print("RMSE Score is: {:.5f}".format(rmse))
    print("R2 Square Score is: {:.5%}".format(r2))
    print("MAE Score is: {:.5f}".format(mae))
    return {'RMSE Score': rmse, 'R2_Squared': r2, 'MAE Score': mae}

In [None]:
# Train the baseline pipeline inside an MLflow run and log its scores.
mlflow.autolog(log_input_examples=True, disable_for_unsupported_versions=True, silent=True)
with mlflow.start_run() as run:
    # Pipeline.fit returns the fitted pipeline itself.
    best_model = model_pipeline.fit(X_train, y_train)

    train_score = best_model.score(X_train, y_train)
    valid_score = best_model.score(X_valid, y_valid)
    valid_metrics = eval_metrics(y_valid, best_model.predict(X_valid))
    test_score = best_model.score(X_test, y_test)
    test_metrics = eval_metrics(y_test, best_model.predict(X_test))

    mlflow.log_metric("Train Score", train_score)
    mlflow.log_metric("Valid Score", valid_score)
    mlflow.log_metric("Test Score", test_score)
    # eval_metrics returns the same keys for every split, so logging the two
    # dicts as-is writes validation and test values to the same metric names.
    # Prefix them per split; the dicts themselves keep their original keys
    # because later cells index test_metrics['RMSE Score'].
    mlflow.log_metrics({"Valid " + name: value for name, value in valid_metrics.items()})
    mlflow.log_metrics({"Test " + name: value for name, value in test_metrics.items()})

# Parameter Tuning

In [72]:
# Replace the 'Sales' column of merged_data, in place, with the transformer's output.
# NOTE(review): `numeric_transformer` is not defined in any visible cell, so this
# fails on a fresh kernel — confirm where it is meant to come from.
# NOTE(review): y_values and the train/valid/test splits were taken before this
# line runs, so they still hold the untransformed 'Sales' values.
merged_data['Sales'] = numeric_transformer.fit_transform(merged_data[["Sales"]])

In [73]:
# Split the columns of merged_data into numeric and categorical groups by dtype.
NUMERIC_DTYPES = ['int64', 'float64', 'uint8', 'uint16', 'float32']
numeric_features = merged_data.select_dtypes(include=NUMERIC_DTYPES).columns
categorical_features = merged_data.select_dtypes(include=['object']).columns

class make_present_col_selector_class:
    """Callable column selector restricted to columns that exist in the frame.

    Built with a collection of wanted column names; calling the instance with a
    DataFrame returns, in the frame's own column order, the names that are both
    in the frame and in the wanted collection.
    """

    def __init__(self, selected_columns):
        # Stored untouched; membership is checked with `in` on every call.
        self.selected_columns = selected_columns

    def __call__(self, df):
        wanted = self.selected_columns
        present = []
        for column in df.columns:
            if column in wanted:
                present.append(column)
        return present

# Route the numeric and categorical column groups to their own transformers.
# Each selector keeps only those listed columns that are present in the frame
# it is called with.
# NOTE(review): `numeric_transformer` and `categorical_transformer` are not
# defined in any visible cell — confirm where they are meant to come from.
numeric_selector = make_present_col_selector_class(numeric_features)
categorical_selector = make_present_col_selector_class(categorical_features)
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_selector),
        ('categorical', categorical_transformer, categorical_selector),
    ])

In [74]:
# Candidate hyperparameters for the 'random_forest' pipeline step.
# A single-element list pins a value; a multi-element list is searched over.
grid_param = [{
     'random_forest': [RandomForestRegressor(max_depth=10, random_state=2)],
     # Listed once: a duplicated [True, True] makes every combination run twice.
     'random_forest__bootstrap': [True],
     'random_forest__ccp_alpha': [0.0],
     # 'criterion' and 'max_features' are deliberately left at the estimator
     # defaults. The literals 'mse' and 'auto' are rejected by current
     # scikit-learn releases, and in the releases that accepted them they were
     # equivalent to the RandomForestRegressor defaults (squared-error split
     # criterion, all features), so the search space is unchanged.
     'random_forest__max_depth': [5, 8, 15],
     'random_forest__min_impurity_decrease': [0.0],
     'random_forest__min_samples_leaf': [1, 5],
     'random_forest__min_samples_split': [2],
     'random_forest__min_weight_fraction_leaf': [0.0],
     'random_forest__n_estimators': [100],
     'random_forest__oob_score': [False, True],
     'random_forest__random_state': [2],
     'random_forest__verbose': [0],
     'random_forest__warm_start': [True, False]
}]

# Build the grid search over the pipeline (3-fold CV, all available cores).
# NOTE(review): no visible cell calls grid_search_pipeline.fit(...), so the
# search is configured here but never run.
grid_search_pipeline = GridSearchCV(
    model_pipeline, grid_param, cv=3, verbose=0, n_jobs=-1)

In [75]:
def generate_model_name(mse_score: float) -> str:
    """Build a timestamped model name that ends with the given score.

    Parameters
    ----------
    mse_score : float
        Error score to embed in the name.

    Returns
    -------
    str
        ``"DD-MM-YYYY-HH-MM-SS-<score>"``, using the current local time and the
        score written with four decimal places.
    """
    timestamp = datetime.now().strftime("%d-%m-%Y-%H-%M-%S-")
    # The score is an error magnitude, not a ratio, so write it as a plain
    # number. The previous percent format multiplied it by 100 and put a '%'
    # character into the name.
    return timestamp + "{:.4f}".format(mse_score)

In [79]:
# Regression-only pipeline; the preprocessing step is currently disabled.
regressor_step = ('regressor', RandomForestRegressor())
pipeline = Pipeline(steps=[
    # ('preprocessor', preprocessor),
    regressor_step,
])

In [None]:
# Persist the fitted pipeline, passing its test-set RMSE as the score.
# NOTE(review): `save_model` is not defined or imported in any visible cell —
# confirm where it comes from before running this.
save_model(best_model, test_metrics['RMSE Score'])

In [None]:
# Show the full parameter set of the fitted pipeline.
best_model.get_params()

# Saving Store Reference for prediction purpose

In [80]:
# Load the store table and hand the same DataFrame to each project helper.
# NOTE(review): this reads from '../data/' while the training table was read
# from '../data2/' — confirm both locations are intended.
store = pd.read_csv('../data/store.csv')
store_manipulatior = DataManipulator(store)
store_info = DataInfo(store)
# NOTE(review): store_cleaner is not used in any visible cell.
store_cleaner = DataCleaner(store)

In [81]:
# Overall missing-value summary before any imputation.
store_info.get_missing_description()

The total number of missing values is 2343
21.01 % missing values.


In [82]:
# Missing-value count and percentage for each column.
store_info.get_column_based_missing_percentage()

Unnamed: 0,total_missing_values,missing_percentage
Store,0,0.0 %
StoreType,0,0.0 %
Assortment,0,0.0 %
CompetitionDistance,3,0.27 %
CompetitionOpenSinceMonth,354,31.75 %
CompetitionOpenSinceYear,354,31.75 %
Promo2,0,0.0 %
Promo2SinceWeek,544,48.79 %
Promo2SinceYear,544,48.79 %
PromoInterval,544,48.79 %


In [83]:
# Fill missing numeric values. Going by the helper's name this uses each
# column's maximum; its implementation is not visible here.
store_manipulatior.fill_columns_with_max(store_info.get_numeric_columns())
# Fill non-numeric (categorical) values. Going by the helper's name this uses
# each column's most frequent value.
store_manipulatior.fill_columns_with_most_frequent(store_info.get_object_columns())

In [84]:
# Re-check after imputation; the output below reports zero missing values.
store_info.get_missing_description()

The total number of missing values is 0
0.0 % missing values.


In [85]:
# Label-encode the object columns. The cell output shows the call returning a
# dict that maps each encoded column name to a LabelEncoder.
# NOTE(review): that dict is only displayed, not stored — keep a reference to it
# if the same encoding has to be reproduced at prediction time.
store_manipulatior.label_columns(store_info.get_object_columns())


{'StoreType': LabelEncoder(),
 'Assortment': LabelEncoder(),
 'PromoInterval': LabelEncoder()}

In [86]:
# Check dtypes and non-null counts after filling and label encoding.
store.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Store                      1115 non-null   int64  
 1   StoreType                  1115 non-null   int32  
 2   Assortment                 1115 non-null   int32  
 3   CompetitionDistance        1115 non-null   float64
 4   CompetitionOpenSinceMonth  1115 non-null   float64
 5   CompetitionOpenSinceYear   1115 non-null   float64
 6   Promo2                     1115 non-null   int64  
 7   Promo2SinceWeek            1115 non-null   float64
 8   Promo2SinceYear            1115 non-null   float64
 9   PromoInterval              1115 non-null   int32  
dtypes: float64(5), int32(3), int64(2)
memory usage: 74.2 KB


# Saving Model Column Order Information For Prediction Later

In [88]:
# Column Inputs Orders
# Record the column order of the frame the model was fitted on. The split cell
# defines X_train (capital X); a lowercase x_train is never defined.
results.add_data('model_input_columns', X_train.columns.to_list())

In [None]:
#results.save_data('../models/column_reference.pkl')