In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import mlflow
import pickle
import logging
import sys,os,datetime
sys.path.insert(0,'../scripts/')
from data_fetch import get_data
from loss_function import mae,rmse

In [2]:
# Append log records of level DEBUG and above to the log file, UTF-8 encoded.
logging.basicConfig(
    level=logging.DEBUG,
    filename='../log/log.log',
    filemode='a',
    encoding='utf-8',
)

# Make this the active MLflow experiment. Kept as the last expression so the
# returned Experiment object is echoed as the cell output.
mlflow.set_experiment("Predicting sales using Random Forest Regressor")

<Experiment: artifact_location='file:///C:/Users/User/Desktop/Pharmaceutical-Sales-Prediction/notebooks/mlruns/2', experiment_id='2', lifecycle_stage='active', name='Predicting sales using Random Forest Regressor', tags={}>

In [3]:
# Resolve the repository root from the notebook's working directory instead of
# hard-coding one machine's absolute path. The other relative paths in this
# notebook ('../scripts/', '../log/', '../models/') already assume the kernel
# runs one level below the root. Forward slashes keep the value in the same
# form as the literal it replaces.
# NOTE(review): assumes get_data accepts any local path to the repository root
# as its second argument - confirm against scripts/data_fetch.py.
repo_root = os.path.abspath(os.pardir).replace(os.sep, '/')

# Single source for the data version, so the value logged to MLflow cannot
# drift from the one actually requested.
data_version = 'train_processed_v1'

train_data = get_data('data/train_processed.csv', repo_root, data_version)
mlflow.log_param('Data version', data_version)
mlflow.log_param('Model', 'Random Forest Regressor')


In [4]:
# Index the frame by date. The guard makes the cell safe to re-run: once
# 'Date' has become the index it is no longer a column, and an unconditional
# set_index('Date') would raise KeyError on the second execution.
if 'Date' in train_data.columns:
    train_data = train_data.set_index('Date')

# Target is the 'Sales' column; every remaining column is used as a feature.
# NOTE(review): the feature set includes an 'Unnamed: 0' column, which looks
# like a row index written into the CSV by an earlier export - confirm whether
# it should really be a model input.
y = train_data['Sales']
x = train_data.drop('Sales', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [5]:
# Hyperparameters for the forest; each one is also recorded on the MLflow run below.
n_estimators, max_depth, random_state = 35, 20, 5

# Two-step pipeline: standardise the features, then fit a random forest on the
# scaled values.
forest = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    random_state=random_state,
)
pipeline = Pipeline(steps=[('preprocessor', StandardScaler()), ('model', forest)])

# Log the hyperparameters under the same display names, in the same order.
for param_name, param_value in (
    ('Number of estimators', n_estimators),
    ('Max depth', max_depth),
    ('Random state', random_state),
):
    mlflow.log_param(param_name, param_value)

In [6]:
# Train the scaler + forest pipeline on the training split.
random_forest_model = pipeline.fit(X=X_train, y=y_train)

In [7]:
# Evaluate on the held-out split and record both metrics on the MLflow run.
Score = random_forest_model.score(X_test, y_test)
y_pred = random_forest_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mlflow.log_metric('Score', Score)
mlflow.log_metric('Mean Squared Error', mse)

# Write a small text report (the column listing of train_data, which still
# contains the 'Sales' target) and attach the whole directory as run artifacts.
report = str(train_data.columns)
report_dir = "Random Forest Regression reports"
# exist_ok=True replaces the separate exists() check and is safe on re-runs.
os.makedirs(report_dir, exist_ok=True)
with open(os.path.join(report_dir, "report.txt"), "w", encoding="utf-8") as f:
    f.write(report)
mlflow.log_artifacts(report_dir)

In [9]:
# Pair every training column with the importance reported by the fitted forest
# (the 'model' step of the pipeline) and show the resulting table.
forest_step = random_forest_model.named_steps["model"]
importance = forest_step.feature_importances_
fi_df = pd.DataFrame({
    'feature': X_train.columns.to_list(),
    'feature_importances': importance,
})
fi_df

Unnamed: 0,feature,feature_importances
0,Unnamed: 0,0.011043
1,Store,0.004405
2,DayOfWeek,0.004616
3,Customers,0.861061
4,Open,0.0
5,Promo,0.018134
6,StateHoliday,0.000101
7,SchoolHoliday,0.000217
8,Year,0.000625
9,Month,0.002336


In [None]:
# Persist the fitted pipeline under a timestamped file name so successive runs
# do not overwrite each other.
models_dir = '../models/'
os.makedirs(models_dir, exist_ok=True)  # a missing directory would make open() fail
random_forest_model_path = models_dir + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + '.pkl'

# The context manager closes (and therefore flushes) the file even if pickling
# raises; the previous bare open() left the handle to the garbage collector.
with open(random_forest_model_path, 'wb') as model_file:
    pickle.dump(random_forest_model, model_file)