In [70]:
import logging

import pandas as pd
import numpy as np
from math import sqrt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
#from sklearn.grid_search import GridSearchCV

from urllib.parse import urlparse
import mlflow 
import mlflow.sklearn 

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

In [71]:
from sklearn.base import BaseEstimator, TransformerMixin

In [75]:
#Get datetime features
class DateExtractor(BaseEstimator,TransformerMixin):
    def __init__(self):
        print('Pre processor initiated')
    
    def fit(self,X,y):
        return self
    
    def transform(self,x):
        dataset = x.copy()
        
        dataset['month'] = dataset.index.month
        dataset['year'] = dataset.index.year
        dataset['day'] = dataset.index.day
        dataset['weekofYear'] = dataset.index.weekofyear
        dataset.reset_index(inplace=True)
        dataset.drop(['Date','Sales'], axis=1, inplace=True)
        
        dataset = pd.get_dummies(dataset)
        return dataset

In [76]:
def rmse(x, y):
    return sqrt(mean_squared_error(x, y))

# definte MAPE function
def mape(x, y): 
    return np.mean(np.abs((x - y) / x)) * 100  
  

In [84]:
try:
    train = pd.read_csv(r"..\data\train.csv",parse_dates = True, low_memory = False,index_col = 'Date')
    store = pd.read_csv(r'..\data\store.csv',low_memory = False)
    test = pd.read_csv(r'..\data\test.csv',low_memory = False)
except Exception as e:
    logger.exception("Unable to load csv file", e)

with mlflow.start_run():
    X = train.copy()
    y= train.Sales
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    model_pipeline = Pipeline(
            steps = [
                ('datetime_features', DateExtractor()),
                ('rdf',RandomForestRegressor(n_estimators=30))
            ]
    )
    parameters = {
    'rdf__max_depth': (150, 155, 160),
    'rdf__min_samples_split': (1, 2, 3),
    'rdf__min_samples_leaf': (1, 2, 3)
    }
     
    model_pipeline.fit(X_train,y_train)
    y_pred = model_pipeline.predict(X_test)
    print("Regresion Model Score" , ":" , model_pipeline.score(X_train, y_train) , "," ,
          "Out of Sample Test Score" ,":" , model_pipeline.score(X_test, y_test))
            
        
    mlflow.log_metric("r", model_pipeline.score(X_test, y_test))
    mlflow.log_metric("rmse", rmse(y_test, y_pred))
    mlflow.log_metric("mape", mape(y_test, y_pred))
        
    mlflow.sklearn.log_model(model_pipeline, "model")


Pre processor initiated


  dataset['weekofYear'] = dataset.index.weekofyear
  dataset['weekofYear'] = dataset.index.weekofyear
  dataset['weekofYear'] = dataset.index.weekofyear
  dataset['weekofYear'] = dataset.index.weekofyear


Regresion Model Score : 0.993016919266062 , Out of Sample Test Score : 0.9541499810823433


  dataset['weekofYear'] = dataset.index.weekofyear
