# Working with MLflow and prediction

In [6]:
# Standard-library modules needed for path setup, warning control and logging.
import logging
import os
import sys
import warnings

# Expose the sibling ``scripts`` directory (relative to the working directory)
# to the import system.
sys.path.append(os.path.abspath('../scripts'))

# Silence library warnings for the whole notebook session.
warnings.filterwarnings('ignore')

# Root log level comes from the LOGLEVEL environment variable, INFO if unset.
logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import mlflow
import mlflow.sklearn
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
# To evaluate end result we have
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [10]:

# 
# Put the parent of the working directory on sys.path.
sys.path.append(os.path.abspath(os.path.join('..')))

# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers. If the first cell's basicConfig() call has already run, the WARN
# level requested here is therefore not applied.
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

# Get url from DVC
import dvc.api

# File tracked by DVC, the repository that tracks it, and the data revision.
path = 'data/train.csv'
# The repository location is machine-specific: set ROSSMANN_DVC_REPO to point
# at another checkout. The original hard-coded path remains the default.
repo = os.environ.get(
    'ROSSMANN_DVC_REPO',
    'C:/Users/Ekubay/Documents/Rossmann_Pharmaceutical_sales_Prediction',
)
rev = 'ver_4'
data_url = dvc.api.get_url(path=path, repo=repo, rev=rev)

# All runs started after this point are recorded under the 'prediction' experiment.
mlflow.set_experiment('prediction')

def eval_metrics(actual, pred):
    """Score predictions against ground truth.

    Parameters
    ----------
    actual : array-like
        Observed target values.
    pred : array-like
        Predicted target values, aligned with ``actual``.

    Returns
    -------
    tuple
        ``(rmse, mae, r2)``: the square root of ``mean_squared_error``, the
        ``mean_absolute_error`` and the ``r2_score`` of the two inputs.
    """
    squared_error = mean_squared_error(actual, pred)
    rmse = np.sqrt(squared_error)
    absolute_error = mean_absolute_error(actual, pred)
    return rmse, absolute_error, r2_score(actual, pred)



In [26]:
# train model
def train_model(X, Y, model_type='LinearRegression'):
    """Cross-validate a regressor with 5-fold ``KFold``, one MLflow run per fold.

    Parameters
    ----------
    X : pandas DataFrame or array-like
        Feature matrix. Rows are selected by position for every fold.
    Y : pandas DataFrame/Series or array-like
        Targets, aligned row-for-row with ``X``.
    model_type : str, default 'LinearRegression'
        ``'RandomForest'`` trains ``RandomForestRegressor(n_estimators=100,
        max_depth=5, random_state=0)``. Any other value trains
        ``LinearRegression``. The value is also used as the MLflow run name
        and as the prefix of the logged model's artifact path.

    Returns
    -------
    None
        Each fold's score is printed and logged to MLflow.
    """
    if model_type == 'RandomForest':
        model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=0)
    else:
        model = LinearRegression()

    # Only the two known model types have a score key. For any other value the
    # fallback LinearRegression is trained and logged without a score entry.
    score_key = {'LinearRegression': 'LR-Score', 'RandomForest': 'RF-Score'}.get(model_type)

    def take_rows(data, rows):
        # KFold yields positional row indices. Plain ``DataFrame[rows]`` would
        # interpret them as column labels, so pandas objects go through .iloc.
        if hasattr(data, 'iloc'):
            return data.iloc[rows]
        return np.asarray(data)[rows]

    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(X):
        # The context manager ends the nested run on exit. No explicit
        # mlflow.end_run() is issued inside it, because a second end_run on
        # exit could close an enclosing parent run.
        with mlflow.start_run(run_name=model_type, nested=True):
            X_train, X_test = take_rows(X, train_index), take_rows(X, test_index)
            y_train, y_test = take_rows(Y, train_index), take_rows(Y, test_index)

            model.fit(X_train, y_train)
            score = model.score(X_test, y_test)
            print(f"Prediction Score of the {model_type} is {round(score * 100, 2)}%")

            # The score is an outcome of the run, so it is stored as a metric
            # (numeric, comparable across runs) rather than as a parameter.
            if score_key is not None:
                mlflow.log_metric(score_key, score)

            mlflow.sklearn.log_model(model, model_type + ' Model')

    print()

In [31]:
# Train a random-forest baseline on a sample of the DVC-tracked data and
# record parameters, score and column lists in MLflow.
warnings.filterwarnings("ignore")
# Neither .sample() nor train_test_split below is given a random_state, so
# both draw from NumPy's global RNG, which is seeded here.
np.random.seed(40)

# Read the training data from the URL resolved through DVC
df_train = pd.read_csv(data_url)
#df_train = get_data('ver_4')

# 'Sales' is the prediction target; every other listed column is a model input.
feature_columns = ['Store', 'DayOfWeek', 'Open', 'Promo',  'SchoolHoliday', 'Day', 'WeekOfYear','Month', 'Year', 'StoreType',
              'Assortment','CompetitionDistance', 'Promo2']
columns = ['Sales'] + feature_columns

# Work on a random sample, capped at the number of rows actually available
# (DataFrame.sample raises if asked for more rows than exist).
sample_size = min(1000, len(df_train))
sampled_df = df_train[columns].sample(sample_size)

# Separate features and target. The feature frame is copied so that the
# encoded columns below are written to an independent object, not to a slice
# of sampled_df.
train_x = sampled_df[feature_columns].copy()
train_y = sampled_df[['Sales']]

# Encode the two categorical columns as integers, one encoder per column so
# each keeps its own fitted classes.
encoders = {}
for categorical_col in ('StoreType', 'Assortment'):
    encoders[categorical_col] = LabelEncoder()
    train_x[categorical_col] = encoders[categorical_col].fit_transform(train_x[categorical_col])

# Scale the target to [0, 1]; the features are used unscaled.
y_scaler = MinMaxScaler()
X = train_x
Y = y_scaler.fit_transform(train_y)

# Split the data into training and test sets (0.8, 0.2).
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# Everything logged below belongs to one explicit run, which is closed (and
# marked failed on error) when the block exits.
with mlflow.start_run():
    # Log data params
    mlflow.log_param('data_url', data_url)
    mlflow.log_param('data_version', rev)
    mlflow.log_param('input_rows', df_train.shape[0])
    mlflow.log_param('input_cols', df_train.shape[1])

    # The scaled Sales target is continuous, so a regressor is trained.
    # y_train is a single-column 2-D array; it is flattened for fitting.
    rf_reg = RandomForestRegressor(n_estimators=100, max_depth=25, random_state=0)
    rf_reg.fit(X_train, y_train.ravel())

    # score of the model (R^2 for a regressor) on the held-out split
    score = rf_reg.score(X_test, y_test)
    mlflow.log_metric('r2_score', score)
    print(f"Prediction Score of the Model is {round(score * 100, 2)}%")

    # Store the feature and target column names next to the run.
    cols_x = pd.DataFrame(list(train_x.columns))
    cols_x.to_csv('features.csv', header=False, index=False)
    mlflow.log_artifact('features.csv')

    cols_y = pd.DataFrame(list(train_y.columns))
    cols_y.to_csv('targets.csv', header=False, index=False)
    mlflow.log_artifact('targets.csv')
        

### Running the MLflow UI from the notebook

In [None]:
# Launch the MLflow tracking UI through a shell escape.
# NOTE(review): `mlflow ui` starts a long-running server, so this cell
# presumably keeps the kernel busy until it is interrupted. Confirm, and
# consider running the command from a separate terminal instead.
!mlflow ui

In [28]:
#print(f"Prediction Score of the Model is {round(score * 100, 2)}%")
