In [3]:
""" UNCOMMENT THE FOLLOWING LINES IN ORDER TO INSTALL THE DEPENDENCIES"""
#!pip install numpy
# !pip install plotly
#!pip install pandas
#!pip install sklearn

"""
Import pandas to handle and analyse data
Import numpy to perform complex calculations on data
"""
import pandas
import numpy
import scipy

"""
Import datetime module to process datestamps
"""
from datetime import date, timedelta

"""
Import os to walk through paths or make directories
"""
import os

"""
Import plotly library to plot graphs
"""
import plotly.express as px
import plotly.io as pio
pandas.options.plotting.backend = "plotly"

"""
To evaluate the error and loss
"""
from sklearn import metrics

"""
Importing Machine Learning Models
"""
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import svm
from sklearn.neural_network import MLPRegressor

"""
Save Machine Learning Model locally
"""
import pickle

"""
Import warning module to ignore the future warnings
"""
import warnings
warnings.filterwarnings('ignore')

"""
Time module to calculate the time required for training
"""
import time

import calendar
calendar.setfirstweekday(6)

print("Libraries has been loaded successfully")

Libraries has been loaded successfully


In [4]:
"""
Load data from provided csv file
"""
raw = pandas.read_csv(r"./Raw Data.csv")
print("Data has been loaded successfully")

"""
Select important features only
"""
raw = raw[['date', 'entry_id', 'item_id', 'item_unit_id', 'price_min', 'price_max', 'price_avg', 'period', 'currency']]
print("Columns has been selected successfully")

"""
Rename the columns here
to keep the same variable structure
"""
raw = raw.rename(columns={'date':'Date',
                          'entry_id': 'Entry ID',
                          'item_id': 'Item ID',
                          'item_unit_id': "Item Unit ID",
                          'price_min': 'Price Min',
                          'price_max': 'Price Max',
                          'price_avg': 'Price Avg',
                          'period': 'Period',
                          'currency':"Currency"
                         })
print("Renamed successfully")

"""
Extract distinct Entry ID of each product
"""
entryID = raw['Entry ID'].unique().tolist()
print("Gathered Unique Entry ID successfully")

"""
Groupy raw data by Entry ID
Sort by Date stamp
"""
raw = raw.sort_values(["Entry ID", "Date"],ascending=True)
print("Sorted successfully")

Data has been loaded successfully
Columns has been selected successfully
Renamed successfully
Gathered Unique Entry ID successfully
Sorted successfully


In [6]:
"""
Union between two dataframes
Union based of specific column between dataframes
"""
def mergeRight(DF1, DF2, targetColumn):
    return pandas.merge(DF1, DF2, on = targetColumn, how = "right")

"""
Intersection between two dataframes
Intersection based of common column between dataframes
"""
def mergeLeft(DF1, DF2, targetColumn):
    return pandas.merge(DF1, DF2, on = targetColumn, how = "left")

"""
Generate the range of date stamps and type is dataframeProcess with start and end dates
interval: day (d), week(w), month(m)
"""
def getDateList(start_date, end_date, frequency):
    if frequency == "d" or frequency == "w" or frequency == "m":
        return pandas.DataFrame(pandas.date_range(start_date,
                                                  end_date,
                                                  freq = frequency) + timedelta(days=1)).rename(columns={0:'Date'}).astype(str)

    else:
        print("ERROR IN GENERATING DATES LIST: WRONG FREQUENCY PARAMETER")

"""
Collect missing dates data and add it into raw data
It also returns the percentage of how much missing datestamp in the data 
"""     
def collectMissingData(data, start_date, end_date, frequency):
    try:
        dateRanges = getDateList(start_date, end_date, frequency)
        data["Date"] = data["Date"].astype(str)
        LM = mergeLeft(dateRanges, data["Date"].astype(str), "Date")
        RM = mergeRight(data, LM, "Date")
        return RM
    except Exception as e:
        print("ERROR IN COLLECTING MISSING DATE INFORMATIONS: " + str(e))

In [13]:
"""
Generates the series dataset for forecasting
Note: 1D shape data should be converted to list before calling the function for that data or reshape the array/dataframe to (-1,1)
Note: Greater than 1D dataframe/array processed normally No need to reshape
"""    
def series_to_supervised(data, n_in=1, n_out=1, dropnan=False):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pandas.DataFrame(data)
    cols, names = list(), list()
    """
    input sequence (t-n, ... t-1)
    """
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    """
    forecast sequence (t, t+1, ... t+n)
    """
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    """
    put it all together
    """
    agg = pandas.concat(cols, axis=1)
    agg.columns = names
    
    """
    drop rows with NaN values
    """
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [14]:
"""
Create directory over path provided
Check if the directory exists, otherwise create it
"""
def makeDir(path):
    if not os.path.exists(path):
        os.makedirs(path)

"""
Calculate percentage
"""
def calculatePercentage(obtain, total):
    return (obtain/total)*100

"""
Calculate moving average 
Process only one column of dataframe
"""
def movingAverage(data, window):
    return data.rolling(window, min_periods=1).mean()

"""
Function for applying scaling
Min-Max Scaling
"""
def minmax_scale(dataframe, Min, Max):
    return (dataframe - Min) / (Max - Min)

"""
Function for applying reverse scaling
Inverse Min-Max Scaling
"""
def reverse_minmax_scale(dataframe, Min, Max):
    return (dataframe * (Max - Min)) + Min

"""
Return the Mean Square Error
"""
def meanSquareError(true, predicted):
    return metrics.mean_squared_error(true, predicted)

"""
Return the Mean Absolute Error
"""
def meanAbsouluteError(true, predicted):
    return metrics.mean_absolute_error(true, predicted)

"""
Return the Mean Absolute Percentage Error
"""
def meanAbsolutePercentageError(true, predicted):
    return metrics.mean_absolute_percentage_error(true, predicted)

"""
Evalute model results and store into "Results" List
"""
def model_result(Model, trainX, trainY, testX, testY):
    Train_mse = meanAbsolutePercentageError(trainY, Model.predict(trainX))
    Test_mse = meanAbsolutePercentageError(testY, Model.predict(testX))
    return Train_mse, Test_mse

"""
Save model weigths to disk
"""
def dumpModel(path, model):
    pickle.dump(model, open(path, 'wb'))
    
def getMissingDataPercentage(data):
    missing = data[data.columns.tolist()[-1]].isnull().sum()
    total = len(data)
    percentage = calculatePercentage(missing, total)
    return percentage

def missingPercentage(data):
    initDate = dateInit(data)
    data = data[data["Date"] >= initDate]
    missing = data[data.columns.tolist()[-1]].isnull().sum()
    total = len(data)
    percentage = calculatePercentage(missing, total)
    return percentage

def dateInit(data):
    return data["Date"][data.index == data["Date"][~data[data.columns.tolist()[-1]].isnull()].index[0]].iloc[0]

def get_week_of_month(year, month, day):
    x = pandas.DataFrame(calendar.monthcalendar(year, month))
    x = x[x == day].dropna(axis = 0, how = 'all')
    return x.index

In [15]:
"""
Create Linear Regression fiiting function, it returns the fitted model along with training and testing loss
"""
def linearRegression(trainX, trainY, testX, testY):
    model = LinearRegression(copy_X= True, fit_intercept= True, positive=False)
    model.fit(trainX,trainY)
    trainLoss, testLoss = model_result(model, trainX, trainY, testX, testY)
    return trainLoss, testLoss, model

"""
Create Decision Tree fiiting function, it returns the fitted model along with training and testing loss
"""
def DTree(trainX, trainY, testX, testY):
    model = DecisionTreeRegressor()
    model.fit(trainX,trainY)
    trainLoss, testLoss = model_result(model, trainX, trainY, testX, testY)
    return trainLoss, testLoss, model

"""
Create Random Forest fiiting function, it returns the fitted model along with training and testing loss
"""
def RFTree(trainX, trainY, testX, testY):
    model = RandomForestRegressor()
    model.fit(trainX,trainY)
    trainLoss, testLoss = model_result(model, trainX, trainY, testX, testY)
    return trainLoss, testLoss, model

"""
Create Bagging Regressor fiiting function, it returns the fitted model along with training and testing loss
"""
def BRegressor(trainX, trainY, testX, testY):
    model = BaggingRegressor()
    model.fit(trainX,trainY)
    trainLoss, testLoss = model_result(model, trainX, trainY, testX, testY)
    return trainLoss, testLoss, model

"""
Create Extra Trees Regressor fiiting function, it returns the fitted model along with training and testing loss
"""
def EXTree(trainX, trainY, testX, testY):
    model = ExtraTreesRegressor()
    model.fit(trainX,trainY)
    trainLoss, testLoss = model_result(model, trainX, trainY, testX, testY)
    return trainLoss, testLoss, model

"""
Create Support Vector Machine Regressor fiiting function, it returns the fitted model along with training and testing loss
"""
def SVMReg(trainX, trainY, testX, testY):
    model = svm.SVR()
    model.fit(trainX,trainY)
    trainLoss, testLoss = model_result(model, trainX, trainY, testX, testY)
    return trainLoss, testLoss, model

"""
Create k-nearest neighbors Regressor fiiting function, it returns the fitted model along with training and testing loss
"""
def KNN(trainX, trainY, testX, testY):
    model = KNeighborsRegressor()
    model.fit(trainX,trainY)
    trainLoss, testLoss = model_result(model, trainX, trainY, testX, testY)
    return trainLoss, testLoss, model

"""
Create MLP fiiting function, it returns the fitted model along with training and testing loss
"""
def NeuralNetwork(trainX, trainY, testX, testY):
    model = MLPRegressor(hidden_layer_sizes=(10,5),
                         random_state=1,
                          activation='relu',
                          solver='adam',
                          max_iter=2000
                         )
    model.fit(trainX,trainY)
    trainLoss, testLoss = model_result(model, trainX, trainY, testX, testY)
    return trainLoss, testLoss, model

In [16]:
"""
*** IMPORTANT ***

CREATE THE NEW LINE ABOVE AND ADD MORE MACHINE LEARNING MODELS FUNCTION, SIMILAR TO ABOVE PATTERN
AFTER CREATING NEW MACHINE LEARNING MODEL FUNCTIONS, ADD ITS NAME AND FUNCTION NAME TO MODEL LIST BELOW
"""
modelList = {
             #"Multivariate Linear Regression": "linearRegression",
             #"Decission Tree": "DTree",
             #"Random Forest": "RFTree",
             #"Bagging Regressor": "BRegressor",
             #"Extra Trees Regressor": "EXTree",
             "K-Nearest Neighbors":"KNN",
             "Support Vector Regressor": "SVMReg",
             "Neural Network": "NeuralNetwork"
            }
"""
YOU CAN COMMENT OR UNCOMMENT THE LINES IN MODEL LIST, IT WILL DECIDE WHETHER TO RUN THAT MODEL FITTING OR NOT

*** IMPORTANT ***
"""

'\nYOU CAN COMMENT OR UNCOMMENT THE LINES IN MODEL LIST, IT WILL DECIDE WHETHER TO RUN THAT MODEL FITTING OR NOT\n\n*** IMPORTANT ***\n'

In [17]:
"""
Variables to initialize the date stamp generation
"""
start_date = date(2019, 1, 1)
end_date = date.today()
frequency = "w"

In [18]:
%%time
"""
Create dataframe of data ranges between start and end date over frequency/interval of week
"""
dataMin = getDateList(start_date, end_date, frequency)
dataMax = getDateList(start_date, end_date, frequency)
dataAvg = getDateList(start_date, end_date, frequency)
"""
Create History dataframe to store important information about each entry ID
"""
History = pandas.DataFrame(columns = ["Entry ID", 
                                      "Item ID",
                                      "Item Unit ID",
                                      "Period",
                                      "Currency",
                                      "Price Min: Max",
                                      "Price Max: Max",
                                      "Price Avg: Max"
                                     ])

"""
Extract important information for each entry
1. Minimum price
2. Maximum price
3. Average price
3. Collect missing dates and add them to each entry
4. Pivot the dataframe to apply further pre-processing cleanly
"""
for _ in entryID:
    temp = raw[["Date", "Price Min"]][raw["Entry ID"] == _]
    temp1 = raw[["Date", "Price Max"]][raw["Entry ID"] == _]
    temp2 = raw[["Date", "Price Avg"]][raw["Entry ID"] == _]
    temp = collectMissingData(temp, start_date, end_date, frequency)
    temp1 = collectMissingData(temp1, start_date, end_date, frequency)
    temp2 = collectMissingData(temp2, start_date, end_date, frequency)
    temp = temp.rename(columns={"Price Min": _ })
    temp1 = temp1.rename(columns={"Price Max": _ })
    temp2 = temp2.rename(columns={"Price Avg": _ })
    alpha = pandas.DataFrame([_,
                              raw["Item ID"][raw["Entry ID"] == _].unique()[0],
                              raw["Item Unit ID"][raw["Entry ID"] == _].unique()[0],
                              raw["Period"][raw["Entry ID"] == _].unique()[0],
                              raw["Currency"][raw["Entry ID"] == _].unique()[0],
                              temp[_].max(),
                              temp1[_].max(),
                              temp2[_].max(),
                             ]).T.rename(columns={0:"Entry ID",
                                                  1:"Item ID",
                                                  2:"Item Unit ID",
                                                  3:"Period",
                                                  4:"Currency",
                                                  5:"Price Min: Max",
                                                  6:"Price Max: Max",
                                                  7:"Price Avg: Max"})
    History = pandas.concat([History, alpha],axis=0)
    dataMin = mergeRight(dataMin, temp, "Date")
    dataMax = mergeRight(dataMax, temp1, "Date")
    dataAvg = mergeRight(dataAvg, temp2, "Date")

CPU times: user 23min 41s, sys: 12 s, total: 23min 52s
Wall time: 23min 52s


In [19]:
"""
Clear the memory: Raw data is no more required for further process
"""
raw = None

In [20]:
%%time
History["Missing Percentage From Max Count"] = None
History["Grade"] = None
for _ in entryID:
    percentage = getMissingDataPercentage(dataMin[["Date", _]])
    History["Missing Percentage From Max Count"][History["Entry ID"] == _] = percentage
History["Grade"][History["Missing Percentage From Max Count"] <= 60 ] = "B"
History["Grade"][History["Missing Percentage From Max Count"] <= 30 ] = "A"
History["Grade"][History["Missing Percentage From Max Count"] > 60 ] = "C"

"""
Add another column in History, the safe max is used in min-max scaling
After model training there must be some levarge to process rising values
"""
History["Price Min: Safe Max"] = History["Price Min: Max"] * 1.2
History["Price Max: Safe Max"] = History["Price Max: Max"] * 1.2

"""
Add more columns to History to store train and test dataset paths
"""
History["Trainable"] = False

History[["Price Min: Train X",
         "Price Min: Train Y",
         "Price Min: Test X",
         "Price Min: Test Y"]] = None

History[["Price Max: Train X",
         "Price Max: Train Y",
         "Price Max: Test X",
         "Price Max: Test Y"]] = None

"""
Process to create train and test datasets and store them locally while keeping there paths to history dataframe
"""
TrainSplit = 0.7

PriceMinTrainPathX = "Dataset/Price Min Dataset/Train X"
PriceMinTrainPathY = "Dataset/Price Min Dataset/Train Y"
PriceMinTestPathX = "Dataset/Price Min Dataset/Test X"
PriceMinTestPathY = "Dataset/Price Min Dataset/Test Y"

PriceMaxTrainPathX = "Dataset/Price Max Dataset/Train X"
PriceMaxTrainPathY = "Dataset/Price Max Dataset/Train Y"
PriceMaxTestPathX = "Dataset/Price Max Dataset/Test X"
PriceMaxTestPathY = "Dataset/Price Max Dataset/Test Y"

makeDir(PriceMinTrainPathX)
makeDir(PriceMinTrainPathY)
makeDir(PriceMinTestPathX)
makeDir(PriceMinTestPathY)

makeDir(PriceMaxTrainPathX)
makeDir(PriceMaxTrainPathY)
makeDir(PriceMaxTestPathX)
makeDir(PriceMaxTestPathY)

#Run loop over all entries
for _ in entryID:
    #Select date column from data
    tempMin = pandas.DataFrame(dataMin["Date"])
    tempMax = pandas.DataFrame(dataMax["Date"])

    tempMin["Month"] = tempMin["Date"].str.split("-").str[1].astype(int)/12
    tempMin["Month"] = tempMin["Month"].round(decimals=2)
    tempMin["Week Number"] = pandas.DataFrame (get_week_of_month(int(j.split("-")[0]),
                            int(j.split("-")[1]),
                            int(j.split("-")[2])
                            )/5 for j in tempMin["Date"])
    
    tempMax["Month"] = tempMax["Date"].str.split("-").str[1].astype(int)/12
    tempMax["Month"] = tempMax["Month"].round(decimals=2)
    tempMax["Week Number"] = pandas.DataFrame (get_week_of_month(int(j.split("-")[0]),
                            int(j.split("-")[1]),
                            int(j.split("-")[2])
                            )/5 for j in tempMax["Date"])
    
    #Select safe max for each entry
    priceMinMax = History["Price Min: Safe Max"][History["Entry ID"] == _]
    priceMaxMax = History["Price Max: Safe Max"][History["Entry ID"] == _]
    
    #Apply scaling over price and covert to series to supervised as timeseries object
    tempMin[["T-3", "T-2", "T-1", "T"]] = series_to_supervised(minmax_scale(pandas.DataFrame(dataMin[_]), 0, priceMinMax.iloc[0]), 3, 1)
    tempMax[["T-3", "T-2", "T-1", "T"]] = series_to_supervised(minmax_scale(pandas.DataFrame(dataMax[_]), 0, priceMaxMax.iloc[0]), 3, 1)

    
    #Delete all missing values
    tempMin = tempMin[tempMin['T-3'].notna()]
    tempMin = tempMin[tempMin['T-2'].notna()]
    tempMin = tempMin[tempMin['T-1'].notna()]
    tempMin = tempMin[tempMin['T'].notna()]
    
    tempMax = tempMax[tempMax['T-3'].notna()]
    tempMax = tempMax[tempMax['T-2'].notna()]
    tempMax = tempMax[tempMax['T-1'].notna()]
    tempMax = tempMax[tempMax['T'].notna()]

    #Shuffle the data and reset it indexes
    tempMin = tempMin.sample(n=len(tempMin), random_state=9).reset_index(drop=True)
    tempMax = tempMax.sample(n=len(tempMax), random_state=9).reset_index(drop=True)
    
    # Change to percentage or set threshold to 0, in order to consider all entries
    if len(tempMin) >= 10 and len(tempMax) >= 10:
        
        #Change Trainable instance to true
        History["Trainable"][History["Entry ID"] == _] = True
        
        #Extract features from entry data
        XMin = tempMin[tempMin.columns.tolist()[:-1]]
        XMax = tempMax[tempMax.columns.tolist()[:-1]]
        
        #Extract labels from entry data
        YMin = tempMin[tempMin.columns.tolist()[-1:]]
        YMax = tempMax[tempMax.columns.tolist()[-1:]]

        #Split data into 70% Training and 30% Testing, while storing them to disk 
        XMin[:int(len(XMin) * TrainSplit)].to_csv(PriceMinTrainPathX + "/" + str(int(_)) + ".csv",index=False)
        History["Price Min: Train X"][History["Entry ID"] == _] = PriceMinTrainPathX + "/" + str(int(_)) + ".csv"
        YMin[:int(len(YMin) * TrainSplit)].to_csv(PriceMinTrainPathY + "/" + str(int(_)) + ".csv",index=False)
        History["Price Min: Train Y"][History["Entry ID"] == _] = PriceMinTrainPathY + "/" + str(int(_)) + ".csv"
        XMin[int(len(XMin) * TrainSplit):].to_csv(PriceMinTestPathX + "/" + str(int(_)) + ".csv",index=False)
        History["Price Min: Test X"][History["Entry ID"] == _] = PriceMinTestPathX + "/" + str(int(_)) + ".csv"
        YMin[int(len(YMin) * TrainSplit):].to_csv(PriceMinTestPathY + "/" + str(int(_)) + ".csv",index=False)
        History["Price Min: Test Y"][History["Entry ID"] == _] = PriceMinTestPathY + "/" + str(int(_)) + ".csv"
        
        XMax[:int(len(XMax) * TrainSplit)].to_csv(PriceMaxTrainPathX + "/" + str(int(_)) + ".csv",index=False)
        History["Price Max: Train X"][History["Entry ID"] == _] = PriceMaxTrainPathX + "/" + str(int(_)) + ".csv"
        YMax[:int(len(YMax) * TrainSplit)].to_csv(PriceMaxTrainPathY + "/" + str(int(_)) + ".csv",index=False)
        History["Price Max: Train Y"][History["Entry ID"] == _] = PriceMaxTrainPathY + "/" + str(int(_)) + ".csv"
        XMax[int(len(XMax) * TrainSplit):].to_csv(PriceMaxTestPathX + "/" + str(int(_)) + ".csv",index=False)
        History["Price Max: Test X"][History["Entry ID"] == _] = PriceMaxTestPathX + "/" + str(int(_)) + ".csv"
        YMax[int(len(YMax) * TrainSplit):].to_csv(PriceMaxTestPathY + "/" + str(int(_)) + ".csv",index=False)
        History["Price Max: Test Y"][History["Entry ID"] == _] = PriceMaxTestPathY + "/" + str(int(_)) + ".csv"
        
    tempMin = None
    tempMax = None

dataMin = None
dataMax = None
dataAvg = None

History.to_csv("History.csv", index=False)


CPU times: user 48min 32s, sys: 7.56 s, total: 48min 40s
Wall time: 1h 13min 2s


In [None]:
Model = pandas.DataFrame(History["Entry ID"]).rename(columns={0:"Entry ID"})
for _ in modelList:
    Model["Price Min: " + _] = None
    Model["Price Min: " + _ + ": Train Loss"] = None
    Model["Price Min: " + _ + ": Test Loss"] = None
    Model["Price Min: " + _ + ": Train Time"] = None
    Model["Price Max: " + _] = None
    Model["Price Max: " + _ + ": Train Loss"] = None
    Model["Price Max: " + _ + ": Test Loss"] = None
    Model["Price Max: " + _ + ": Train Time"] = None

In [None]:
%%time
"""
Loop over considered models
"""
for _ in modelList:
    print("Training Model: " + _ )
    pathMin = "Model/" + _ + "/Price Min"
    pathMax = "Model/" + _ + "/Price Max"
    makeDir(pathMin)
    makeDir(pathMax)
    
    for i in History["Entry ID"][History["Trainable"] == True].astype(int).tolist():
        
        trainX = pandas.read_csv(History["Price Min: Train X"][History["Entry ID"] == i].tolist()[-1]).values
        trainY = pandas.read_csv(History["Price Min: Train Y"][History["Entry ID"] == i].tolist()[-1]).values
        testX = pandas.read_csv(History["Price Min: Test X"][History["Entry ID"] == i].tolist()[-1]).values
        testY = pandas.read_csv(History["Price Min: Test Y"][History["Entry ID"] == i].tolist()[-1]).values
        start_time = time.time()
        trainLoss, testLoss, weigths = locals()[modelList[_]](trainX[:,1:], trainY, testX[:,1:], testY)
        path = pathMin + "/" + str(i) + ".pkl"
        dumpModel(path, weigths)
        end_time = time.time() - start_time
        Model["Price Min: " + _][Model["Entry ID"] == i] = path
        Model["Price Min: " + _ + ": Train Loss"][Model["Entry ID"] == i] = trainLoss
        Model["Price Min: " + _ + ": Test Loss"][Model["Entry ID"] == i] = testLoss
        Model["Price Min: " + _ + ": Train Time"][Model["Entry ID"] == i] = end_time
        
        trainX = pandas.read_csv(History["Price Max: Train X"][History["Entry ID"] == i].tolist()[-1]).values
        trainY = pandas.read_csv(History["Price Max: Train Y"][History["Entry ID"] == i].tolist()[-1]).values
        testX = pandas.read_csv(History["Price Max: Test X"][History["Entry ID"] == i].tolist()[-1]).values
        testY = pandas.read_csv(History["Price Max: Test Y"][History["Entry ID"] == i].tolist()[-1]).values
        start_time = time.time()
        trainLoss, testLoss, weigths = locals()[modelList[_]](trainX[:,1:], trainY, testX[:,1:], testY)
        path = pathMax + "/" + str(i) + ".pkl"
        dumpModel(path, weigths)
        end_time = time.time() - start_time
        Model["Price Max: " + _][Model["Entry ID"] == i] = path
        Model["Price Max: " + _ + ": Train Loss"][Model["Entry ID"] == i] = trainLoss
        Model["Price Max: " + _ + ": Test Loss"][Model["Entry ID"] == i] = testLoss
        Model["Price Max: " + _ + ": Train Time"][Model["Entry ID"] == i] = end_time
print("Models has been trained succesfully, refer to History")
Model.to_csv("Model.csv", index=False)

In [None]:
Model = pandas.read_csv("Model.csv")
Results = list()
for _ in modelList:
    TrainMin = Model["Price Min: " + _ + ": Train Loss"].mean()
    TestMin = Model["Price Min: " + _ + ": Test Loss"].mean()
    TimeMin = Model["Price Min: " + _ + ": Train Time"].sum()
    TrainMax = Model["Price Max: " + _ + ": Train Loss"].mean()
    TestMax = Model["Price Max: " + _ + ": Test Loss"].mean()
    TimeMax = Model["Price Max: " + _ + ": Train Time"].sum()
    Results.append([_, TrainMin, TestMin, TimeMin, TrainMax, TestMax, TimeMax])

Results = pandas.DataFrame(Results).rename(columns = {0:"Model",
                                                      1:"Price Min: Train Loss",
                                                      2:"Price Min: Test Loss",
                                                      3:"Price Min: Train Time",
                                                      4:"Price Max: Train Loss",
                                                      5:"Price Max: Test Loss",
                                                      6:"Price Max: Train Time"})

for _ in Results.columns[Results.columns.str.contains("Loss")].tolist():
    Results[_][Results[_] > 1] = 1
Results

In [None]:
fig = px.bar(Results,
             x=Results["Model"],
             y=Results["Price Min: Train Loss"],
             labels=dict(x="Model", y="Loss", color="Labels"),
             color=px.Constant("Train Loss"),
             barmode='group',
             title = "Price Min: Train Loss vs Test Loss"
            )
fig.add_bar(x=Results["Model"],
            y=Results["Price Min: Test Loss"],
            name="Test Loss"
            )
fig.show()

In [None]:
fig = px.bar(Results,
             x=Results["Model"],
             y=Results["Price Max: Train Loss"],
             labels=dict(x="Date", y="Loss", color="Labels"),
             color=px.Constant("Train Loss"),
             barmode='group',
             title = "Price Max: Train Loss vs Test Loss"
            )
fig.add_bar(x=Results["Model"],
            y=Results["Price Max: Test Loss"],
            name="Test Loss"
            )
fig.show()

In [None]:
fig = px.bar(Results,
             x=Results["Model"],
             y=Results["Price Max: Train Time"],
             labels=dict(x="Model", y="Time", color="Labels"),
             barmode='group',
             title = "Training Time"
            )
fig.show()

In [None]:
Results.to_csv("Results.csv", index=False)

In [None]:
tempStorage = list()
path = "Analysis"
makeDir(path)
for _ in History["Entry ID"][History["Trainable"] == True]:
    Min = 0
    PriceMin_Max = History["Price Min: Safe Max"][History["Entry ID"] == _].iloc[0]
    PriceMax_Max = History["Price Max: Safe Max"][History["Entry ID"] == _].iloc[0]
    
    DataMin = pandas.read_csv(History["Price Min: Train X"][History["Entry ID"] == _].iloc[0])
    DataMax = pandas.read_csv(History["Price Max: Train X"][History["Entry ID"] == _].iloc[0])
    
    DataMin["T"] = pandas.read_csv(History["Price Min: Train Y"][History["Entry ID"] == _].iloc[0])
    temp = pandas.read_csv(History["Price Min: Test X"][History["Entry ID"] == _].iloc[0])
    temp["T"] = pandas.read_csv(History["Price Min: Test Y"][History["Entry ID"] == _].iloc[0])
    DataMin = pandas.concat([DataMin, temp])
    DataMin = DataMin[["Date", "T"]]
    temp = None
    
 
    DataMax["T"] = pandas.read_csv(History["Price Max: Train Y"][History["Entry ID"] == _].iloc[0])
    temp = pandas.read_csv(History["Price Max: Test X"][History["Entry ID"] == _].iloc[0])
    temp["T"] = pandas.read_csv(History["Price Max: Test Y"][History["Entry ID"] == _].iloc[0])
    DataMax = pandas.concat([DataMax, temp])
    DataMax = DataMax[["Date", "T"]]
    temp = None
    
    DataMin = DataMin.sort_values(["Date"],ascending=True).reset_index(drop=True)
    DataMax = DataMax.sort_values(["Date"],ascending=True).reset_index(drop=True)
    
    total = len(DataMin)
    Start = DataMin["Date"].iloc[0]
    Start = date(int(Start.split("-")[0]),
                 int(Start.split("-")[1]),
                 int(Start.split("-")[2]))
    End = date.today()
    Frequency = "w"
    
    DataMin = collectMissingData(DataMin, Start, End, Frequency)
    DataMax = collectMissingData(DataMax, Start, End, Frequency)
    
    DataMin["Predicted"] = numpy.nan
    DataMax["Predicted"] = numpy.nan
    
    LoadModelMin = pickle.load(open(Model["Price Min: Neural Network"][Model["Entry ID"] == _].iloc[0], 'rb'))
    LoadModelMax = pickle.load(open(Model["Price Max: Neural Network"][Model["Entry ID"] == _].iloc[0], 'rb'))
    
    MissingDataIndex = DataMin[DataMin["T"].isnull()].index.tolist()
    if len(MissingDataIndex) > 0:
        for i in MissingDataIndex:
            Date = DataMin["Date"][DataMin.index == i].iloc[0]
            MonthNumber = round(int(Date.split("-")[1])/12, 2)
            WeekNumber = get_week_of_month(int(Date.split("-")[0]),
                                           int(Date.split("-")[1]),
                                           int(Date.split("-")[2]))/5
            WeekNumber = WeekNumber[0]
            try: T_3 = DataMax["T"][DataMax.index == i - 3].fillna(0).iloc[0]
            except: T_3 = 0
            try: T_2 = DataMax["T"][DataMax.index == i - 2].fillna(0).iloc[0]
            except: T_2 = 0
            try: T_1 = DataMax["T"][DataMax.index == i - 1].fillna(0).iloc[0]
            except: T_1 = 0
            try:
                T = abs(LoadModelMin.predict(pandas.DataFrame([MonthNumber, WeekNumber, T_3, T_2, T_1]).T)[0])
                DataMin["T"][DataMin.index == i] = T
                DataMin["Predicted"][DataMin.index == i] = T
            except:
                T = 0
                DataMin["T"][DataMin.index == i] = T
                DataMin["Predicted"][DataMin.index == i] = T
                
    DataMin["T"] = DataMin["T"] * PriceMin_Max
    DataMin['Predicted'] = DataMin['Predicted'] * PriceMin_Max
    
    
    MissingDataIndex = DataMax[DataMax["T"].isnull()].index.tolist()
    if len(MissingDataIndex) > 0:
        for i in MissingDataIndex:
            Date = DataMax["Date"][DataMax.index == i].iloc[0]
            MonthNumber = round(int(Date.split("-")[1])/12, 2)
            WeekNumber = get_week_of_month(int(Date.split("-")[0]),
                                           int(Date.split("-")[1]),
                                           int(Date.split("-")[2]))/5
            WeekNumber = WeekNumber[0]
            try: T_3 = DataMax["T"][DataMax.index == i - 3].fillna(0).iloc[0]
            except: T_3 = 0
            try: T_2 = DataMax["T"][DataMax.index == i - 2].fillna(0).iloc[0]
            except: T_2 = 0
            try: T_1 = DataMax["T"][DataMax.index == i - 1].fillna(0).iloc[0]
            except: T_1 = 0
            try:
                T = abs(LoadModelMax.predict(pandas.DataFrame([MonthNumber, WeekNumber, T_3, T_2, T_1]).T)[0])
                DataMax["T"][DataMin.index == i] = T
                DataMax["Predicted"][DataMin.index == i] = T
            except:
                T = 0
                DataMax["T"][DataMin.index == i] = T
                DataMax["Predicted"][DataMin.index == i] = T
    DataMax["T"] = DataMax["T"] * PriceMax_Max
    DataMax['Predicted'] = DataMax['Predicted'] * PriceMax_Max
    
    percentage = History["Missing Percentage From Max Count"][History["Entry ID"] == _].iloc[0]
    grade = History["Grade"][History["Entry ID"] == _].iloc[0]
                                                            
    fig = px.line(
                 x=DataMin['Date'],
                 y=(DataMin['T'] + DataMax['T'])/2,
                 labels=dict(x="Date", y="Price", color="Labels"),
                 color=px.Constant("Price Average"),
                 title = str(_) +"/Missing Percentage-" + "{:.2f}".format(percentage) + "%" + "/Grade-" + grade
                    )
    fig['data'][0]['line']['color']='rgba(255, 0, 0, 0.4)'
    fig.add_scatter(x=DataMin['Date'],
                   y=DataMin['T'],
                   mode = "lines",
                   fill='tozeroy',
                   fillcolor= "rgba(255, 0, 0, 0.1)",
                   line={"color":"rgba(255, 0, 0, 0)"},
                   name="Price Min"
                   )
    fig.add_scatter(x=DataMax['Date'],
                    y=DataMax['T'],
                    mode = "lines",
                    fill='tozeroy',
                    fillcolor= "rgba(255, 0, 0, 0.1)",
                    line={"color":"rgba(255, 0, 0, 0)"},
                    name="Price Max"
               )
    fig.add_scatter(x=DataMax['Date'],
                    y=(DataMin['Predicted'] + DataMax['Predicted'])/2,
                    mode = "lines",
                    line={"color":"rgba(255, 0, 0, 1)"},
                    name="Price Average - Imputed",
                    marker = {'color':'red'}
                   )
    fig.add_scatter(x=DataMax['Date'],
                    y=DataMax['Predicted'].fillna(0),
                    mode = "lines",
                    fill='tozeroy',
                    fillcolor= "rgba(255, 0, 0, 0.3)",
                    line={"color":"rgba(255, 0, 0, 0)"},
                    name="Imputation Highlight"
               )
    #fig.show()
    fig.write_html(path + "/" + str(_) +"-"+grade+".html")

    temp = DataMin.rename(columns={"Date":"date","T":"price_min", "Predicted":"is_estimated"})
    temp["price_max"] = DataMax["T"]
    temp["is_estimated"] = temp["is_estimated"].fillna(False)
    temp["is_estimated"][temp["is_estimated"] != False] = True
    temp["price_avg"] = (temp["price_min"] + temp["price_max"])/2
    temp["entry_id"] = _
    temp["item_id"] = History["Item ID"][History["Entry ID"] == _].iloc[0]
    temp["item_unit_id"] = History["Item Unit ID"][History["Entry ID"] == _].iloc[0]
    temp["period"] = History["Period"][History["Entry ID"] == _].iloc[0]
    temp["currency"] = History["Currency"][History["Entry ID"] == _].iloc[0]
    error = (Model["Price Min: Neural Network: Test Loss"][Model["Entry ID"] == _].iloc[0] + Model["Price Max: Neural Network: Test Loss"][Model["Entry ID"] == _].iloc[0])/2
    temp["error_rate"] = error
    temp["change"] = ((temp["price_avg"] - temp["price_avg"].shift(-1))/temp["price_avg"].shift(-1)).fillna(0)
    temp["grade"] = History["Grade"][History["Entry ID"] == _].iloc[0]
    if error <= 0.30:
        temp["error_level"] = "Low"
    elif error > 0.30 and error <= 0.60:
        temp["error_level"] = "Medium"
    else:
        temp["error_level"] = "High"
    temp = temp[["date", "entry_id", "item_id", "item_unit_id", "price_min", "price_max", "price_avg",
                 "period", "change", "currency", "is_estimated", "error_rate", "error_level", "grade"]]
    tempStorage.append(temp)
    temp = None

In [None]:
data = tempStorage[0]
for _ in tempStorage[1:]:
    data = pandas.concat([data, _])
data = data.reset_index(drop=True)
data.to_csv("Output.csv")
print("Output has been generated")