# Paper AIOM

In [72]:
# Importing the libraries
import pandas as pd
import numpy as np
import seaborn as sns
import random as rd
import matplotlib.pyplot as plt


import matplotlib as mpl
import statsmodels.api as sm
%matplotlib inline 
import utils
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Seed every global RNG the notebook can reach so Restart & Run All is repeatable.
# Python's `random` alone is not enough: NumPy-based code draws from NumPy's own
# global generator whenever no explicit random_state is supplied.
rd.seed(123)
np.random.seed(123)

## About the Problem

A company based in Curitiba (Brazil) is developing a machine that analyzes the quality of soybeans using computer vision. Currently, the classification is performed by human workers. We therefore seek to analyze the logistical impact (queue time and processing time) of introducing this machine. The process of analyzing grain quality is represented in the figure below.

<p align="center">
  <img src="images/MFV.png" />
</p>

For the analysis, a simulation model in AnyLogic was developed.
<p align="center">
  <img src="images/simulation.png" />
</p>

As input, historical data from previously performed analyses were included. As the aim is to analyze the logistical impact, a parameter variation approach was adopted to find the best configuration for the problem. The chosen parameters are:

- **Workers**: quantity $w$ of workers available to perform the task (1, 3 or 5);
- **Machines**: quantity $m$ of machines available to perform the task (1,3 or 5);
- **Worker wait**: a boolean variable. If $m < w$, the workers must choose whether to wait for the machine to finish analyzing grains before using it or to analyze in parallel (not using the machine);
- **Human Classification Time**: can be 195 or 615 seconds, depending on the applied method;
- **Machine Classification Time**: it is 120 seconds, but the new improvement achieved 60 seconds. Both cases were simulated.

All the simulated scenarios are presented below.

In [73]:
# Load the scenario definitions from the Excel sheet and show them as the cell output.
scenarios = pd.read_excel("Data/scenarios.xlsx")
scenarios

Unnamed: 0,Scenarios,Workers,Machines,Worker Wait,Human Classification Time,Machine Classification Time
0,Scenario 1,1,0,-,195,-
1,Scenario 2,1,0,-,615,-
2,Scenario 3,3,0,-,195,-
3,Scenario 4,3,0,-,615,-
4,Scenario 5,5,0,-,195,-
5,Scenario 6,5,0,-,615,-
6,Scenario 7,1,1,True,195,60
7,Scenario 8,1,1,True,615,60
8,Scenario 9,1,1,True,195,120
9,Scenario 10,1,1,True,615,120


### Objective

Using the simulated data, apply machine learning methods to predict the estimated queue, processing and total time for each of the 54 scenarios.


## Data Preprocessing

In [74]:
# Read the dataset and discard the 'Unnamed: 0' column when the CSV carries one
# (errors="ignore" keeps the frame untouched if that column is absent).
data = pd.read_csv("Data/dataset.csv").drop(columns="Unnamed: 0", errors="ignore")
print(f"Nº of observations: {len(data)}")
data.head()

Nº of observations: 4678380


Unnamed: 0,workers,workersWaiting,machineProcTime,qntMachines,cutBean,month,week,day,dayofweek,queueTime,procTime,totalTime
0,1.0,0,1.0,1.0,0.0,1,1,3,2,0.0,6.538521,6.538521
1,1.0,0,1.0,1.0,0.0,1,1,3,2,0.0,6.281081,6.281081
2,1.0,0,1.0,1.0,0.0,1,1,3,2,0.0,6.499349,6.499349
3,1.0,0,1.0,1.0,0.0,1,1,3,2,0.0,4.773382,4.773382
4,1.0,0,1.0,1.0,0.0,1,1,3,2,0.0,6.409866,6.409866


### Selecting Queue Model

In [71]:
# Features are every column except the last three; the queue-time target is the
# third column from the end.
featuresQ = data.iloc[:, :-3].to_numpy()
queue = data.iloc[:, -3].to_numpy()

# Hold out 20% of the rows for evaluation (fixed random_state for a repeatable split).
X_train_q, X_test_q, y_train_q, y_test_q = train_test_split(
    featuresQ,
    queue,
    test_size=0.2,
    random_state=0,
)

# Standardise with statistics learned from the training split only, so nothing
# from the held-out rows leaks into the scaler.
sc1 = StandardScaler()
X_train_q = sc1.fit(X_train_q).transform(X_train_q)
X_test_q = sc1.transform(X_test_q)

# Build the candidate models through the project helper and print their RMSE table.
models_q = utils.createModels(X_train_q, y_train_q, X_test_q, y_test_q)
utils.printModels(models_q)

                          RMSE
Linear Regression  5232.081923
Random Forest       144.411602
Gradient Boosting  3099.257985
Decision Tree       141.927413


In [75]:
# Returning best model and RMSE
# utils.returnBestModel hands back one estimator from `models_q` together with its
# RMSE. The selection rule lives in utils — presumably lowest RMSE; confirm there.
# `queue_model` is reused later to predict queue times for the forecast scenarios.
queue_model, queue_rmse = utils.returnBestModel(models_q)
print(f"Selected model = {queue_model}")
print(f"RMSE = {queue_rmse}")

Selected model = DecisionTreeRegressor(random_state=0)
RMSE = 141.92741344244303


### Selecting Processing Model

In [76]:
# Features are every column except the last three; the processing-time target is
# the second column from the end.
featuresP = data.iloc[:, :-3].to_numpy()
process = data.iloc[:, -2].to_numpy()

# Hold out 20% of the rows for evaluation (fixed random_state for a repeatable split).
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    featuresP,
    process,
    test_size=0.2,
    random_state=0,
)

# Standardise with statistics learned from the training split only.
sc2 = StandardScaler()
X_train_p = sc2.fit(X_train_p).transform(X_train_p)
X_test_p = sc2.transform(X_test_p)

# Build the candidate models through the project helper and print their RMSE table.
models_p = utils.createModels(X_train_p, y_train_p, X_test_p, y_test_p)
utils.printModels(models_p)

# Keep the estimator chosen by the helper, plus its RMSE, and report both.
process_model, process_rmse = utils.returnBestModel(models_p)
print(f"Selected model = {process_model}")
print(f"RMSE = {process_rmse}")

                       RMSE
Linear Regression  1.846668
Random Forest      0.749825
Gradient Boosting  0.850938
Decision Tree      0.749903
Selected model = RandomForestRegressor(n_estimators=40, random_state=42)
RMSE = 0.7498252239340687


In [77]:
# Returning best model and RMSE
# NOTE(review): the preceding cell already ends with this same returnBestModel call
# and the same two prints, so this cell re-binds `process_model` / `process_rmse`
# to the same selection and prints it a second time.
# `process_model` is reused later to predict processing times for the forecasts.
process_model, process_rmse = utils.returnBestModel(models_p)
print(f"Selected model = {process_model}")
print(f"RMSE = {process_rmse}")

Selected model = RandomForestRegressor(n_estimators=40, random_state=42)
RMSE = 0.7498252239340687


## Predicting Next Year

In [78]:
# Creating the scenarios to forecast
# Template rows to predict on; the five simulation parameters are filled in below.
toForecast = pd.read_excel("Data/toForecast.xlsx")

# Distinct values each simulation parameter takes in the simulated dataset.
workersRange = np.unique(data.workers)
workersWaitingRange = np.unique(data.workersWaiting)
machineProcTimeRange = np.unique(data.machineProcTime)
qntMachinesRange = np.unique(data.qntMachines)
cutBeanRange = np.unique(data.cutBean)

# One independent copy of the template per parameter combination. The iteration
# order is workers (outermost) -> workersWaiting -> machineProcTime -> qntMachines
# -> cutBean (innermost), which fixes the position of each scenario in the list.
forecasts = [
    toForecast.assign(
        workers=work,
        workersWaiting=wait,
        machineProcTime=machPro,
        qntMachines=qntMa,
        cutBean=cut,
    )
    for work in workersRange
    for wait in workersWaitingRange
    for machPro in machineProcTimeRange
    for qntMa in qntMachinesRange
    for cut in cutBeanRange
]

In [81]:
# Separate list objects per target. These are shallow copies: both lists still
# reference the very same DataFrames held in `forecasts`, and nothing below
# mutates them.
queueForecasting = forecasts.copy()
processForecasting = forecasts.copy()

# The scalers and models were fitted on `data.iloc[:, :-3].values`, i.e. on a bare
# array whose columns follow the dataset's order. Select exactly those columns by
# name, in that order, and pass an array as well. This keeps the predictions
# correct if the forecast template orders its columns differently or has gained
# extra columns (for example result columns added by a later cell before this one
# is re-run), and avoids the feature-name mismatch warning that recent
# scikit-learn versions raise when a DataFrame is given to an array-fitted estimator.
feature_columns = list(data.columns[:-3])

# Queue Time Forecasting: one array of predictions per scenario frame.
queueResults = [
    queue_model.predict(sc1.transform(fore[feature_columns].to_numpy()))
    for fore in queueForecasting
]

# Process Time Forecasting: one array of predictions per scenario frame.
procResults = [
    process_model.predict(sc2.transform(fore[feature_columns].to_numpy()))
    for fore in processForecasting
]




In [90]:
# Copy every frame, not just the list: `forecasts.copy()` would share the same
# DataFrame objects, so adding result columns to `forecasted` would also alter the
# frames in `forecasts` and break a re-run of the forecasting cell.
forecasted = [frame.copy() for frame in forecasts]

In [91]:
# Attach the predictions to their scenario frames (adds two columns to each frame
# in `forecasted`). The three lists are positionally aligned; fail loudly if a
# stale or partial results list would otherwise be silently misaligned.
assert len(forecasted) == len(queueResults) == len(procResults), \
    "forecast frames and prediction lists must have the same length"

for df, queue_pred, proc_pred in zip(forecasted, queueResults, procResults):
    df['Queue time'] = queue_pred
    df['Processing time'] = proc_pred


In [96]:
# Preview the first scenario's frame, now carrying the two prediction columns.
forecasted[0].head()

Unnamed: 0,workers,workersWaiting,machineProcTime,qntMachines,cutBean,month,week,day,dayofweek,Queue time,Processing time
0,1.0,0,1.0,0.0,0.0,1,1,1,1,0.211747,8.516245
1,1.0,0,1.0,0.0,0.0,1,1,2,2,2.211209,8.406851
2,1.0,0,1.0,0.0,0.0,1,1,3,3,17.767954,8.613146
3,1.0,0,1.0,0.0,0.0,1,1,4,4,2.734566,8.462573
4,1.0,0,1.0,0.0,0.0,1,1,5,5,1.797137,8.028204


In [None]:
# Predicted queue time over the year for the first scenario.
# Time belongs on the x-axis; a frame holds several rows per week, so the
# predictions are averaged per week to give one point per x value.
# (`plt` comes from the import cell at the top of the notebook.)
weekly_queue = forecasted[0].groupby("week")["Queue time"].mean()

fig, ax = plt.subplots()
weekly_queue.plot(ax=ax)
ax.set(
    title="Predicted queue time per week (first scenario)",
    xlabel="Week",
    ylabel="Mean predicted queue time",
);
