# Training Model and testing the results  (Having Training and testing dataset)

### Step 1 - Import the required modules



In [1]:

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor 
from sklearn.preprocessing import StandardScaler 
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error  # import to calculate root mean square


### Step 2 - Read the data source

In [2]:
#Load the training data into Pandas DataFrame
SourceData=pd.read_excel("Supplier Past Performance.xlsx")
# Load the test data 
Testdata=pd.read_excel("Defect Predict.xlsx") 
print(SourceData)
print(Testdata)

     Purchase Order Amount  ...  Defect Percent
0                    33386  ...            4.90
1                    48477  ...            8.40
2                    31688  ...            1.20
3                    27115  ...            0.40
4                     9028  ...            0.10
..                     ...  ...             ...
694                  44313  ...            2.76
695                  38823  ...            9.87
696                  15804  ...            5.14
697                  47081  ...            8.39
698                  25606  ...            5.41

[699 rows x 4 columns]
     Purchase Order Amount  ...  PO Sent in Advance of Delivery
0                    42130  ...                            1009
1                    39952  ...                            1009
2                    18537  ...                            2494
3                    48981  ...                             570
4                    10271  ...                            2150
..              

### Step 3  - Declare the independent and dependent train data from the sample

In [3]:
#Drop depedent variable from training dataset
SourceData_train_independent= SourceData.drop(["Defect Percent"], axis=1) 
#New dataframe with only independent variable value for training dataset
SourceData_train_dependent=SourceData["Defect Percent"].copy() 

 ### Step 4  - Scale the independent test and train data

In [4]:
sc_X = StandardScaler()
# scale the independent variables
X_train=sc_X.fit_transform(SourceData_train_independent.values) 
# scale the independent variables
y_train=SourceData_train_dependent 
X_test=sc_X.transform(Testdata.values)

### Step 5  - Fit the test data in maching learning model - Support Vector Regressor


In [5]:
svm_reg = SVR(kernel="linear", C=1)
# fit and train the model
svm_reg.fit(X_train, y_train) 
predictions = svm_reg.predict(X_test)

In [6]:
print("Defect percent prediction by Support Vector model for the order value of 95827 GBP with 851 pallets sent 55 days before delivery data is " ,round(predictions[0],2) , "%")


Defect percent prediction by Support Vector model for the order value of 95827 GBP with 851 pallets sent 55 days before delivery data is  5.15 %


### Step 6 - Fit the test data in machine learning model - Decision Tree Model

In [7]:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train) # fit and train the model
decision_predictions = tree_reg.predict(X_test) # Predict the value of dependent variable
print("Defect percent prediction by Decision Tree model for the order value of 95827 GBP with 851 pallets sent 55 days before delivery data is " ,round(decision_predictions[0],2) , "%")

Defect percent prediction by Decision Tree model for the order value of 95827 GBP with 851 pallets sent 55 days before delivery data is  0.81 %


# Saving the trained model and then loading it directly to predict the result on test data. (Splitting Train dataset)



###Import to have equal weigtage samples in training dataset


In [8]:
from sklearn.model_selection import StratifiedShuffleSplit 
import pickle

### Read the data source

In [9]:
SourceData=pd.read_excel("Supplier Past Performance.xlsx") # Load the data into Pandas DataFrame


### Declare the independent and dependent train data from the sample

In [10]:
# Drop depedent variable from training dataset
SourceData_independent= SourceData.drop(["Defect Percent"], axis=1)
# New dataframe with only independent variable value for training dataset
SourceData_dependent=SourceData["Defect Percent"].copy() 

In [11]:
#In the below code a new column “PO Category” is introduced to categorise “PO Amount” value from 0 to 30,000 GBP is classified as PO Category 1, from 30,000 to 60, 000 GBP as PO Category 2 and henceforth.
SourceData["PO Category"]=pd.cut(SourceData["Purchase Order Amount"],bins=[0., 30000, 60000, 90000,np.inf],labels=[1, 2, 3, 4])


In [12]:
# In the below code, we are reserving 30% of the data for testing the model and 60% for training it
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3)

for train_index, test_index in split.split(SourceData, SourceData["PO Category"]):
    strat_train_set = SourceData.loc[train_index]  # stratfied train dataset 
    strat_test_set = SourceData.loc[test_index] #stratified test dataset

#We introduced an additional column “PO Category” to ensure ample representation of purchase
#orders from all PO Amount range in the test and train dataset. As it is be been achieved, 
# hence we will delete this additional PO Category from our dataset.
for set_ in (strat_train_set, strat_test_set): 
    set_.drop("PO Category", axis=1, inplace=True)
    

### Define and test and training data-independent and dependent variables for our model.

In [13]:

SourceData_train_independent= strat_train_set.drop(["Defect Percent"], axis=1)
SourceData_train_dependent=strat_train_set["Defect Percent"].copy()
SourceData_test_independent= strat_test_set.drop(["Defect Percent"], axis=1)
SourceData_test_dependent=strat_test_set["Defect Percent"].copy()

###Scaling 

In [14]:
sc_X = StandardScaler()
X_train=sc_X.fit_transform(SourceData_train_independent.values)
y_train=SourceData_train_dependent
pickle.dump(sc_X, open("Scaler.sav", 'wb'))
X_test=sc_X.fit_transform(SourceData_test_independent.values)
y_test=SourceData_test_dependent

### Training and saving trained model

 ### Fit the test data in maching learning model - Support Vector Regressor

In [15]:

svm_reg = SVR(kernel="linear", C=1)
svm_reg.fit(X_train, y_train)
filename = 'SVR_TrainedModel.sav'
pickle.dump(svm_reg, open(filename, 'wb'),protocol=-1)

Predictions

In [16]:
decision_predictions = svm_reg.predict(X_test)
Score = (svm_reg.score(X_test, y_test))  # It provides the R-Squared Value
print ( "The R-SQUARED VALUE of the Support  Vector model is", round(Score,2))
lin_mse = mean_squared_error(y_test, decision_predictions)
print("Mean Square Error  of  Vector  model is ", round(lin_mse,2))
lin_rmse = mean_squared_error(y_test, decision_predictions, squared=False)
print("Root Mean Square Error of  Support  Vector  Learning model is ", round(lin_rmse,2))

The R-SQUARED VALUE of the Support  Vector model is -0.02
Mean Square Error  of  Vector  model is  8.18
Root Mean Square Error of  Support  Vector  Learning model is  2.86


### Fit the test data in maching learning model - Decision Tree Model

In [17]:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)
filename = 'DecisionTree_TrainedModel.sav'
pickle.dump(tree_reg, open(filename, 'wb'),protocol=-1)
predictions = tree_reg.predict(X_test) 
Score = (tree_reg.score(X_test, y_test))  # It provides the R-Squared Value
print ( "The R-SQUARED VALUE of model Decision Tree model is ", round(Score,2))
lin_mse = mean_squared_error(y_test, predictions)
print("Mean Square Error of Decision Tree model is ", round(lin_mse,2))
lin_rmse = mean_squared_error(y_test, decision_predictions, squared=False)
print("Root Mean Square Error of Decision Tree model is ", round(lin_rmse,2))

The R-SQUARED VALUE of model Decision Tree model is  -1.16
Mean Square Error of Decision Tree model is  17.32
Root Mean Square Error of Decision Tree model is  2.86


#Prediction

In [18]:
test_predictions = tree_reg.predict(X_test)
test_decision_predictions = svm_reg.predict(X_test)

In [19]:
testdata=pd.read_excel("Defect Predict.xlsx") # Load the test data
sc_X = pickle.load(open('Scaler.sav', 'rb'))  # Load the pickle
loaded_model = pickle.load(open('SVR_TrainedModel.sav', 'rb')) # load the trained model
X_test=sc_X.transform(testdata.values) # scale the independent variables for test data
decision_predictions = loaded_model.predict(X_test) # Predict the value of dependent variable
print("The prediction by SVM model is " , decision_predictions )

The prediction by SVM model is  [4.88639284 4.79129653 4.76536463 4.91677154 4.55779221 4.4931587
 4.9175923  4.84366474 5.03946448 5.02767773 4.85927906 5.13572255
 4.36735158 4.47591829 4.49425997 5.28788423 4.84891668 4.57796198
 4.72582343 4.97505424 4.77161275 4.8452619  4.28230658 4.81319799
 4.77776093 4.74665995 5.19173248 4.42038173 5.10155438 4.74377292
 4.74801525 4.97735649 4.65353845 4.93002375 4.88361771 4.64000277
 4.7688036  4.53478182 5.30347237 4.8290226  4.66197025 4.83552256
 4.89960009 4.73447444 5.0829939  5.09969825 5.11388813 4.89814822
 4.51398406 4.66433716 4.88400079 4.64292767 5.09347474 4.93351945
 4.57220373 4.77206839 4.64546332 4.58953575 4.64174982 4.63950604
 4.85462698 4.48404213 4.90304102 4.75180322 4.91677005 4.95130657
 4.85174252 4.41542569 5.04941551 5.1547981  5.14016366 4.8991561
 4.70911523 4.90858556 4.93204144 5.17899479 4.75798125 4.75923698
 4.75291275 4.87512809 4.8017508  4.64487044 4.73986082 4.87272161
 5.17578951 5.09632531 5.3101285

### Prediction using Decision tree model

In [20]:
testdata=pd.read_excel("Defect Predict.xlsx") # Load the test data
sc_X = pickle.load(open('Scaler.sav', 'rb'))  # Load the pickle
loaded_model = pickle.load(open('DecisionTree_TrainedModel.sav', 'rb')) # load the trained model
X_test=sc_X.transform(testdata.values) # scale the independent variables for test data
decision_predictions = loaded_model.predict(X_test) # Predict the value of dependent variable
print("The prediction by Decision Treemodel is " , decision_predictions )

The prediction by Decision Treemodel is  [9.87 4.23 7.72 8.15 1.22 4.41 3.65 5.41 8.5  0.44 1.02 4.9  6.36 1.08
 4.16 4.79 0.44 4.   3.57 2.83 5.93 7.34 6.36 6.62 2.84 6.27 2.27 2.79
 1.84 2.14 9.89 3.38 4.23 0.44 3.65 6.81 5.41 2.04 3.8  9.7  7.51 2.83
 5.75 9.81 6.4  2.27 5.75 4.23 0.1  9.31 5.35 7.69 4.33 8.43 8.15 8.88
 0.1  4.41 7.69 8.15 0.07 0.1  7.72 1.29 6.4  5.41 8.39 0.1  6.27 0.44
 4.79 8.5  0.89 6.62 6.4  1.76 4.23 3.57 6.99 2.27 3.65 4.53 0.4  6.62
 3.31 8.39 4.79 5.41 6.4  6.4  0.12 1.76 4.16 4.79 4.41 4.9  8.42 8.48
 6.4  0.87 5.34 8.43 4.41 3.94 1.74 5.34 6.16 9.81 4.23 8.88 6.08 2.83
 5.34 2.83 2.83 9.31 3.38 8.42 0.44 7.72 8.48 2.83 0.89 3.69 9.7  5.93
 7.64 7.72 5.93 8.28 6.4  0.44 6.4  5.41 1.23 4.63 0.12 5.11 1.98 8.97
 5.93 1.92 9.89 9.87 0.5  5.75 0.12 5.49 3.69 9.89 5.41 6.4  6.36 4.41
 0.89 6.62 3.07 5.75 6.08 6.27 7.72 6.36 2.27 8.5  8.5  4.41 7.72 7.85
 0.89 0.87 5.35 0.87 4.63 6.27 4.48 8.39 3.8  3.65 6.81 6.4  1.98 5.93
 8.39 2.83 3.56 3.69 2.73 7.85 8.88 