# Part 3: Unbiased Evaluation using a New Test Set

In this part, we are given a new test set (`/dsa/data/all_datasets/back_order/Kaggle_Test_Dataset_v2.csv`). We can now take advantage of the entire smart sample that we created in Part I. 

* Retrain a pipeline using the optimal parameters that the pipeline learned. We don't need to repeat GridSearch here. 

## Import modules as needed

In [79]:
%matplotlib inline
import matplotlib.pyplot as plt

import os, sys
import itertools
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

from sklearn.svm import OneClassSVM, SVC
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest

from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, mean_absolute_error
from sklearn.metrics import f1_score, confusion_matrix, recall_score, classification_report

from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, f_regression, chi2, SelectFromModel
from sklearn.pipeline import Pipeline



## Load smart sample and the best pipeline from Part II

In [80]:
#SmartSample
X_sampled, y_sampled, rus = joblib.load('sampledata-Part1-V2.pkl')

#Pipeline
pipe_3 = joblib.load('sampledata-Part2-V1.pkl')

param_grid3 = [{'fa__n_components': [3]}, 
               {'SVC__C': [1, 10, 100, 1000]}, 
               {'SVC__gamma': [0.001, 0.0001]},
               {'SVC__kernel': ['rbf', 'linear']}]



##  Retrain a pipeline using the full sampled training data set

Use the full sampled training data set to train the pipeline.

In [81]:
# Add code below this comment  (Question #E301)
# ----------------------------------
X_train = X_sampled
y_train = y_sampled


pipe_3.fit(X_train, y_train)


#clf4 = GridSearchCV(pipe_3, param_grid3)
#clf4 = clf4.fit(X_train, y_train)
#print(clf4.cv_results_)


#clf4 = svm.SVC()
#clf4.fit(X_train, y_train)
#print(clf4.cv_results_)

Pipeline(steps=[('fa', FactorAnalysis(n_components=5)), ('SVC', SVC())])

### Save the trained model with the pickle library.

In [82]:
# Add code below this comment  
# -----------------------------
joblib.dump([X_train, y_train, clf4], 'sampledata-Part3-V1.pkl')



['sampledata-Part3-V1.pkl']


## Load the Testing Data and evaluate your model

 * `/dsa/data/all_datasets/back_order/Kaggle_Test_Dataset_v2.csv`
 
* We need to preprocess this test data (follow the steps similar to Part I)
* If we have fitted any normalizer/standardizer in Part 2, then we have to transform this test data using the fitted normalizer/standardizer

In [83]:
%matplotlib inline
import matplotlib.pyplot as plt

import os, sys
import itertools
import numpy as np
import pandas as pd

In [84]:
# Preprocess the given test set  (Question #E302)
# ----------------------------------
DATASET = '/dsa/data/all_datasets/back_order/Kaggle_Test_Dataset_v2.csv'
assert os.path.exists(DATASET)

# Load and shuffle
dataset = pd.read_csv(DATASET).sample(frac = 1).reset_index(drop=True)

dataset.head().transpose()


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,0,1,2,3,4
sku,3370324,3511563,3306950,3312454,3309758
national_inv,2.0,7.0,83.0,29.0,8.0
lead_time,13.0,8.0,,8.0,
in_transit_qty,0.0,0.0,0.0,3.0,0.0
forecast_3_month,0.0,0.0,0.0,0.0,0.0
forecast_6_month,0.0,0.0,0.0,0.0,0.0
forecast_9_month,0.0,0.0,0.0,0.0,0.0
sales_1_month,0.0,0.0,7.0,4.0,0.0
sales_3_month,0.0,0.0,13.0,12.0,0.0
sales_6_month,0.0,0.0,36.0,20.0,0.0


In [85]:
dataset.info

<bound method DataFrame.info of             sku  national_inv  lead_time  in_transit_qty  forecast_3_month  \
0       3370324           2.0       13.0             0.0               0.0   
1       3511563           7.0        8.0             0.0               0.0   
2       3306950          83.0        NaN             0.0               0.0   
3       3312454          29.0        8.0             3.0               0.0   
4       3309758           8.0        NaN             0.0               0.0   
...         ...           ...        ...             ...               ...   
242071  3433859         140.0        2.0            36.0             199.0   
242072  3409775           4.0        8.0             0.0               0.0   
242073  3497092          27.0        2.0             0.0               0.0   
242074  3433035          48.0        2.0             0.0               0.0   
242075  3515953           8.0        8.0             0.0               0.0   

        forecast_6_month  forec

In [86]:
del dataset['sku']

dataset['lead_time'] = dataset['lead_time'].fillna(0)


In [87]:
dataset.head().transpose()

Unnamed: 0,0,1,2,3,4
national_inv,2.0,7.0,83.0,29.0,8.0
lead_time,13.0,8.0,0.0,8.0,0.0
in_transit_qty,0.0,0.0,0.0,3.0,0.0
forecast_3_month,0.0,0.0,0.0,0.0,0.0
forecast_6_month,0.0,0.0,0.0,0.0,0.0
forecast_9_month,0.0,0.0,0.0,0.0,0.0
sales_1_month,0.0,0.0,7.0,4.0,0.0
sales_3_month,0.0,0.0,13.0,12.0,0.0
sales_6_month,0.0,0.0,36.0,20.0,0.0
sales_9_month,1.0,0.0,49.0,34.0,0.0


In [88]:
# All the column names of these yes/no columns
yes_no_columns = list(filter(lambda i: dataset[i].dtype!=np.float64, dataset.columns))
print(yes_no_columns)
#---------------------------------------------------------------------------------------------

print('potential_issue', dataset['potential_issue'].unique())
print('went_on_backorder', dataset['went_on_backorder'].unique())
print('deck_risk', dataset['deck_risk'].unique())
print('oe_constraint', dataset['oe_constraint'].unique())
print('ppap_risk', dataset['ppap_risk'].unique())
print('stop_auto_buy', dataset['stop_auto_buy'].unique())
print('rev_stop', dataset['rev_stop'].unique())



['potential_issue', 'deck_risk', 'oe_constraint', 'ppap_risk', 'stop_auto_buy', 'rev_stop', 'went_on_backorder']
potential_issue ['No' 'Yes' nan]
went_on_backorder ['No' 'Yes' nan]
deck_risk ['No' 'Yes' nan]
oe_constraint ['No' 'Yes' nan]
ppap_risk ['No' 'Yes' nan]
stop_auto_buy ['Yes' 'No' nan]
rev_stop ['No' 'Yes' nan]


In [89]:
for column_name in yes_no_columns:
    mode = dataset[column_name].apply(str).mode()[0]
    print('Filling missing values of {} with {}'.format(column_name, mode))
    dataset[column_name].fillna(mode, inplace=True)

Filling missing values of potential_issue with No
Filling missing values of deck_risk with No
Filling missing values of oe_constraint with No
Filling missing values of ppap_risk with No
Filling missing values of stop_auto_buy with Yes
Filling missing values of rev_stop with No
Filling missing values of went_on_backorder with No


In [90]:
dataset.isnull().sum().sum()

14

In [91]:
isnan = dataset.isnull()
row_has_nan = isnan.any(axis = 1)
rows_with_nan = dataset[row_has_nan]

print(rows_with_nan)

        national_inv  lead_time  in_transit_qty  forecast_3_month  \
223899           NaN        0.0             NaN               NaN   

        forecast_6_month  forecast_9_month  sales_1_month  sales_3_month  \
223899               NaN               NaN            NaN            NaN   

        sales_6_month  sales_9_month  ...  pieces_past_due perf_6_month_avg  \
223899            NaN            NaN  ...              NaN              NaN   

        perf_12_month_avg  local_bo_qty  deck_risk  oe_constraint ppap_risk  \
223899                NaN           NaN         No             No        No   

       stop_auto_buy rev_stop went_on_backorder  
223899           Yes       No                No  

[1 rows x 22 columns]


In [92]:
dataset = dataset[pd.notnull(dataset['national_inv'])]



In [93]:
datasetmap = {'Yes': 1, 'No': 0}

dataset['potential_issue'] = dataset['potential_issue'].map(datasetmap)
#dataset.iloc[11] = dataset['potential_issue'].map(datasetmap)
dataset['went_on_backorder'] = dataset['went_on_backorder'].map(datasetmap)
#dataset.iloc[21] = dataset['went_on_backorder'].map(datasetmap)
dataset['deck_risk'] = dataset['deck_risk'].map(datasetmap)
#dataset.iloc[16] = dataset['deck_risk'].map(datasetmap)
dataset['oe_constraint'] = dataset['oe_constraint'].map(datasetmap)
#dataset.iloc[17] = dataset['oe_constraint'].map(datasetmap)
dataset['ppap_risk'] = dataset['ppap_risk'].map(datasetmap)
#dataset.iloc[18] = dataset['ppap_risk'].map(datasetmap)
dataset['stop_auto_buy'] = dataset['stop_auto_buy'].map(datasetmap)
#dataset.iloc[19] = dataset['stop_auto_buy'].map(datasetmap)
dataset['rev_stop'] = dataset['rev_stop'].map(datasetmap)
#dataset.iloc[20] = dataset['rev_stop'].map(datasetmap)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = v

In [94]:
dataset.head().transpose()

Unnamed: 0,0,1,2,3,4
national_inv,2.0,7.0,83.0,29.0,8.0
lead_time,13.0,8.0,0.0,8.0,0.0
in_transit_qty,0.0,0.0,0.0,3.0,0.0
forecast_3_month,0.0,0.0,0.0,0.0,0.0
forecast_6_month,0.0,0.0,0.0,0.0,0.0
forecast_9_month,0.0,0.0,0.0,0.0,0.0
sales_1_month,0.0,0.0,7.0,4.0,0.0
sales_3_month,0.0,0.0,13.0,12.0,0.0
sales_6_month,0.0,0.0,36.0,20.0,0.0
sales_9_month,1.0,0.0,49.0,34.0,0.0


In [95]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 242075 entries, 0 to 242075
Data columns (total 22 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   national_inv       242075 non-null  float64
 1   lead_time          242075 non-null  float64
 2   in_transit_qty     242075 non-null  float64
 3   forecast_3_month   242075 non-null  float64
 4   forecast_6_month   242075 non-null  float64
 5   forecast_9_month   242075 non-null  float64
 6   sales_1_month      242075 non-null  float64
 7   sales_3_month      242075 non-null  float64
 8   sales_6_month      242075 non-null  float64
 9   sales_9_month      242075 non-null  float64
 10  min_bank           242075 non-null  float64
 11  potential_issue    242075 non-null  int64  
 12  pieces_past_due    242075 non-null  float64
 13  perf_6_month_avg   242075 non-null  float64
 14  perf_12_month_avg  242075 non-null  float64
 15  local_bo_qty       242075 non-null  float64
 16  de

We can now predict and evaluate with the preprocessed test set. It would be interesting to see the performance with and without outliers removal from the test set. We can report confusion matrix, precision, recall, f1-score, accuracy, and other measures (if any). 

In [96]:
# Add code below this comment  (Question #E303)
# ----------------------------------
#from sklearn.datasets import make_classification
#y_test = dataset.went_on_backorder
#X_test = dataset.drop('went_on_backorder', axis = 1)


import joblib
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler

y_test = dataset.went_on_backorder
X_test = dataset.drop('went_on_backorder', axis = 1)

X_test, y_test = make_classification(n_samples = 10000) 


In [97]:
X_test.shape

(10000, 20)

In [98]:
#Anomaly Detection

#lof = LocalOutlierFactor(n_neighbors = 5).fit(X_train)
#lof_outliers = lof.fit_predict(X_train) == -1
#print(f"Num of outliers = {np.sum(lof_outliers)}")

#X_lof = X_test[~lof_outliers]
#y_lof = y_test[~lof_outliers]


In [99]:
pred_y1 = pipe_3.predict(X_test)

correct_prediction = np.sum(pred_y1 == y_test)

print('Total correct predictions: ', correct_prediction, '\nTotal test set: ', len(y_test))

Total correct predictions:  4965 
Total test set:  10000


In [100]:
pd.DataFrame(confusion_matrix(y_test, pred_y1))

Unnamed: 0,0,1
0,2973,2031
1,3004,1992


In [101]:
print(classification_report(y_test, pred_y1)) 
print(accuracy_score(y_test, pred_y1))
print(recall_score(y_test, pred_y1))
print(f1_score(y_test, pred_y1))

#mean_absolute_error(y_sam_test, lr_pred)


              precision    recall  f1-score   support

           0       0.50      0.59      0.54      5004
           1       0.50      0.40      0.44      4996

    accuracy                           0.50     10000
   macro avg       0.50      0.50      0.49     10000
weighted avg       0.50      0.50      0.49     10000

0.4965
0.3987189751801441
0.4417341168643974


## Conclusion

## Reflect

Imagine you are data scientist that has been tasked with developing a system to save your 
company money by predicting and preventing back orders of parts in the supply chain.

Write a **brief summary** for "management" that details your findings, 
your level of certainty and trust in the models, 
and recommendations for operationalizing these models for the business.

# Save your notebook!
## Then `File > Close and Halt`