<a href="https://colab.research.google.com/github/emgeiger/PlasticScanner/blob/tensorflow-integration/PlasticScanner_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!rm -rf datasets
!rm -rf classes
!rm -rf resources
!git clone --depth 1 https://github.com/Plastic-Scanner/data
!mv data/data ./datasets
!rm -rf data
!git clone --depth 1 https://github.com/DataWorm/plastic-identifier.git
!mv plastic-identifier/classes ./
!mv plastic-identifier/resources ./
!rm -rf plastic-identifier

Cloning into 'data'...
remote: Enumerating objects: 1091, done.[K
remote: Counting objects: 100% (1091/1091), done.[K
remote: Compressing objects: 100% (1075/1075), done.[K
remote: Total 1091 (delta 16), reused 1058 (delta 14), pack-reused 0[K
Receiving objects: 100% (1091/1091), 38.32 MiB | 7.64 MiB/s, done.
Resolving deltas: 100% (16/16), done.
Cloning into 'plastic-identifier'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 20 (delta 5), reused 9 (delta 2), pack-reused 0[K
Unpacking objects: 100% (20/20), 8.65 KiB | 421.00 KiB/s, done.


In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from mpl_toolkits.mplot3d import Axes3D

from classes.SpectralDataset import SpectralDataset
from classes.DatasetLoader import DatasetLoader
from classes.PlasticScannerDatasetLoader import PlasticScannerDatasetLoader
from classes.AvantesDatasetLoader import AvantesDatasetLoader
from classes.ReremeterAnnotatedDatasetLoader import ReremeterAnnotatedDatasetLoader
from classes.NoOpTransformer import NoOpTransformer
from classes.SpectralonCalibrationTransformer import SpectralonCalibrationTransformer
from classes.DatasetAppenderTransformer import DatasetAppenderTransformer
from classes.WavelengthFilter import WavelengthFilter

In [None]:
def extractFeatures(X : np.ndarray):
    """Divide every column after the first by the first column, row by row.

    The result has one column fewer than ``X``. Division warnings are
    silenced and the quotients are passed through ``np.nan_to_num``, so NaN
    (0/0) becomes 0 and infinities (x/0) become the largest finite values of
    the dtype.
    """
    # Column vector of shape (rows, 1) so that it broadcasts across the remaining columns
    reference = X[:, 0][:, np.newaxis]
    with np.errstate(divide='ignore', invalid='ignore'):
        ratios = np.true_divide(X[:, 1:], reference)
    return np.nan_to_num(ratios)

def extractFeatures2(X : np.ndarray):
    """Divide every column except the last by the last column, row by row.

    The result has one column fewer than ``X``. Division warnings are
    silenced and the quotients are passed through ``np.nan_to_num``, so NaN
    (0/0) becomes 0 and infinities (x/0) become the largest finite values of
    the dtype.
    """
    # Column vector of shape (rows, 1) so that it broadcasts across the remaining columns
    reference = X[:, -1][:, np.newaxis]
    with np.errstate(divide='ignore', invalid='ignore'):
        ratios = np.true_divide(X[:, :-1], reference)
    return np.nan_to_num(ratios)

def extractFeatures3(X : np.ndarray):
    """Return the ratios of neighbouring columns: ``X[:, i] / X[:, i+1]``.

    Parameters:
        X: 2-D array with one sample per row.

    Returns:
        A float64 array with the same number of rows as ``X`` and one column
        fewer. Division warnings are silenced and the quotients are passed
        through ``np.nan_to_num``, so NaN (0/0) becomes 0 and infinities
        (x/0) become the largest finite float64 values.
    """
    # One vectorized division of "all but the last column" by "all but the
    # first column" instead of a Python loop over the columns.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratios = np.true_divide(X[:, :-1], X[:, 1:])
    # The result is always float64, independent of the input dtype.
    return np.nan_to_num(ratios.astype(np.float64, copy=False))

def get_datasets(train : DatasetLoader, test : DatasetLoader, test_size = 0.3, random_state = None):
    """Load the training and test datasets, splitting one dataset if both loaders are equal.

    Parameters:
        train: loader for the training data.
        test: loader for the test data.
        test_size: passed to ``train_test_split`` when a single dataset has to
            be split; unused when two different loaders are given.
        random_state: seed passed to ``train_test_split`` so that a split can
            be reproduced. The default ``None`` keeps the previous behaviour
            (a different split on every call).

    Returns:
        A tuple ``(train_dataset, test_dataset)``.

    NOTE(review): ``train != test`` compares the loader objects themselves.
    Unless ``DatasetLoader`` defines equality, two separately constructed
    loaders for the same file count as different, so no split happens and the
    same data is returned for training and testing -- confirm with the class.
    """
    dataset = train.load()
    if train != test:
        return (dataset, test.load())
    # Stratify on the labels so both parts keep the class proportions of the dataset
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.X, dataset.y, test_size=test_size, random_state=random_state, stratify=dataset.y)
    return (SpectralDataset(dataset.wavelengths, X_train, y_train), SpectralDataset(dataset.wavelengths, X_test, y_test))


In [None]:
# Materials that are removed from the material list before training/evaluation
material_blacklist = ['PVC', 'ABS', 'PLA', 'PC', 'PMMA', 'ASA', 'PACF', 'PCCF', 'PETG']

# Load list of materials to be considered for training and map them to numbers.
# `read_csv(..., squeeze=True)` is deprecated and no longer accepted by pandas 2.x;
# squeezing the single remaining column after loading yields the same Series.
materials = pd.read_csv('resources/materials.csv', index_col='label', names=['label', 'materialNumber']).squeeze("columns").to_dict()
materials = {key: materials[key] for key in materials if key not in material_blacklist}
# Reverse lookup (number -> label); when several labels share a number the last one wins
number_material_map = {v: k for k, v in materials.items()}
print('Materials: %s' % materials)

# Locations of the available datasets, relative to the working directory
datasetFile1 = 'datasets/20220531_DB2.1_first_dataset/board01/DB2.1_#01board_combined.csv'
datasetFile1Train = 'resources/DB2.1 #01 board - Sheet1_train.csv'
datasetFile1Test = 'resources/DB2.1 #01 board - Sheet1_test.csv'
datasetFile2 = 'datasets/20220531_DB2.1_first_dataset/board03/combined.csv'
datasetFileAvantes = 'datasets/20221003_avantes_spectrometer/data'
datasetFileAvantes5th = 'datasets/20230419_avantes_spectrometer_fifth_dataset'
datasetFileReremeter = 'datasets/20230109_Reremeter/annotated_data.csv'

# Specify the wavelengths used for training, or set to None to use the wavelengths of the training dataset.
# Exactly one of the following assignments should be active.
#wavelengths = None
#wavelengths = np.array([1050, 1200, 1300, 1450, 1550, 1650, 1720])
#wavelengths = np.array([1200, 1300, 1450, 1550, 1650])
#wavelengths = np.array([1180, 1200, 1215, 1680, 1700, 1720, 1740])
wavelengths = np.array([1150, 1185, 1200, 1215, 1225, 1400, 1420, 1500, 1520, 1685, 1700, 1715, 1730])

# Define which datasets are loaded as train and test data. get_datasets() splits a single
# dataset into train and test parts only when both loaders compare equal; otherwise each
# loader is loaded as-is. Activate exactly one dsloader_train and one dsloader_test line.
#dsloader_train = PlasticScannerDatasetLoader(datasetFile1, materials)
#dsloader_train = AvantesDatasetLoader(datasetFileAvantes, materials)
dsloader_train = AvantesDatasetLoader(datasetFileAvantes5th, materials)
#dsloader_train = ReremeterAnnotatedDatasetLoader(datasetFileReremeter, materials)
#dsloader_test = PlasticScannerDatasetLoader(datasetFile2, materials)
dsloader_test = AvantesDatasetLoader(datasetFileAvantes, materials)
#dsloader_test = ReremeterAnnotatedDatasetLoader(datasetFileReremeter, materials)
(ds_train, ds_test) = get_datasets(dsloader_train, dsloader_test)

# Fall back to the wavelengths of the training dataset when none were chosen,
# then keep only the wavelengths below 1755.
wavelengths = ds_train.wavelengths if wavelengths is None else wavelengths
wavelengths = wavelengths[np.less(wavelengths, 1755)]
#wavelengthRangeFilter = np.vectorize(lambda x: (1175 < x and x < 1225) or (1675 < x and x < 1750))
#wavelengths = wavelengths[wavelengthRangeFilter(wavelengths)]
#wavelengths = wavelengths[0::4]
print('Used wavelengths: %s' % wavelengths)

#calibrator = SpectralonCalibrationTransformer() # Uses averaged spectralon measurements in dataset to calibrate/scale all measurements in dataset
calibrator = NoOpTransformer()
# Filters/interpolates a dataset if possible so that it only contains the wavelength values used for training
wavelength_filter = WavelengthFilter(wavelengths)

# Both datasets go through the same two steps: calibrate first, then restrict the wavelengths
calibrated_train = calibrator.transform(ds_train)
ds_train = wavelength_filter.transform(calibrated_train)
calibrated_test = calibrator.transform(ds_test)
ds_test = wavelength_filter.transform(calibrated_test)

# Turn the measurements into features (ratios of neighbouring columns) and pick up the labels
X_train, y_train = extractFeatures3(ds_train.X), ds_train.y
X_test, y_test = extractFeatures3(ds_test.X), ds_test.y

# Per-material sample counts; materials without any training sample are not listed
print('Amount of samples (train/test):')
for materialLabel, materialNumber in materials.items():
    train_count = np.count_nonzero(y_train == materialNumber)
    if train_count > 0:
        test_count = np.count_nonzero(y_test == materialNumber)
        print('%s: %s|%s' % (materialLabel, train_count, test_count))


The squeeze argument has been deprecated and will be removed in a future version. Append .squeeze("columns") to the call to squeeze.





Materials: {'Spectralon': 0, 'spectralon': 0, 'PET': 1, 'HDPE': 2, 'LDPE': 4, 'PP': 5, 'PS': 6}
Used wavelengths: [1150 1185 1200 1215 1225 1400 1420 1500 1520 1685 1700 1715 1730]
Amount of samples (train/test):
PET: 105|20
HDPE: 105|21
PP: 171|19
PS: 57|17


In [None]:
# Two interactive 3D views of the training features, colored by material label:
# first the three leading feature columns, then the three trailing ones (last column on x).
labels = np.vectorize(number_material_map.get)(y_train)
for x_col, y_col, z_col in ((0, 1, 2), (-1, -2, -3)):
    fig = px.scatter_3d(x=X_train[:, x_col], y=X_train[:, y_col], z=X_train[:, z_col], color=labels)
    fig.update_scenes(yaxis_autorange="reversed")
    fig.show()

In [None]:
# train many models with different hyper parameters and output the scores

def precision_of_class(y_test, y_pred, material_number):
  """Return the fraction of samples of one class that were predicted correctly.

  Only samples whose true label in ``y_test`` equals ``material_number`` are
  considered (so, despite the name, this is the per-class recall). Returns 0
  when ``y_test`` contains no sample of that class.
  """
  of_class = y_test == material_number
  true_labels, predicted_labels = y_test[of_class], y_pred[of_class]
  total = len(true_labels)
  if total > 0:
    return np.count_nonzero(true_labels == predicted_labels) / total
  return 0

def print_material_accuracies(y_test, y_pred):
  """Print one line with the per-material score of a prediction.

  For every entry of the global ``materials`` mapping whose number occurs in
  the global ``y_train``, prints ``<label>: <score>`` followed by `` | ``,
  where the score comes from ``precision_of_class``. The line is terminated
  with a newline at the end.
  """
  # Lazily select the materials that have at least one training sample
  seen_in_training = (
      (label, number) for label, number in materials.items()
      if np.any(y_train == number))
  for label, number in seen_in_training:
    print('%s: %.2f' % (label, precision_of_class(y_test, y_pred, number)), end=' | ')
  print('')



# NOTE(review): every score below is computed on the test data, so picking the
# best configuration from this output gives an optimistic estimate.

# --- Support vector machines: every kernel combined with every value of C ---
kernels = ['linear', 'poly', 'rbf']
params_c = [0.1, 1, 10, 100, 1000, 10000]
for kernel in kernels:
    for c in params_c:
        model = SVC(kernel=kernel, C=c, random_state=1)
        # fit() returns the estimator itself; `clf` is kept as an alias in case later cells use it
        clf = model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print('SVM [%s C=%7.1f]: %.3f' % (kernel, c, accuracy_score(y_test, y_pred)), end='\t - \t')
        print_material_accuracies(y_test, y_pred)


# --- Random forests: every split criterion combined with every number of trees ---
criterions = ['gini', 'entropy']
params_n = [10,20,30,40,50,60,70,80,90,100,150,200,250,500]
for criterion in criterions:
    for n in params_n:
        model = RandomForestClassifier(n_estimators=n, criterion=criterion, random_state=1)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # n is an integer: print it with %d instead of %f (which showed "n=10.000000")
        print('RF [%s, n=%d]: %.3f' % (criterion, n, accuracy_score(y_test, y_pred)), end='\t - \t')
        print_material_accuracies(y_test, y_pred)

# --- Gradient boosting: grid over learning rate, number of stages, max_features and max_depth ---
params_n = [10, 25, 50, 75, 100]
learning_rates = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]
# Exclusive upper bound for max_features: never more than the number of feature
# columns and at most 9. It has its own name so the loop variable below does not shadow it.
max_features_bound = min(X_train.shape[1]+1, 10)
params_max_features = list(range(1, max_features_bound, 2))
params_max_depth = [2,3,4]
for learning_rate in learning_rates:
    for n in params_n:
        for max_features in params_max_features:
            for max_depth in params_max_depth:
                model = GradientBoostingClassifier(n_estimators=n, learning_rate=learning_rate, max_features=max_features, max_depth=max_depth, random_state=1)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                # Integer parameters are printed with %d; the learning rate with three decimals
                print('GBC [lr=%.3f, n=%d, max_f=%d, max_d=%d]: %.3f' % (learning_rate, n, max_features, max_depth, accuracy_score(y_test, y_pred)), end='\t - \t')
                print_material_accuracies(y_test, y_pred)


SVM [linear C=    0.1]: 0.456	 - 	PET: 0.95 | HDPE: 0.52 | PP: 0.32 | PS: 0.00 | 
SVM [linear C=    1.0]: 0.709	 - 	PET: 0.95 | HDPE: 0.95 | PP: 0.16 | PS: 0.82 | 
SVM [linear C=   10.0]: 0.797	 - 	PET: 0.95 | HDPE: 0.95 | PP: 0.53 | PS: 0.82 | 
SVM [linear C=  100.0]: 0.886	 - 	PET: 0.95 | HDPE: 0.86 | PP: 0.95 | PS: 0.88 | 
SVM [linear C= 1000.0]: 0.861	 - 	PET: 0.95 | HDPE: 0.81 | PP: 1.00 | PS: 0.76 | 
SVM [linear C=10000.0]: 0.861	 - 	PET: 0.95 | HDPE: 0.86 | PP: 1.00 | PS: 0.71 | 
SVM [poly C=    0.1]: 0.810	 - 	PET: 0.95 | HDPE: 0.57 | PP: 0.95 | PS: 0.88 | 
SVM [poly C=    1.0]: 0.759	 - 	PET: 0.95 | HDPE: 0.52 | PP: 0.95 | PS: 0.71 | 
SVM [poly C=   10.0]: 0.848	 - 	PET: 0.95 | HDPE: 0.81 | PP: 1.00 | PS: 0.71 | 
SVM [poly C=  100.0]: 0.823	 - 	PET: 0.95 | HDPE: 0.90 | PP: 0.79 | PS: 0.71 | 
SVM [poly C= 1000.0]: 0.759	 - 	PET: 0.95 | HDPE: 0.90 | PP: 0.53 | PS: 0.71 | 
SVM [poly C=10000.0]: 0.696	 - 	PET: 0.95 | HDPE: 0.67 | PP: 0.53 | PS: 0.71 | 
SVM [rbf C=    0.1]: 0.481	 