In [1]:
# @version : 05/2023
# @author : grafika_jati
# LiDAR cover contaminant classification using classic ML for 1D LiDAR aggregated data

# Split train and test from dataset taken by # B. Schlager, T. Goelles, S. Muckenhuber and D. Watzenig, "Contaminations on Lidar Sensor Covers: Performance Degradation Including Fault Detection and Modeling as Potential Applications," in IEEE Open Journal of Intelligent Transportation Systems, vol. 3, pp. 738-747, 2022, doi: 10.1109/OJITS.2022.3214094.

# Train : exp 1,2,3,4
# Test : exp 5

#import the libabries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import time as ti

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

import pickle
from sklearn.metrics import classification_report # performance measurement , untuk menghasilkan confusion matrix


In [2]:
# Define a function to plot confusion matrix
def plot_confusion_matrix(model_name, cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig('1D_class_norm_per_exp_train_test_conf/1D_class_norm_per_exp_train_test-'+model_name+'.png')
    plt.clf()

In [3]:
# load all dataset
df = pd.read_csv("../get_1D_dataset_Norm_per_exp_train_test/dataset_1D_distribution_XYZ_RA_7_feature_exp.csv")

df.drop(df[df['class'] == 'dew'].index, inplace = True)
df.drop(df[df['class'] == 'dirt_05mm'].index, inplace = True)
df.drop(df[df['class'] == 'dirt_10mm'].index, inplace = True)
df.drop(df[df['class'] == 'dirt_15mm'].index, inplace = True)

df_add = df.dropna()

In [4]:
#get data class=ref without cover
# 80% for traning, 20% for testing
df_ref =df_add[df_add['class'] == 'clean']
df_ref_train = df_ref.sample(frac=0.80,random_state=1)
df_ref_test = df_ref.drop(df_ref_train.index)

#get data class=ref with cover
# 80% for traning, 20% for testing
df_ref_wh =df_add[df_add['class'] == 'cover']
df_ref_wh_train = df_ref_wh.sample(frac=0.80,random_state=1)
df_ref_wh_test = df_ref_wh.drop(df_ref_wh_train.index)

#get data class oil, foam, dirt, water from exp 1,2,3,4
#get training data
df_water_all=df_add[df_add['class'] == 'water']
df_water_1=df_water_all[df_water_all['exp'].isin([1])] 
df_water_2=df_water_all[df_water_all['exp'].isin([2])] 
df_water_3=df_water_all[df_water_all['exp'].isin([3])] 
df_water_4=df_water_all[df_water_all['exp'].isin([4])] 
df_water_5=df_water_all[df_water_all['exp'].isin([5])] 


df_oil_all=df_add[df_add['class'] == 'oil']
df_oil_1=df_oil_all[df_oil_all['exp'].isin([1])] 
df_oil_2=df_oil_all[df_oil_all['exp'].isin([2])] 
df_oil_3=df_oil_all[df_oil_all['exp'].isin([3])] 
df_oil_4=df_oil_all[df_oil_all['exp'].isin([4])] 
df_oil_5=df_oil_all[df_oil_all['exp'].isin([5])] 

df_foam_all=df_add[df_add['class'] == 'foam']
df_foam_1=df_foam_all[df_foam_all['exp'].isin([1])] 
df_foam_2=df_foam_all[df_foam_all['exp'].isin([2])] 
df_foam_3=df_foam_all[df_foam_all['exp'].isin([3])] 
df_foam_4=df_foam_all[df_foam_all['exp'].isin([4])] 
df_foam_5=df_foam_all[df_foam_all['exp'].isin([5])] 

df_dirt_all=df_add[df_add['class'] == 'dirt']
df_dirt_1=df_dirt_all[df_dirt_all['exp'].isin([1])] 
df_dirt_2=df_dirt_all[df_dirt_all['exp'].isin([2])] 
df_dirt_3=df_dirt_all[df_dirt_all['exp'].isin([3])] 
df_dirt_4=df_dirt_all[df_dirt_all['exp'].isin([4])] 
df_dirt_5=df_dirt_all[df_dirt_all['exp'].isin([5])] 


#skenario A 1234
#list dataframe you want to append
frame_A_train = [df_ref_train, df_ref_wh_train, df_water_1, df_water_2, df_water_3, df_water_4, df_oil_1, df_oil_2, df_oil_3, df_oil_4, df_foam_1, df_foam_2, df_foam_3, df_foam_4,df_dirt_1, df_dirt_2, df_dirt_3, df_dirt_4]

df_A_train = pd.DataFrame()

for df in frame_A_train:
    df_A_train = df_A_train.append(df)

frame_A_test = [df_ref_test, df_ref_wh_test, df_water_5, df_oil_5, df_foam_5, df_dirt_5]

df_A_test = pd.DataFrame()

for df in frame_A_test:
    df_A_test = df_A_test.append(df)


  df_A_train = df_A_train.append(df)
  df_A_train = df_A_train.append(df)
  df_A_train = df_A_train.append(df)
  df_A_train = df_A_train.append(df)
  df_A_train = df_A_train.append(df)
  df_A_train = df_A_train.append(df)
  df_A_train = df_A_train.append(df)
  df_A_train = df_A_train.append(df)
  df_A_train = df_A_train.append(df)
  df_A_train = df_A_train.append(df)
  df_A_train = df_A_train.append(df)
  df_A_train = df_A_train.append(df)
  df_A_train = df_A_train.append(df)
  df_A_train = df_A_train.append(df)
  df_A_train = df_A_train.append(df)
  df_A_train = df_A_train.append(df)
  df_A_train = df_A_train.append(df)
  df_A_train = df_A_train.append(df)
  df_A_test = df_A_test.append(df)
  df_A_test = df_A_test.append(df)
  df_A_test = df_A_test.append(df)
  df_A_test = df_A_test.append(df)
  df_A_test = df_A_test.append(df)
  df_A_test = df_A_test.append(df)


In [5]:
def normalization_per_feature(df_train, df_test):
    # constant parameter to normalized coordinate and attribute
    # That value is taken from Dataset exp 1,2,3,4 over all type of contamiant
    # do normalization for testing dataset using value from training dataset
    #  
    x_min = 8.136452 # minimum coordinate point in x-axis
    y_min = -0.313579 # minimum coordinate point in y-axis
    z_min = -0.260452 # minimum coordinate point in z-axis

    x_max = 8.992307 # maximum coordinate point in x-axis
    y_max = 0.499952 # maximum coordinate point in y-axis
    z_max = 0.145646 # maximum coordinate point in z-axis

    x_range = x_max-x_min
    y_range = y_max-y_min
    z_range = z_max-z_min

    a_min = 0.0 # minimun value of ambient
    r_min = 59.0 # minimun value of reflectivity

    a_max = 472.0 # maximum value of ambient
    r_max = 19528.0 # maximum value of reflectivity

    a_range = a_max-a_min # minimun value of ambient
    r_range = r_max-r_min # range value of reflectivity


    df_add_norm=df_train.copy()
    df_add_norm['count_x']=(df_add_norm['count_x']-df_train.min()[0])/(df_train.max()[0]-df_train.min()[0])
    df_add_norm['mean_x']=(df_add_norm['mean_x']-x_min)/x_range
    df_add_norm['std_x']=(df_add_norm['std_x']-x_min)/x_range
    df_add_norm['min_x']=(df_add_norm['min_x']-x_min)/x_range
    df_add_norm['per_25_x']=(df_add_norm['per_25_x']-x_min)/x_range
    df_add_norm['per_50_x']=(df_add_norm['per_50_x']-x_min)/x_range
    df_add_norm['per_75_x']=(df_add_norm['per_75_x']-x_min)/x_range
    df_add_norm['max_x']=(df_add_norm['max_x']-x_min)/x_range
    
    df_add_norm['mean_y']=(df_add_norm['mean_y']-y_min)/y_range
    df_add_norm['std_y']=(df_add_norm['std_y']-y_min)/y_range
    df_add_norm['min_y']=(df_add_norm['min_y']-y_min)/y_range
    df_add_norm['per_25_y']=(df_add_norm['per_25_y']-y_min)/y_range
    df_add_norm['per_50_y']=(df_add_norm['per_50_y']-y_min)/y_range
    df_add_norm['per_75_y']=(df_add_norm['per_75_y']-y_min)/y_range
    df_add_norm['max_y']=(df_add_norm['max_y']-y_min)/y_range

    df_add_norm['mean_z']=(df_add_norm['mean_z']-z_min)/z_range
    df_add_norm['std_z']=(df_add_norm['std_z']-z_min)/z_range
    df_add_norm['min_z']=(df_add_norm['min_z']-z_min)/z_range
    df_add_norm['per_25_z']=(df_add_norm['per_25_z']-z_min)/z_range
    df_add_norm['per_50_z']=(df_add_norm['per_50_z']-z_min)/z_range
    df_add_norm['per_75_z']=(df_add_norm['per_75_z']-z_min)/z_range
    df_add_norm['max_z']=(df_add_norm['max_z']-z_min)/z_range

    df_add_norm['mean_ref']=(df_add_norm['mean_ref']-r_min)/r_range
    df_add_norm['std_ref']=(df_add_norm['std_ref']-r_min)/r_range
    df_add_norm['min_ref']=(df_add_norm['min_ref']-r_min)/r_range
    df_add_norm['per_25_ref']=(df_add_norm['per_25_ref']-r_min)/r_range
    df_add_norm['per_50_ref']=(df_add_norm['per_50_ref']-r_min)/r_range
    df_add_norm['per_75_ref']=(df_add_norm['per_75_ref']-r_min)/r_range
    df_add_norm['max_ref']=(df_add_norm['max_ref']-r_min)/r_range

    df_add_norm['mean_amb']=(df_add_norm['mean_amb']-a_min)/a_range
    df_add_norm['std_amb']=(df_add_norm['std_amb']-a_min)/a_range
    df_add_norm['min_amb']=(df_add_norm['min_amb']-a_min)/a_range
    df_add_norm['per_25_amb']=(df_add_norm['per_25_amb']-a_min)/a_range
    df_add_norm['per_50_amb']=(df_add_norm['per_50_amb']-a_min)/a_range
    df_add_norm['per_75_amb']=(df_add_norm['per_75_amb']-a_min)/a_range
    df_add_norm['max_amb']=(df_add_norm['max_amb']-a_min)/a_range

    # Data test
    df_add_norm_test = df_test.copy()
    df_add_norm_test['count_x']=(df_add_norm_test['count_x']-df_train.min()[0])/(df_train.max()[0]-df_train.min()[0])
    df_add_norm_test['mean_x']=(df_add_norm_test['mean_x']-x_min)/x_range
    df_add_norm_test['std_x']=(df_add_norm_test['std_x']-x_min)/x_range
    df_add_norm_test['min_x']=(df_add_norm_test['min_x']-x_min)/x_range
    df_add_norm_test['per_25_x']=(df_add_norm_test['per_25_x']-x_min)/x_range
    df_add_norm_test['per_50_x']=(df_add_norm_test['per_50_x']-x_min)/x_range
    df_add_norm_test['per_75_x']=(df_add_norm_test['per_75_x']-x_min)/x_range
    df_add_norm_test['max_x']=(df_add_norm_test['max_x']-x_min)/x_range
    
    df_add_norm_test['mean_y']=(df_add_norm_test['mean_y']-y_min)/y_range
    df_add_norm_test['std_y']=(df_add_norm_test['std_y']-y_min)/y_range
    df_add_norm_test['min_y']=(df_add_norm_test['min_y']-y_min)/y_range
    df_add_norm_test['per_25_y']=(df_add_norm_test['per_25_y']-y_min)/y_range
    df_add_norm_test['per_50_y']=(df_add_norm_test['per_50_y']-y_min)/y_range
    df_add_norm_test['per_75_y']=(df_add_norm_test['per_75_y']-y_min)/y_range
    df_add_norm_test['max_y']=(df_add_norm_test['max_y']-y_min)/y_range

    df_add_norm_test['mean_z']=(df_add_norm_test['mean_z']-z_min)/z_range
    df_add_norm_test['std_z']=(df_add_norm_test['std_z']-z_min)/z_range
    df_add_norm_test['min_z']=(df_add_norm_test['min_z']-z_min)/z_range
    df_add_norm_test['per_25_z']=(df_add_norm_test['per_25_z']-z_min)/z_range
    df_add_norm_test['per_50_z']=(df_add_norm_test['per_50_z']-z_min)/z_range
    df_add_norm_test['per_75_z']=(df_add_norm_test['per_75_z']-z_min)/z_range
    df_add_norm_test['max_z']=(df_add_norm_test['max_z']-z_min)/z_range

    df_add_norm_test['mean_ref']=(df_add_norm_test['mean_ref']-r_min)/r_range
    df_add_norm_test['std_ref']=(df_add_norm_test['std_ref']-r_min)/r_range
    df_add_norm_test['min_ref']=(df_add_norm_test['min_ref']-r_min)/r_range
    df_add_norm_test['per_25_ref']=(df_add_norm_test['per_25_ref']-r_min)/r_range
    df_add_norm_test['per_50_ref']=(df_add_norm_test['per_50_ref']-r_min)/r_range
    df_add_norm_test['per_75_ref']=(df_add_norm_test['per_75_ref']-r_min)/r_range
    df_add_norm_test['max_ref']=(df_add_norm_test['max_ref']-r_min)/r_range

    df_add_norm_test['mean_amb']=(df_add_norm_test['mean_amb']-a_min)/a_range
    df_add_norm_test['std_amb']=(df_add_norm_test['std_amb']-a_min)/a_range
    df_add_norm_test['min_amb']=(df_add_norm_test['min_amb']-a_min)/a_range
    df_add_norm_test['per_25_amb']=(df_add_norm_test['per_25_amb']-a_min)/a_range
    df_add_norm_test['per_50_amb']=(df_add_norm_test['per_50_amb']-a_min)/a_range
    df_add_norm_test['per_75_amb']=(df_add_norm_test['per_75_amb']-a_min)/a_range
    df_add_norm_test['max_amb']=(df_add_norm_test['max_amb']-a_min)/a_range


    return df_add_norm, df_add_norm_test


In [6]:
df_A_train_clean =  df_A_train.drop('exp', axis=1)
df_A_test_clean =  df_A_test.drop('exp', axis=1)

df_A_train_clean = df_A_train_clean.sample(frac = 1,random_state=1)
df_A_test_clean = df_A_test_clean.sample(frac = 1,random_state=1)

df_A_train_X = df_A_train_clean.drop('class', axis=1)
df_A_train_Y = df_A_train_clean['class']

df_A_test_X = df_A_test_clean.drop('class', axis=1)
df_A_test_Y = df_A_test_clean['class']


# Do normalization based over all contaminant per feature
# Data train and data test
df_train_norm, df_test_norm = normalization_per_feature(df_A_train_X, df_A_test_X)


In [7]:
models = {
"logit": LogisticRegression(solver="lbfgs", multi_class="auto"),
"mlp": MLPClassifier(),
"svm_rbf": SVC(kernel = "rbf", random_state = 0),
"naive_bayes": GaussianNB(),
"knn": KNeighborsClassifier(n_neighbors=5),
"decision_tree": DecisionTreeClassifier(),
"random_forest": RandomForestClassifier(n_estimators=10)
}

In [8]:
encoded_train_Y = df_A_train_Y.map({'clean':0, 'cover':1, 'dirt':2,'foam':3, 'oil':4, 'water':5})
encoded_test_Y = df_A_test_Y.map({'clean':0, 'cover':1, 'dirt':2,'foam':3, 'oil':4, 'water':5})

In [9]:
class_name = ['clean', 'cover',  'dirt', 'foam', 'oil', 'water']
labels= ['clean', 'cover','dirt', 'foam', 'oil', 'water']

print("Ready Training")

Ready Training


In [10]:
for model_name, model in models.items():
        # print(key)
        # print(model)

        print("[INFO] Performance Measurement using model - fold:", model_name)
        
        
        t = ti.time()
        model.fit(df_train_norm, encoded_train_Y)
        elapsed_training = ti.time() - t

        t = ti.time()    
        Y_pred_class = model.predict(df_test_norm)
        elapsed_inference = ti.time() - t
        elapsed_instance = elapsed_inference/(len(df_test_norm))



        Y_val_class = encoded_test_Y.values
        # print(Y_val_class)
        # print(Y_pred_class)

        #save model
        filename = "1D_class_norm_per_exp_train_test_model/model_"+model_name+".pickle"
        pickle.dump(model, open(filename, "wb"))


        # print(classification_report(Y_val_class,Y_pred_class, target_names=class_name))
        with open("1D_class_norm_per_exp_train_test_result.txt", "a") as f:
            print("model_name: ",model_name, file=f)
            print(classification_report(Y_val_class,Y_pred_class, target_names=class_name), file=f)
            
            print("elapsed_training: ", elapsed_training, file=f)
            # print("Inference time per Fold:", elapsed_inference, file=f)

            # print("Inference instance:", len(df_test_X), file=f)
            # print("Inference time per Instance:", elapsed_instance, file=f)
            print("elapsed_instance: ",elapsed_instance, file=f)

            if model_name=="random_forest" or model_name=="decision_tree" :
                # get importance
                importance = model.feature_importances_
                # summarize feature importance
                for iii,vvv in enumerate(importance):
                    print('Feature: %0d, Score: %.5f' % (iii,vvv), file=f)
            
            print("==============", file=f)

        
        # compute the confusion matrix
        confusion_mtx = confusion_matrix(Y_val_class, Y_pred_class) 

        # plot the confusion matrix
        plot_confusion_matrix(model_name,confusion_mtx,classes = class_name) 

[INFO] Performance Measurement using model - fold: logit


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[INFO] Performance Measurement using model - fold: mlp




[INFO] Performance Measurement using model - fold: svm_rbf
[INFO] Performance Measurement using model - fold: naive_bayes
[INFO] Performance Measurement using model - fold: knn
[INFO] Performance Measurement using model - fold: decision_tree
[INFO] Performance Measurement using model - fold: random_forest


<Figure size 640x480 with 0 Axes>