In [140]:
import joblib
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import RMSprop
import pandas as pd

Load data. The label frames (x_new_train and x_test) need their index reset: the rows were randomly sampled from a larger dataset, so the original index labels are non-contiguous and no longer match row positions.

In [141]:
# NOTE(review): joblib.load unpickles arbitrary objects; only load cache files you trust.

# Label frames as stored on disk (their index is whatever the sampling step left behind).
labels_train_cached = joblib.load('cache/r20200406_234541_50.0sc_50.0sp_1_CPU/spectral/x_new_train.joblib')
labels_test_cached = joblib.load('cache/r20200406_234541_50.0sc_50.0sp_1_CPU/spectral/x_test.joblib')

# reset_index() gives the labels a fresh 0..n-1 index; the previous index is kept
# as an extra column (dropped as 'index' in later cells).
labels_train = labels_train_cached.reset_index()
labels_test = labels_test_cached.reset_index()

# Spectra are used as loaded, without any index handling.
spectra_train = joblib.load('cache/r20200406_234541_50.0sc_50.0sp_1_CPU/spectral/y_new_train.joblib')
spectra_test = joblib.load('cache/r20200406_234541_50.0sc_50.0sp_1_CPU/spectral/y_test.joblib')

Define functions used for building the model and parsing datasets

In [142]:
def preliminary_NN(xtrain, ytrain, xtest, ytest, num_catagories,
                   input_dim=400, hidden_units=512, batch_size=100, epochs=10):
    """
    Build, compile and fit a one-hidden-layer softmax classifier.

    xtrain, xtest - spectra used as model inputs; each sample must have `input_dim` values
    ytrain, ytest - specifications of the materials used as targets. The loss is
    categorical_crossentropy, so these are expected to be one-hot encoded with
    `num_catagories` columns.
    num_catagories - INT number of options available for the categorical variable being
    predicted (ie 3 for material type since there are 3 materials)
    input_dim - INT number of points per spectrum (default 400, the previously hard-coded value)
    hidden_units - INT width of the hidden ReLU layer (default 512)
    batch_size - INT mini-batch size passed to model.fit (default 100)
    epochs - INT number of training epochs passed to model.fit (default 10)

    Returns the History object produced by model.fit, so the per-epoch loss/accuracy can
    be inspected or plotted instead of being read off the printed log. In a notebook,
    end the calling line with `;` to hide the returned object's repr.
    """
    model = Sequential()
    model.add(Dense(hidden_units, activation='relu', input_shape=(input_dim,)))
    model.add(Dense(num_catagories, activation='softmax'))
    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer=RMSprop(),
                  metrics=['accuracy'])

    # The (xtest, ytest) pair is only used for per-epoch validation metrics.
    history = model.fit(xtrain, ytrain,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=2,
                        validation_data=(xtest, ytest))
    return history

In [143]:
def drop_indicies(df, column, condition_to_drop, update_existing_file=True):
    """
    Drop every row of a pandas dataframe whose value in `column` equals `condition_to_drop`.
    For example, use this function to search through x_train and drop all rows where the
    material is not gold.

    df - pandas dataframe
    column - STR the column of the dataframe used to decide whether a row is dropped
    condition_to_drop - rows where df[column] == condition_to_drop are dropped (ie to keep
    only Au samples, use column 'Material_Au' and condition 0, so the comparison is true
    for every row that is not gold)
    update_existing_file - BOOL, if truthy df is modified in place; otherwise df is left
    untouched and a new dataframe without the dropped rows is returned

    Returns the list of dropped index labels when update_existing_file is truthy,
    otherwise the tuple (new_dataframe, list_of_dropped_index_labels).
    """
    # Boolean mask of the rows to remove. The labels are read from df.index through the
    # mask rather than being used as positions, so this works for any index, not only
    # a freshly reset 0..n-1 one.
    rows_to_drop = (df[column] == condition_to_drop).to_numpy()
    indicies_to_drop_list = df.index[rows_to_drop].tolist()

    if update_existing_file:
        df.drop(indicies_to_drop_list, inplace=True)
        return indicies_to_drop_list

    df_new = df.drop(indicies_to_drop_list)
    return (df_new, indicies_to_drop_list)
        

Train model to predict the material of the particle corresponding to a spectrum 

In [144]:
# Columns removed to build the material targets: the size descriptors, the Geometry_*
# columns and the 'index' column left behind by reset_index().
non_material_columns = [
    'log Area/Vol', 'ShortestDim', 'MiddleDim', 'LongDim',
    'Geometry_TriangPrismIsosc', 'Geometry_parallelepiped',
    'Geometry_sphere', 'Geometry_wire',
    'index',
]
labels_train_material = labels_train.drop(columns=non_material_columns)
labels_test_material = labels_test.drop(columns=non_material_columns)

In [145]:
# Convert the material label frames to NumPy arrays before handing them to the model.
labels_train_material_as_array, labels_test_material_as_array = (
    labels_train_material.to_numpy(),
    labels_test_material.to_numpy(),
)

In [146]:
# Material classifier: 3 output classes, one per material.
preliminary_NN(
    xtrain=spectra_train,
    ytrain=labels_train_material_as_array,
    xtest=spectra_test,
    ytest=labels_test_material_as_array,
    num_catagories=3,
)

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_29 (Dense)             (None, 512)               205312    
_________________________________________________________________
dense_30 (Dense)             (None, 3)                 1539      
Total params: 206,851
Trainable params: 206,851
Non-trainable params: 0
_________________________________________________________________
Train on 164052 samples, validate on 7813 samples
Epoch 1/10
164052/164052 - 6s - loss: 0.0197 - accuracy: 0.9992 - val_loss: 0.0689 - val_accuracy: 0.9994
Epoch 2/10
164052/164052 - 5s - loss: 0.0026 - accuracy: 1.0000 - val_loss: 0.0029 - val_accuracy: 0.9997
Epoch 3/10
164052/164052 - 5s - loss: 3.9521e-04 - accuracy: 1.0000 - val_loss: 3.8173e-08 - val_accuracy: 1.0000
Epoch 4/10
164052/164052 - 5s - loss: 9.0534e-09 - accuracy: 1.0000 - val_loss: 0.0064 - val_accuracy: 0.9996
Epoch 5/10
164052/16405

I got a training accuracy of 1.0000; validation accuracy in the epochs shown above ranges from 0.9994 to 1.0000.

Train model to predict the shape of the particle corresponding to a spectrum 

In [154]:
# Columns removed to build the shape targets: the 'index' column left behind by
# reset_index(), the size descriptors and the Material_* columns. Presumably only the
# Geometry_* columns remain afterwards - TODO confirm against labels_train.columns.
# ('index' used to be listed twice here; once is enough.)
non_shape_columns = [
    'index', 'log Area/Vol', 'ShortestDim', 'MiddleDim', 'LongDim',
    'Material_Au', 'Material_SiN', 'Material_SiO2',
]
labels_train_shape = labels_train.drop(columns=non_shape_columns)
labels_test_shape = labels_test.drop(columns=non_shape_columns)

# NumPy versions of the shape targets for the model.
labels_train_shape_as_array = labels_train_shape.to_numpy()
labels_test_shape_as_array = labels_test_shape.to_numpy()

In [155]:
# Shape classifier on all materials: 4 output classes, one per geometry.
preliminary_NN(
    xtrain=spectra_train,
    ytrain=labels_train_shape_as_array,
    xtest=spectra_test,
    ytest=labels_test_shape_as_array,
    num_catagories=4,
)

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_35 (Dense)             (None, 512)               205312    
_________________________________________________________________
dense_36 (Dense)             (None, 4)                 2052      
Total params: 207,364
Trainable params: 207,364
Non-trainable params: 0
_________________________________________________________________
Train on 164052 samples, validate on 7813 samples
Epoch 1/10
164052/164052 - 6s - loss: 0.8693 - accuracy: 0.5908 - val_loss: 1.0397 - val_accuracy: 0.4894
Epoch 2/10
164052/164052 - 5s - loss: 0.7536 - accuracy: 0.6385 - val_loss: 1.1194 - val_accuracy: 0.4748
Epoch 3/10
164052/164052 - 5s - loss: 0.7208 - accuracy: 0.6531 - val_loss: 1.0278 - val_accuracy: 0.5266
Epoch 4/10
164052/164052 - 5s - loss: 0.6997 - accuracy: 0.6619 - val_loss: 0.9537 - val_accuracy: 0.5531
Epoch 5/10
164052/164052 - 5s - los

I ran this a couple of times with different setups, and my best model reached around 57% accuracy.

Train model to predict the shape of the particle corresponding to a spectrum for ONLY GOLD

In [149]:
# Same column removal as for the all-material shape targets, except that Material_Au is
# kept for now: it is needed below to find the rows that are not gold.
non_shape_columns_keep_Au = [
    'index', 'log Area/Vol', 'ShortestDim', 'MiddleDim', 'LongDim',
    'Material_SiN', 'Material_SiO2',
]
labels_train_shape_Au = labels_train.drop(columns=non_shape_columns_keep_Au)
labels_test_shape_Au = labels_test.drop(columns=non_shape_columns_keep_Au)

# Remove every non-gold row (Material_Au == 0) in place and remember which index labels
# were removed, so the same rows can be removed from the spectra afterwards.
indicies_to_drop_train_list = drop_indicies(
    df=labels_train_shape_Au,
    column='Material_Au',
    condition_to_drop=0,
    update_existing_file=True,
)
indicies_to_drop_test_list = drop_indicies(
    df=labels_test_shape_Au,
    column='Material_Au',
    condition_to_drop=0,
    update_existing_file=True,
)

In [150]:
# Material_Au has served its purpose as a filter; remove it from both label frames in place.
# NOTE: this mutates the frames, so re-running this cell without re-running the previous
# one fails because the column is already gone.
for au_label_frame in (labels_train_shape_Au, labels_test_shape_Au):
    au_label_frame.drop(columns=['Material_Au'], inplace=True)

# Wrap the spectra in dataframes and remove the same rows that were removed from the labels.
# NOTE(review): this assumes the spectra dataframes carry the same index labels as the
# reset label frames (true for a default 0..n-1 index) - confirm for the cached data.
spectra_train_df = pd.DataFrame(spectra_train).drop(index=indicies_to_drop_train_list)
spectra_test_df = pd.DataFrame(spectra_test).drop(index=indicies_to_drop_test_list)

In [151]:
# Gold-only shape targets as NumPy arrays.
labels_train_shape_Au_as_array, labels_test_shape_Au_as_array = (
    labels_train_shape_Au.to_numpy(),
    labels_test_shape_Au.to_numpy(),
)

# Matching gold-only spectra as NumPy arrays.
spectra_train_shape_Au_as_array, spectra_test_shape_Au_as_array = (
    spectra_train_df.to_numpy(),
    spectra_test_df.to_numpy(),
)

In [152]:
# Shape classifier restricted to gold samples: still 4 geometry classes.
preliminary_NN(
    xtrain=spectra_train_shape_Au_as_array,
    ytrain=labels_train_shape_Au_as_array,
    xtest=spectra_test_shape_Au_as_array,
    ytest=labels_test_shape_Au_as_array,
    num_catagories=4,
)

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_33 (Dense)             (None, 512)               205312    
_________________________________________________________________
dense_34 (Dense)             (None, 4)                 2052      
Total params: 207,364
Trainable params: 207,364
Non-trainable params: 0
_________________________________________________________________
Train on 54950 samples, validate on 2604 samples
Epoch 1/10
54950/54950 - 2s - loss: 1.0649 - accuracy: 0.5310 - val_loss: 1.3469 - val_accuracy: 0.4581
Epoch 2/10
54950/54950 - 2s - loss: 0.9098 - accuracy: 0.5901 - val_loss: 1.3126 - val_accuracy: 0.4343
Epoch 3/10
54950/54950 - 2s - loss: 0.8674 - accuracy: 0.6084 - val_loss: 1.2200 - val_accuracy: 0.4528
Epoch 4/10
54950/54950 - 2s - loss: 0.8426 - accuracy: 0.6185 - val_loss: 1.3035 - val_accuracy: 0.4658
Epoch 5/10
54950/54950 - 2s - loss: 0.8250 -

This one I'm pretty surprised by: I tried a couple of different models and never got above 51%. I really thought this would do better than looking at all the materials together.