# Identifying Nanoparticle Aggregates from Their Scattering Spectra with Machine Learning

In [1]:
from pandas import read_excel
from pandas.plotting import scatter_matrix
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import numpy as np
import os
import pandas as pd

In [2]:
# Read the processed table. The sheet carries no header row, so the
# 21 column labels are supplied explicitly.
url = "../../data/processed/den/tanpabola3-1.xlsx"
names = [
    'lam_max1', 'csc_max1', 'lam_min', 'csc_min', 'lam_fwhm1',
    'c_mid', 'lam_max2', 'csc_max2', 'lam_fwhm2', 'fwhm',
    'posisi1', 'posisi2', 'posisi3', 'posisi4', 'posisi5',
    'arah_k', 'arah_E', 'sb_putar', 'sudut1', 'sudut2',
    'ket',
]
dataset = pd.read_excel(url, header=None, names=names)

# Preview the first five rows.
print(dataset.head())

   lam_max1      csc_max1  lam_min       csc_min   lam_fwhm1         c_mid  \
0     408.0  1.601443e-14      430  1.309453e-14  452.547971  2.136383e-14   
1     408.0  1.601331e-14      430  1.309381e-14  452.547915  2.136334e-14   
2     408.0  1.601075e-14      430  1.309218e-14  452.547788  2.136220e-14   
3     408.0  1.600855e-14      430  1.309076e-14  452.547708  2.136121e-14   
4     408.0  1.600793e-14      430  1.309029e-14  452.547746  2.136089e-14   

   lam_max2      csc_max2   lam_fwhm2       fwhm  ...  posisi2  posisi3  \
0       462  2.963313e-14  471.424766  18.876794  ...        0        0   
1       462  2.963287e-14  471.424929  18.877014  ...        0        0   
2       462  2.963222e-14  471.425311  18.877523  ...        0        0   
3       462  2.963166e-14  471.425636  18.877928  ...        0        0   
4       462  2.963148e-14  471.425723  18.877976  ...        0        0   

   posisi4  posisi5  arah_k  arah_E  sb_putar  sudut1  sudut2  ket  
0        0 

In [3]:
# Check for rows in which `lam_max1` and `lam_max2` hold the same value,
# and print how many such rows exist.
df_sama = dataset.loc[dataset['lam_max1'].eq(dataset['lam_max2'])]
print(df_sama[['lam_max1', 'lam_max2']].shape[0])

0


In [None]:
# Inputs are the first ten columns; the target is column index 20 (`ket`).
X = dataset.iloc[:, :10].to_numpy()
y = dataset.iloc[:, 20].to_numpy()

## 0. Selecting Classification Model

In [None]:
# Candidate classifiers for the spot check, keyed by a short label.
# Only logistic regression is standardised in this variant.
# A later cell reassigns `models`, so when the notebook runs top to bottom
# this definition is overwritten before it is used.
models = dict(
    LR=make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear', multi_class='ovr')),
    LDA=LinearDiscriminantAnalysis(),
    KNN=KNeighborsClassifier(),
    CART=DecisionTreeClassifier(),
    NB=GaussianNB(),
    SVM=SVC(gamma='auto'),
)

In [None]:
# Candidate classifiers for the spot check, keyed by a short label.
# Logistic regression and the SVM are standardised in this variant.
# A later cell reassigns `models`, so when the notebook runs top to bottom
# this definition is overwritten before it is used.
models = {
    'LR': make_pipeline(
        StandardScaler(),
        LogisticRegression(solver='liblinear', multi_class='ovr'),
    ),
    'LDA': LinearDiscriminantAnalysis(),
    'KNN': KNeighborsClassifier(),
    'CART': DecisionTreeClassifier(),
    'NB': GaussianNB(),
    'SVM': make_pipeline(
        StandardScaler(),
        SVC(gamma='auto'),
    ),
}

In [None]:
# Candidate classifiers for the spot check, keyed by a short label.
#
# The input columns differ by many orders of magnitude (the `csc_*` columns
# are of order 1e-14 while the `lam_*` columns are of order 1e2), so every
# estimator that is sensitive to feature scale is standardised first:
#   - KNN uses distances, which the large-valued columns would dominate;
#   - GaussianNB adds a smoothing term proportional to the largest feature
#     variance, which would swamp the variance of the tiny-valued columns.
# The decision tree is left unscaled because its splits are threshold based.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm against the installed version.
models = {
    'LR': make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear', multi_class='ovr')),
    'LDA': LinearDiscriminantAnalysis(),
    'KNN': make_pipeline(StandardScaler(), KNeighborsClassifier()),
    'CART': DecisionTreeClassifier(),
    'NB': make_pipeline(StandardScaler(), GaussianNB()),
    'SVM': make_pipeline(StandardScaler(), SVC(gamma='auto', kernel='rbf')),
}

In [None]:
# Stratified, shuffled 3-fold cross-validation of every candidate model.
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)

# One list of per-fold scores for each metric and model.
results = {name: {'accuracy': [], 'precision': []} for name in models}

# Training and evaluation
for name, model in models.items():
    for train_idx, test_idx in kf.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # `fit` retrains from scratch, so reusing the estimator across
        # folds does not carry information over.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        results[name]['accuracy'].append(accuracy_score(y_test, y_pred))
        # Store one macro-averaged number per fold instead of a per-class
        # array: the summary below averages everything anyway, and scalar
        # scores cannot become ragged if a fold happens to miss a class.
        # `zero_division=0` keeps the score at 0 for a class that is never
        # predicted, without emitting a warning on every fold.
        results[name]['precision'].append(
            precision_score(y_test, y_pred, average='macro', zero_division=0)
        )

# Print results
for name, scores in results.items():
    print(f"Model: {name}")
    print(f"  Mean Accuracy: {np.mean(scores['accuracy']):.4f}")
    print(f"  Mean Precision: {np.mean(scores['precision']):.4f}")
    print()


## 1. Validation for `tanpabola3.xlsx` Data

In [None]:
# Read the processed table again (no header row; labels supplied here).
url = "../../data/processed/den/tanpabola3-1.xlsx"
names = [
    'lam_max1', 'csc_max1', 'lam_min', 'csc_min', 'lam_fwhm1',
    'c_mid', 'lam_max2', 'csc_max2', 'lam_fwhm2', 'fwhm',
    'posisi1', 'posisi2', 'posisi3', 'posisi4', 'posisi5',
    'arah_k', 'arah_E', 'sb_putar', 'sudut1', 'sudut2',
    'ket',
]
dataset = pd.read_excel(url, header=None, names=names)

# Inputs are the first ten columns; the target is column index 20 (`ket`).
X = dataset.iloc[:, :10].to_numpy()
y = dataset.iloc[:, 20].to_numpy()

In [None]:
# Build a class-balanced subset: draw 189 rows from classes 0 and 2 and keep
# every row of class 1.
# NOTE(review): presumably class 1 has 189 rows, which is why it is not
# sampled — confirm against the value counts.
class_0 = dataset.loc[dataset['ket'] == 0].sample(n=189, random_state=42)
class_1 = dataset.loc[dataset['ket'] == 1]
class_2 = dataset.loc[dataset['ket'] == 2].sample(n=189, random_state=42)

# Stack the three parts, shuffle the rows reproducibly and renumber them.
balance_df = (
    pd.concat([class_0, class_1, class_2])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# From here on the balanced table replaces the full one.
dataset = balance_df
X = dataset.iloc[:, :10].to_numpy()
y = dataset.iloc[:, 20].to_numpy()

In [None]:
# Number of rows in the current dataset.
dataset.shape[0]

In [None]:
# Rows per class label.
dataset.loc[:, 'ket'].value_counts()

In [None]:
# Split the data into a training part and a validation part.
# NOTE(review): an integer `test_size` is an absolute sample count, so
# `test_size=1` holds out exactly one row; a float (e.g. 0.5) is needed for
# a fractional split — confirm which is intended, since the output file
# written next is named "..._50".
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=1, random_state=1)

# Report both sizes and the resulting validation fraction.
print(len(y_train), len(y_valid), sep="\n")
print(len(y_valid) / (len(y_train) + len(y_valid)))

In [None]:
# Fit every model on the training part, score it on the validation part and
# append its confusion matrix and classification report to a text file.
with open("output_validation_tanpabola3_50.txt", "w") as f:
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_predict = model.predict(X_valid)
        report = classification_report(y_valid, y_predict)
        cf = confusion_matrix(y_valid, y_predict)

        f.writelines([
            f"Model: {name}\n",
            f"confusion matrix:\n {cf}\n",
            f"classification report: \n {report}\n",
            "=====================================================\n",
        ])
print("Evaluation results exported to .txt file")


### - Testing with Data `3_sph.xlsx`

In [None]:
# Read the `3_sph` table (no header row; labels supplied here).
url = "../../data/processed/den/3_sph.xlsx"
names = [
    'lam_max1', 'csc_max1', 'lam_min', 'csc_min', 'lam_fwhm1',
    'c_mid', 'lam_max2', 'csc_max2', 'lam_fwhm2', 'fwhm',
    'posisi1', 'posisi2', 'posisi3', 'posisi4', 'posisi5',
    'arah_k', 'arah_E', 'sb_putar', 'sudut1', 'sudut2',
    'ket',
]
bola3 = pd.read_excel(url, header=None, names=names)

# Inputs are the first ten columns; the target is column index 20 (`ket`).
X_bola3 = bola3.iloc[:, :10].to_numpy()
y_bola3 = bola3.iloc[:, 20].to_numpy()

# Number of rows loaded.
print(X_bola3.shape[0])

In [None]:
# Rows per class label in `bola3`.
bola3.loc[:, 'ket'].value_counts()

In [None]:
# Use the whole `bola3` table as the evaluation set.
X_valid, y_valid = X_bola3, y_bola3

# Fit every model on the current training data, score it on `bola3` and
# append its confusion matrix and classification report to a text file.
with open("output_validation_bola3-100.txt", "w") as f:
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_predict = model.predict(X_valid)
        report = classification_report(y_valid, y_predict)
        cf = confusion_matrix(y_valid, y_predict)

        f.writelines([
            f"Model: {name}\n",
            f"confusion matrix:\n {cf}\n",
            f"classification report: \n {report}\n",
            "=====================================================\n",
        ])
print("Evaluation results exported to .txt file")

### - Testing with 20 samples from `3_sph.xlsx` in each group

In [None]:
# Score every model on the first 20 rows of each consecutive 50-row group of
# `bola3` and write one report section per group.
#
# The models are fitted once, before the group loop: the training data does
# not change between groups, so refitting per group only repeats work, and
# for an unseeded estimator (the decision tree) it could even yield a
# different model for each group.
for model in models.values():
    model.fit(X_train, y_train)

# Write results to a text file
with open("output_validation_bola3_20-50.txt", "w") as f:
    for i in range(int(np.round(len(y_bola3)/50))):
        row = i*50          # first row of group i
        itv = row + 20      # one past the last evaluated row of the group
        X_valid = X_bola3[row:itv, :]
        y_valid = y_bola3[row:itv]

        f.write(f"=========================( {i+1} )==========================\n")
        for name, model in models.items():
            y_predict = model.predict(X_valid)
            report = classification_report(y_valid, y_predict, output_dict=True)
            cf = confusion_matrix(y_valid, y_predict)

            # Keep only the per-class rows; the aggregate entries are
            # dropped and the accuracy is written on its own line.
            filtered_report = {k: v for k, v in report.items() if k not in ('accuracy', 'macro avg', 'weighted avg', 'micro avg')}
            df = pd.DataFrame(filtered_report).T
            acc = report['accuracy']
            f.write(f"Model: {name}\n")
            f.write(f"confusion matrix:\n {cf}\n")
            f.write(f"classification report: \n accuracy = {acc}\n {df[['precision', 'recall', 'f1-score']]}\n")
            f.write(f"------------------------------------------------------------\n")
print("Evaluation results exported to .txt file")

In [None]:
# Rows 50-69 of `bola3`: inputs and matching labels.
X_bola3_5070, y_bola3_5070 = X_bola3[50:70], y_bola3[50:70]

In [None]:
# Fit the SVM and the decision tree on the current training data.
model1 = models['SVM']
model1.fit(X_train, y_train)
model2 = models['CART']
model2.fit(X_train, y_train)

# Predict rows 50-69 of `bola3` with both models.
y_predict1, y_predict2 = (m.predict(X_bola3_5070) for m in (model1, model2))

In [None]:
# Show true labels next to the predictions of both models.
print(
    'Model: SVM',
    f'actuals: {y_bola3_5070}',
    f'predict: {y_predict1}',
    "================",
    'Model: CART',
    f'actuals: {y_bola3_5070}',
    f'predict: {y_predict2}',
    sep="\n",
)


In [None]:
# Display the listed columns for rows 50-63 of `bola3`.
columns = [
    'posisi1', 'posisi2', 'posisi3', 'posisi4', 'posisi5',
    'arah_k', 'arah_E', 'sb_putar', 'sudut1', 'sudut2',
    'ket',
]
bola3[columns].iloc[50:64]

In [None]:
# Rows 300-319 of `bola3`: inputs and matching labels.
X_bola3_300320, y_bola3_300320 = X_bola3[300:320], y_bola3[300:320]

# Predict them with the previously fitted SVM (`model1`) and tree (`model2`).
y_predict3, y_predict4 = (m.predict(X_bola3_300320) for m in (model1, model2))

In [None]:
# Show true labels next to the predictions of both models.
print(
    'Model: SVM',
    f'actuals: {y_bola3_300320}',
    f'predict: {y_predict3}',
    "================",
    'Model: CART',
    f'actuals: {y_bola3_300320}',
    f'predict: {y_predict4}',
    sep="\n",
)

In [None]:
# Display the listed columns for rows 300-307 of `bola3`.
columns = [
    'posisi1', 'posisi2', 'posisi3', 'posisi4', 'posisi5',
    'arah_k', 'arah_E', 'sb_putar', 'sudut1', 'sudut2',
    'ket',
]
bola3[columns].iloc[300:308]

In [None]:
import os

In [None]:
# Overlay the first row of each selected raw file on one set of axes.
folder_path = "../../data/raw/3"

# Files to plot; the legend label is the two characters before ".xlsx".
selected_files = [
    '10011_Y4_00.xlsx', '10011_Y4_15.xlsx', '10011_Y4_30.xlsx', '10011_Y4_45.xlsx',
    '10011_Y4_60.xlsx', '10011_Y4_75.xlsx', '10011_Y4_90.xlsx',
]

# x axis: 350 to 850 in steps of 2.
lam = np.arange(350, 851, 2)

fig, ax = plt.subplots(figsize=(8, 5))

for file in selected_files:
    df = read_excel(os.path.join(folder_path, file), header=None)
    csca = df.iloc[0]

    # Only plot rows whose length matches the x axis; report the others.
    if len(csca) == len(lam):
        ax.plot(lam, csca, label=file[-7:-5])
    else:
        print(f"Warning: {file} has {len(csca)} values but x has {len(lam)} points")

ax.set_xlabel('wavelength (nm)')
ax.set_ylim([0, 1.5E-13])
ax.set_ylabel('Csca (m$^2$)')
ax.legend()
fig.tight_layout()
plt.show()
    

In [None]:
# Overlay the first row of each selected raw file on one set of axes.
folder_path = "../../data/raw/3"

# Files to plot; the legend label is the two characters before ".xlsx".
# NOTE(review): '00111_Y4_30.xlsx' and '01011_X1_30.xlsx' both get the
# label "30", so the legend cannot tell them apart — confirm this is wanted.
selected_files = [
    '00111_Y4_00.xlsx', '00111_Y4_15.xlsx', '00111_Y4_30.xlsx',
    '00111_Y4_45.xlsx', '00111_Y4_60.xlsx', '00111_Y4_75.xlsx', '00111_Y4_90.xlsx',
    '01011_X1_30.xlsx',
]

# x axis: 350 to 850 in steps of 2.
lam = np.arange(350, 851, 2)

fig, ax = plt.subplots(figsize=(8, 5))

for file in selected_files:
    df = read_excel(os.path.join(folder_path, file), header=None)
    csca = df.iloc[0]

    # Only plot rows whose length matches the x axis; report the others.
    if len(csca) == len(lam):
        ax.plot(lam, csca, label=file[-7:-5])
    else:
        print(f"Warning: {file} has {len(csca)} values but x has {len(lam)} points")

ax.set_xlabel('wavelength (nm)')
ax.set_ylim([0, 1.5E-13])
ax.set_ylabel('Csca (m$^2$)')
ax.legend()
fig.tight_layout()
plt.show()
    

In [None]:
# Overlay the first row of each selected raw file on one set of axes.
folder_path = "../../data/raw/3"

# Files to plot; the legend label is the two characters before ".xlsx".
selected_files = [
    '10101_X1_00.xlsx', '10101_X1_15.xlsx', '10101_X1_30.xlsx',
    '10101_X1_45.xlsx', '10101_X1_60.xlsx', '10101_X1_75.xlsx', '10101_X1_90.xlsx',
]

# x axis: 350 to 850 in steps of 2.
lam = np.arange(350, 851, 2)

fig, ax = plt.subplots(figsize=(8, 5))

for file in selected_files:
    df = read_excel(os.path.join(folder_path, file), header=None)
    csca = df.iloc[0]

    # Only plot rows whose length matches the x axis; report the others.
    if len(csca) == len(lam):
        ax.plot(lam, csca, label=file[-7:-5])
    else:
        print(f"Warning: {file} has {len(csca)} values but x has {len(lam)} points")

ax.set_xlabel('wavelength (nm)')
ax.set_ylim([0, 1.5E-13])
ax.set_ylabel('Csca (m$^2$)')
ax.legend()
fig.tight_layout()
plt.show()
    

In [None]:
# Overlay the first row of each selected raw file on one set of axes.
folder_path = "../../data/raw/3"

# Files to plot; the legend label is the two characters before ".xlsx".
selected_files = [
    '10011_Y4_00.xlsx', '10011_Y4_15.xlsx', '10011_Y4_30.xlsx',
    '10011_Y4_45.xlsx', '10011_Y4_90.xlsx',
]

# x axis: 350 to 850 in steps of 2.
lam = np.arange(350, 851, 2)

fig, ax = plt.subplots(figsize=(8, 5))

for file in selected_files:
    df = read_excel(os.path.join(folder_path, file), header=None)
    csca = df.iloc[0]

    # Only plot rows whose length matches the x axis; report the others.
    if len(csca) == len(lam):
        ax.plot(lam, csca, label=file[-7:-5])
    else:
        print(f"Warning: {file} has {len(csca)} values but x has {len(lam)} points")

ax.set_xlabel('wavelength (nm)')
ax.set_ylim([0, 1.5E-13])
ax.set_ylabel('Csca (m$^2$)')
ax.legend()
fig.tight_layout()
plt.show()
    

## 2. Validation for `bola12345.xlsx` Data

In [None]:
# Load dataset
url = "../../data/processed/den/bola12345-1.xlsx"
names = ['lam_max1', 'csc_max1', 'lam_min', 'csc_min','lam_fwhm1','c_mid','lam_max2','csc_max2','lam_fwhm2','fwhm','posisi1','posisi2','posisi3','posisi4','posisi5','arah_k','arah_E','sb_putar','sudut1','sudut2','ket'] 
dataset = read_excel(url, names=names, header=None)

# Define input and output data
X = dataset.iloc[:,0:10].values
y = dataset.iloc[:,20].values

In [None]:
import seaborn as sns

In [None]:
# Kernel-density plot of `fwhm`, one curve per value of `ket`.
sns.displot(data=dataset, x='fwhm', hue='ket', kind='kde')

In [None]:
# Rows per class label.
dataset.loc[:, 'ket'].value_counts()

In [None]:
# Split the data: `test_size=0.8` sends 80 % of the rows to the validation
# part and leaves 20 % for training.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.8, random_state=2)

# Report both sizes and the resulting validation fraction.
print(len(y_train), len(y_valid), sep="\n")
print(len(y_valid) / (len(y_train) + len(y_valid)))

In [None]:
# Fit every model on the training part, score it on the validation part and
# append its confusion matrix and classification report to a text file.
with open("output_validation_bola12345.txt", "w") as f:
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_predict = model.predict(X_valid)
        report = classification_report(y_valid, y_predict)
        cf = confusion_matrix(y_valid, y_predict)

        f.writelines([
            f"Model: {name}\n",
            f"confusion matrix:\n {cf}\n",
            f"classification report: \n {report}\n",
            "=====================================================\n",
        ])
print("Evaluation results exported to .txt file")

## 3. Training the Model on `2_sph.xlsx` to Predict Experimental Data

In [None]:
# Load dataset
url = "../../data/processed/den/2_sph.xlsx"
names = ['lam_max1', 'csc_max1', 'lam_min', 'csc_min','lam_fwhm1','c_mid','lam_max2','csc_max2','lam_fwhm2','fwhm','posisi1','posisi2','posisi3','posisi4','posisi5','arah_k','arah_E','sb_putar','sudut1','sudut2','ket'] 
dataset = read_excel(url, names=names, header=None)

# Define input and output data
X = dataset.iloc[:,0:10].values
y = dataset.iloc[:,20].values

In [None]:
# Rows per class label.
dataset.loc[:, 'ket'].value_counts()

In [None]:
# No hold-out split here: the whole table is used for training.
X_train, y_train = X, y

In [None]:
# Read the `Exp_2bola` table (no header row; labels supplied here).
url = "../../data/processed/den/Exp_2bola.xlsx"
names = [
    'lam_max1', 'csc_max1', 'lam_min', 'csc_min', 'lam_fwhm1',
    'c_mid', 'lam_max2', 'csc_max2', 'lam_fwhm2', 'fwhm',
    'posisi1', 'posisi2', 'posisi3', 'posisi4', 'posisi5',
    'arah_k', 'arah_E', 'sb_putar', 'sudut1', 'sudut2',
    'ket',
]
exp = pd.read_excel(url, header=None, names=names)

# Inputs are the first ten columns; the target is column index 20 (`ket`).
X_exp = exp.iloc[:, :10].to_numpy()
y_exp = exp.iloc[:, 20].to_numpy()

In [None]:
# Use the experiment table as the evaluation set.
X_valid, y_valid = X_exp, y_exp

# Fit every model on the current training data, score it on the experiment
# data and append its confusion matrix and classification report to a file.
with open("output_validation_experiment_2.txt", "w") as f:
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_predict = model.predict(X_valid)
        report = classification_report(y_valid, y_predict)
        cf = confusion_matrix(y_valid, y_predict)

        f.writelines([
            f"Model: {name}\n",
            f"confusion matrix:\n {cf}\n",
            f"classification report: \n {report}\n",
            "=====================================================\n",
        ])
print("Evaluation results exported to .txt file")