# Important Frequencies Selection

## Setup

First, we need to install the necessary libraries. Run the following cell to install them.

In [9]:
# Install the notebook's third-party dependencies into the kernel's environment.
# Deep-learning stack: PyTorch plus its vision/audio companion packages.
%pip install torch torchvision torchaudio
# Data handling (pandas) and classical machine-learning tooling (scikit-learn).
%pip install pandas scikit-learn
# Experiment tracking (wandb) and ONNX; -U upgrades to the newest release, -q quiets pip's output.
%pip install wandb onnx -Uq
# joblib is used further down to persist the fitted label encoder.
%pip install joblib

Note: you may need to restart the kernel to use updated packages.

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Import Libraries and seed
Import the necessary libraries for data processing, model building, training, and evaluation. Adding a seed ensures reproducibility by making sure that the random number generation is consistent across different runs.

In [32]:
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from torch.utils.data import DataLoader, TensorDataset
from collections import Counter

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


import wandb

def set_seed(seed):
    """Seed every random number generator used by the notebook.

    Seeds NumPy, PyTorch (CPU) and Python's ``random`` module. When CUDA is
    available, the CUDA generators are seeded as well and cuDNN is switched to
    deterministic mode with benchmarking disabled.

    Args:
        seed: Integer seed applied to all generators.

    Returns:
        The same ``seed`` value, so the call can be used inline in an assignment.
    """
    for seeder in (np.random.seed, torch.manual_seed, random.seed):
        seeder(seed)

    # Nothing GPU-related to configure on a CPU-only machine.
    if not torch.cuda.is_available():
        return seed

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    return seed

# Select the compute device: the GPU when CUDA is usable, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)


Using device: cpu


In [12]:
def load_data_from_directory(input_path):
    """Load every ``.csv`` file of a directory into a single DataFrame.

    Each file is parsed with ``;`` as the delimiter and its first row as the
    header. The frames are stacked in sorted file-name order and re-indexed
    from 0. The combined frame and its shape are printed before returning.

    Args:
        input_path: Directory to scan (non-recursively) for names ending in
            ``.csv``.

    Returns:
        pandas.DataFrame holding the rows of all files.

    Raises:
        ValueError: If the directory contains no ``.csv`` file.
    """
    # os.listdir yields entries in arbitrary order; sorting keeps the row order
    # of the concatenated frame reproducible across runs and machines.
    csv_files = sorted(name for name in os.listdir(input_path) if name.endswith('.csv'))
    if not csv_files:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(f"No .csv files found in {input_path!r}")

    data_frames = [
        pd.read_csv(os.path.join(input_path, name), delimiter=';', header=0)
        for name in csv_files
    ]
    data = pd.concat(data_frames, ignore_index=True)

    print(data)
    print(data.shape)

    return data

## Preprocessing Data
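Define the preprocessing functions: window-wise averaging (mean and standard deviation) of the LG/HG signals per sample and frequency, and encoding of the categorical labels. Feature standardization is not applied yet (TBD).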

In [13]:
def calculate_averages_and_dispersion(data, data_percentage):
    """Summarise the LG/HG signals over consecutive windows of each group.

    Rows are grouped by ``'Sample'`` and ``'Frequency (GHz)'``. Every group is
    cut into consecutive, non-overlapping windows whose length is
    ``data_percentage`` percent of the group (at least one row); the last
    window of a group may be shorter. For each window the mean and the
    standard deviation of ``'LG (mV)'`` and ``'HG (mV)'`` are recorded.

    Args:
        data: DataFrame with the columns 'Sample', 'Frequency (GHz)',
            'LG (mV)' and 'HG (mV)'.
        data_percentage: Window length as a percentage of the group size.

    Returns:
        DataFrame with one row per window and the columns
        'Frequency (GHz)', 'LG (mV) mean', 'HG (mV) mean',
        'LG (mV) std deviation', 'HG (mV) std deviation', 'Sample'.
    """
    signal_cols = ['LG (mV)', 'HG (mV)']
    rows = []

    for (sample, freq), grp in data.groupby(['Sample', 'Frequency (GHz)']):
        n_rows = len(grp)
        step = max(1, int(n_rows * data_percentage / 100))

        start = 0
        while start < n_rows:
            chunk = grp.iloc[start:start + step][signal_cols]
            avg = chunk.mean()
            spread = chunk.std()
            # 'Thickness (mm)' is intentionally not carried over to the summary.
            rows.append({
                'Frequency (GHz)': freq,
                'LG (mV) mean': avg['LG (mV)'],
                'HG (mV) mean': avg['HG (mV)'],
                'LG (mV) std deviation': spread['LG (mV)'],
                'HG (mV) std deviation': spread['HG (mV)'],
                'Sample': sample,
            })
            start += step

    return pd.DataFrame(rows)

In [14]:
def split_label(df, encoder_path='label_encoder.pkl'):
    """Split a DataFrame into a feature matrix and a target vector.

    The last column is taken as the target; all preceding columns are the
    features. An object-dtype target is encoded to integers with a
    ``LabelEncoder``, and the fitted encoder is saved with joblib so the
    encoding can be reversed later. Features are returned unscaled
    (normalization is still TBD).

    Args:
        df: DataFrame whose last column holds the labels.
        encoder_path: File the fitted encoder is written to. Only used when
            the target actually needed encoding.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: features and (possibly encoded) labels.
    """
    # Assuming the last column is the target
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values

    # Encode the target variable if it's categorical
    if y.dtype == 'object':
        le = LabelEncoder()
        y = le.fit_transform(y)
        # Save the encoder only when one was fitted: for an already-numeric
        # target there is no encoder, and dumping it would raise NameError.
        joblib.dump(le, encoder_path)

    return X, y

## Pivoting Frequency values to columns

In [None]:
def freq_as_variable(df, data_percentage):
    '''Reshape the measurements so that each frequency becomes a set of input columns.

    The data is first reduced to windowed means and standard deviations, then
    pivoted to wide format: one row per (Sample, window number) and one column
    per (statistic, frequency) pair, named like ``'LG (mV) mean 100.0'``.

    Args:
        df: Long-format DataFrame with 'Sample', 'Frequency (GHz)', 'LG (mV)'
            and 'HG (mV)' columns; an optional 'Thickness (mm)' column is dropped.
        data_percentage: Window length as a percentage of each
            (Sample, Frequency) group. Per the original note, 3.7 corresponds
            to a 1 s window of a 27 s recording (100/27).

    Returns:
        Wide DataFrame with alphabetically sorted columns; 'Sample' sorts
        after the statistic columns and therefore ends up last.
    '''

    # Remove Thickness column
    if 'Thickness (mm)' in df.columns:
        df = df.drop(columns=['Thickness (mm)'])

    # Use the caller's window size rather than a fixed 3.7, which silently
    # ignored the argument.
    df_window = calculate_averages_and_dispersion(df, data_percentage=data_percentage)

    print(df_window)

    # Number the windows inside each (Sample, Frequency) group so the pivot
    # index is unique
    df_window['unique_id'] = df_window.groupby(['Sample', 'Frequency (GHz)']).cumcount()

    # Pivot the DataFrame to wide format
    df_pivot = df_window.pivot(index=['Sample', 'unique_id'], columns='Frequency (GHz)')

    # Flatten the (statistic, frequency) MultiIndex columns into single strings
    df_pivot.columns = [' '.join(map(str, col)) for col in df_pivot.columns]

    # Drop columns with all NaN values
    df_pivot = df_pivot.dropna(axis=1, how='all')

    # Reset index to make 'Sample' and 'unique_id' columns again
    df_pivot = df_pivot.reset_index()

    # Sort the columns alphabetically
    df_pivot = df_pivot.reindex(sorted(df_pivot.columns), axis=1)

    # Remove 'unique_id' column
    df_pivot = df_pivot.drop(columns=['unique_id'])

    return df_pivot

In [None]:
# Base directory of the training files. Set the PIC_DATA_DIR environment
# variable to run the notebook on another machine; the default keeps the
# original location so existing setups behave exactly as before.
data_dir = os.environ.get(
    'PIC_DATA_DIR',
    'C:/Users/Danim/Documents/GitHub/PIC-PAPER-01/data/experiment_1_plastics/processed_27s/training_file',
).rstrip('/\\')

# Load the data from the directory
input_path = f'{data_dir}/test/'
df = load_data_from_directory(input_path)

# Introduce Frequency values as input variables
df = freq_as_variable(df, data_percentage=3.7)

# Save the processed data to a CSV file next to the input directory
output_path = f'{data_dir}/test.csv'
df.to_csv(output_path, sep=';', index=False)

# Split the data into X (features) and y (encoded labels)
X, y = split_label(df)
print(f'X : {X}')
print(f'y : {y}')

# TODO: convert X / y to PyTorch tensors once the neural-network model is added.

        Sample  Frequency (GHz)     LG (mV)    HG (mV)  Thickness (mm)
0           A1            100.0   -7.080942  -0.854611             0.2
1           A1            100.0   67.024785   0.244141             0.2
2           A1            100.0  124.893178  -1.098776             0.2
3           A1            100.0   91.075571   0.000000             0.2
4           A1            100.0   48.956174   0.122094             0.2
...        ...              ...         ...        ...             ...
2420620    REF            600.0    0.366256  16.237333             0.0
2420621    REF            600.0    0.000000  -7.080942             0.0
2420622    REF            600.0   -0.244170  15.260652             0.0
2420623    REF            600.0    0.366256  20.021975             0.0
2420624    REF            600.0    0.122085  13.185203             0.0

[2420625 rows x 5 columns]
(2420625, 5)
       Frequency (GHz)  LG (mV) mean  HG (mV) mean  LG (mV) std deviation  \
0                100.0     54.

## Log in to Weights & Biases

In [26]:
###
### SKIP FOR NOW ###
###

# SECURITY: never hard-code the API key in the notebook. The key that used to
# be set here was committed in plain text and must be revoked and regenerated.
# Provide the key outside the notebook instead (e.g. export WANDB_API_KEY in
# the shell that starts Jupyter); wandb.login() reads it from the environment
# and prompts for it interactively when it is not set.
wandb.login()

# wandb.init(project='PIC-PAPER-01', entity='UC3M', name='RandomForest')
# wandb.watch(rf_model)

env: WANDB_API_KEY=<redacted>


True

## Prepare Training Data

In [53]:
# Hold out 15% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

## Train Model

In [None]:
# --- Model parameters -------------------------------------------------------
seed = set_seed(42)
input_size = X.shape[1]            # number of feature columns
output_size = len(np.unique(y))    # number of distinct classes

# --- Model ------------------------------------------------------------------
# Random Forest is the active model. Alternatives kept for later comparison:
#   nb_model = GaussianNB()
#   lr_model = LogisticRegression(max_iter=1000, random_state = seed)
rf_model = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    random_state=seed,
)

# --- Training and prediction (wandb metric tracking: TBD) ---------------------
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Side-by-side view of the first true/predicted labels
df_comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(df_comparison.head(10))

# --- Metrics (class-weighted averages for the multi-class scores) -----------
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1: {f1}",
    sep="\n",
)

# Per-class report
print(classification_report(y_test, y_pred))

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

# Feature importance: TBD

# Log metrics wandb
# wandb.log({"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1})






   Actual  Predicted
0       5          5
1      11         11
2       6          6
3      13         13
4      14         14
5       2          2
6       4          4
7       4          4
8       1          1
9       2          2
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         4
           2       1.00      1.00      1.00        10
           3       1.00      1.00      1.00         3
           4       1.00      1.00      1.00         4
           5       1.00      1.00      1.00         2
           6       1.00      1.00      1.00         5
           7       1.00      1.00      1.00         3
           8       1.00      1.00      1.00         4
           9       1.00      1.00      1.00         4
          10       1.00      1.00      1.00         3
          11       1.00      1.00      1.00         4
          12     

## Load and preprocess Other Test Data (Experiment 2)

In [None]:
#TBD

In [None]:
# ## Confusion Matrix
# conf_matrix = confusion_matrix(le.inverse_transform(y_test.cpu().numpy()), predicted_labels, labels=le.classes_)

# # Confusion matrix with matplotlib
# plt.figure(figsize=(12, 10))
# plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
# plt.title('Confusion Matrix')
# plt.colorbar()
# tick_marks = np.arange(len(le.classes_))
# plt.xticks(tick_marks, le.classes_, rotation=45)
# plt.yticks(tick_marks, le.classes_)

# # Normalize the confusion matrix
# conf_matrix_normalized = np.nan_to_num(conf_matrix.astype('float') / conf_matrix.sum(axis=1)[:, np.newaxis])

# # Print the normalized values inside the matrix as percentages
# thresh = conf_matrix.max() / 2.
# for i, j in np.ndindex(conf_matrix.shape):
#     plt.text(j, i, f"{conf_matrix[i, j]}\n ({conf_matrix_normalized[i, j] * 100:.1f}%)",
#              horizontalalignment="center",
#              verticalalignment="center",
#              fontsize=9,
#              color="white" if conf_matrix[i, j] > thresh else "black")

# plt.ylabel('True label')
# plt.xlabel('Predicted label')
# plt.tight_layout()

## Feature Importance TBD

## Labels

| Original Label | Encoded Value |
|----------------|---------------|
| A1             | 0             |
| B1             | 1             |
| C1             | 2             |
| D1             | 3             |
| E1             | 4             |
| F1             | 5             |
| G1             | 6             |
| H1             | 7             |
| I1             | 8             |
| J1             | 9             |
| K1             | 10            |
| L1             | 11            |
| M1             | 12            |
| N1             | 13            |
| REF            | 14            |