In [2]:
import os

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [9]:
# Location of the feature matrix. The default is the original author's local
# WSL path; set the FEATURE_MATRIX_CSV environment variable to point the
# notebook at a copy of the file on another machine without editing this cell.
hkr_wsl_data = "/home/hk-wsl/code/uom_explore/model_input/feature_matrix.csv"
data_path = os.environ.get("FEATURE_MATRIX_CSV", hkr_wsl_data)
df = pd.read_csv(data_path)

# If the CSV was saved together with its row index, pandas loads that index as
# a column named "Unnamed: 0". Downstream cells only drop 'experiment_id' and
# 'channel_id', so such a column would be plotted and fed to PCA/t-SNE as if it
# were a sensor reading. Dropping it is a no-op when no such column exists.
leaked_index_cols = [c for c in df.columns if str(c).startswith("Unnamed:")]
df = df.drop(columns=leaked_index_cols)

df.head(20)

Unnamed: 0,experiment_id,140,150,152,155,157,160,162,165,167,...,230,232,235,237,240,242,245,247,250,channel_id
0,02_17_16s1c0r0,0.289992,0.265276,0.308857,0.316474,0.301683,0.354073,0.308133,0.308787,0.253672,...,0.351504,0.319964,0.304643,0.30134,0.315273,0.300079,0.30616,0.297775,0.302454,0
1,02_17_51s1c1r0,0.190347,0.201155,0.261942,0.259362,0.239436,0.21484,0.243046,0.197277,0.206084,...,0.214454,0.215712,0.209748,0.206435,0.221081,0.231202,0.197481,0.207455,0.201596,1
2,02_18_26s1c2r0,0.099843,0.100285,0.123185,0.163675,0.353439,0.310796,0.265924,0.260067,0.109472,...,0.131489,0.084056,0.108258,0.112591,0.125032,0.115088,0.111438,0.109358,0.081325,2
3,02_19_01s1c3r0,0.290731,0.266106,0.35553,0.396649,0.635214,0.552152,0.323083,0.364723,0.30788,...,0.302109,0.309628,0.306289,0.30722,0.311936,0.309058,0.292207,0.280615,0.278412,3
4,02_19_36s1c4r0,6.039159,6.132384,6.641051,5.897604,5.541157,5.579751,5.564905,5.541405,5.909158,...,7.065215,6.754524,6.712956,6.57087,6.562759,6.455663,6.417492,6.324318,6.198248,4
5,02_20_11s1c0r1,0.180546,0.16752,0.335601,0.469539,0.44098,0.371804,0.293083,0.195628,0.190061,...,0.196156,0.176294,0.181031,0.18244,0.191652,0.167608,0.1668,0.172777,0.168669,0
6,02_20_47s1c1r1,0.172724,0.183373,0.290345,0.268579,0.246678,0.408043,0.347959,0.257394,0.24567,...,0.214077,0.213215,0.207059,0.207254,0.180134,0.176348,0.181485,0.183667,0.169821,1
7,02_21_22s1c2r1,0.075445,0.093645,0.216286,0.235086,0.210154,0.363281,0.302005,0.212478,0.181599,...,0.121652,0.110778,0.09481,0.103905,0.107537,0.106934,0.121252,0.082124,0.080748,2
8,02_21_57s1c3r1,0.217517,0.250882,0.309497,0.416828,0.347345,0.32725,0.31784,0.277075,0.272408,...,0.24254,0.250049,0.24995,0.251612,0.26233,0.253037,0.237478,0.236406,0.220911,3
9,02_22_32s1c4r1,5.381885,5.568157,5.964051,5.982613,7.865407,7.211391,5.463492,5.54672,5.584767,...,6.233153,6.065726,6.061627,5.943027,5.738385,5.609565,5.612581,5.636566,5.610314,4


In [22]:
# Scatter plot of every experiment recorded on a single channel.
target_channel = 4
filtered_df = df[df['channel_id'] == target_channel]

# Keep only the reading columns, then flip the table so that each experiment
# becomes one column and the former column labels become the index.
filtered_df_numeric = filtered_df.drop(columns=['experiment_id', 'channel_id'])
filtered_df_numeric_T = filtered_df_numeric.T

fig = go.Figure()

# One marker-only trace per experiment. The transposed columns are in the same
# order as the rows of filtered_df, so the ids can be paired positionally.
settings = filtered_df_numeric_T.index
experiment_ids = filtered_df["experiment_id"].tolist()
for experiment_id, (_, readings) in zip(experiment_ids, filtered_df_numeric_T.items()):
    trace = go.Scatter(
        x=settings,
        y=readings,
        mode='markers',
        name=f'Experiment {experiment_id}',
    )
    fig.add_trace(trace)

layout_options = dict(
    title=f'Sensor Values for Channel ID {target_channel}',
    xaxis_title='Setting',
    yaxis_title='Sensor Value',
    legend_title='Experiment',
    template='ggplot2',
)
fig.update_layout(**layout_options)

fig.show()


# Paneled Charts

In [27]:
import plotly.subplots as sp

# One panel per channel id (0 to 4) on a 3x2 grid; the sixth cell stays empty.
panel_titles = [f'Channel ID {i}' for i in range(5)]
fig = sp.make_subplots(
    rows=3,
    cols=2,
    shared_xaxes=True,
    vertical_spacing=0.05,
    subplot_titles=panel_titles,
)

for target_channel in range(5):
    # Rows of this channel, reduced to the reading columns and transposed so
    # that each experiment is a column.
    filtered_df = df[df['channel_id'] == target_channel]
    filtered_df_numeric = filtered_df.drop(columns=['experiment_id', 'channel_id'])
    filtered_df_numeric_T = filtered_df_numeric.T

    # Fill the grid left-to-right, top-to-bottom; plotly grid positions are 1-based.
    grid_row, grid_col = divmod(target_channel, 2)
    row, col = grid_row + 1, grid_col + 1

    # One marker-only trace per experiment, paired positionally with its id.
    settings = filtered_df_numeric_T.index
    experiment_ids = filtered_df["experiment_id"].tolist()
    for experiment_id, (_, readings) in zip(experiment_ids, filtered_df_numeric_T.items()):
        trace = go.Scatter(
            x=settings,
            y=readings,
            mode='markers',
            name=f'Experiment {experiment_id}',
        )
        fig.add_trace(trace, row=row, col=col)

# Kept as the last expression of the cell: update_layout returns the figure,
# which is what makes the notebook render it here.
fig.update_layout(
    height=1500,
    title_text='Sensor Values for Channel IDs 0 to 4',
    showlegend=False,  # the per-experiment legend would be too cluttered
    template='ggplot2',
)

# PCA

In [43]:

# Compute standardized PCA features for channels 0-4 and save them to CSV.
# PCA and StandardScaler are imported in the notebook's top import cell, so
# this cell runs in file order on a fresh kernel.

# Filter the dataframe for the desired channel IDs (0 to 4 in this case)
filtered_df = df[df['channel_id'].isin([0, 1, 2, 3, 4])]

# Drop non-numeric columns for PCA
df_numeric = filtered_df.drop(columns=['experiment_id', 'channel_id'])

# Standardize the data before PCA
scaler = StandardScaler()
df_numeric_scaled = scaler.fit_transform(df_numeric)

# Perform PCA; n_components drives both the fit and the PC1..PCn column names
n_components = 5
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(df_numeric_scaled)

# Create a DataFrame with the principal components, then re-attach the labels.
# `.values` is used because filtered_df keeps df's row labels while pca_df has
# a fresh 0..n-1 index; assigning the Series directly would align on labels.
pca_df = pd.DataFrame(data=principal_components, columns=[f'PC{i+1}' for i in range(n_components)])
pca_df['channel_id'] = filtered_df['channel_id'].values
pca_df['experiment_id'] = filtered_df['experiment_id'].values

# Save the PCA features to CSV (written to the current working directory)
pca_df.to_csv('feature_pca.csv', index=False)

## Plotting PCA

In [30]:
# perform PCA on the data
from sklearn.decomposition import PCA

# 2-D PCA of the raw (unscaled) readings for channels 0-4.
# NOTE(review): no standardization is applied in this cell; the following cell
# repeats the analysis on standardized data.

# Filter the dataframe for the desired channel IDs (0 to 4 in this case)
filtered_df = df[df['channel_id'].isin([0, 1, 2, 3, 4])]

# Drop non-numeric columns for PCA
df_numeric = filtered_df.drop(columns=['experiment_id', 'channel_id'])

# Perform PCA
pca = PCA(n_components=2)
principal_components = pca.fit_transform(df_numeric)

# Create a DataFrame with the principal components
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
# channel_id is a class label, not a quantity: store it as a string so Plotly
# Express assigns one discrete colour per channel (with a legend) instead of
# mapping the ids onto a continuous colour bar.
pca_df['channel_id'] = filtered_df['channel_id'].astype(str).values

# Visualize the PCA result using Plotly
fig = px.scatter(
    pca_df,
    x='PC1',
    y='PC2',
    color='channel_id',
    category_orders={'channel_id': sorted(pca_df['channel_id'].unique())},
    title='PCA Analysis of Sensor Values',
)
fig.show()

In [31]:
from sklearn.preprocessing import StandardScaler

# 3-D PCA on standardized readings. This cell reuses `df_numeric` and
# `filtered_df` as left by the preceding 2-D PCA cell.

# Standardize the data before PCA
scaler = StandardScaler()
df_numeric_scaled = scaler.fit_transform(df_numeric)

# Perform PCA with 3 components
pca = PCA(n_components=3)
principal_components = pca.fit_transform(df_numeric_scaled)

# Create a DataFrame with the principal components
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2', 'PC3'])
# channel_id is a class label, not a quantity: store it as a string so Plotly
# Express assigns one discrete colour per channel (with a legend) instead of
# mapping the ids onto a continuous colour bar.
pca_df['channel_id'] = filtered_df['channel_id'].astype(str).values

# Visualize the PCA result using Plotly for the first three principal components
fig = px.scatter_3d(
    pca_df,
    x='PC1',
    y='PC2',
    z='PC3',
    color='channel_id',
    category_orders={'channel_id': sorted(pca_df['channel_id'].unique())},
    title='PCA Analysis of Sensor Values',
)
fig.show()


# t-SNE

In [32]:
from sklearn.manifold import TSNE

# 2-D t-SNE embedding of the standardized readings for channels 0-4.

# Filter the dataframe for the desired channel IDs (0 to 4 in this case)
filtered_df = df[df['channel_id'].isin([0, 1, 2, 3, 4])]

# Drop non-numeric columns for t-SNE
df_numeric = filtered_df.drop(columns=['experiment_id', 'channel_id'])

# Standardize the data before t-SNE
scaler = StandardScaler()
df_numeric_scaled = scaler.fit_transform(df_numeric)

# Perform t-SNE (fixed random_state so reruns use the same seed)
tsne = TSNE(n_components=2, random_state=42)
tsne_components = tsne.fit_transform(df_numeric_scaled)

# Create a DataFrame with the t-SNE components
tsne_df = pd.DataFrame(data=tsne_components, columns=['TSNE1', 'TSNE2'])
# channel_id is a class label, not a quantity: store it as a string so Plotly
# Express assigns one discrete colour per channel (with a legend) instead of
# mapping the ids onto a continuous colour bar.
tsne_df['channel_id'] = filtered_df['channel_id'].astype(str).values

# Visualize the t-SNE result using Plotly
fig = px.scatter(
    tsne_df,
    x='TSNE1',
    y='TSNE2',
    color='channel_id',
    category_orders={'channel_id': sorted(tsne_df['channel_id'].unique())},
    title='t-SNE Analysis of Sensor Values',
)
fig.show()