In [1]:
import sys
sys.path.insert(0, '../scripts')

import torch

import meals as ml
from meal_classifiers import *
from unsupervised_helpers import *
import numpy as np
from preprocessing import read_excel_by_sheet
from sklearn.metrics.pairwise import cosine_similarity
# Thresholds forwarded to ml.extract_meals_for_model when meals are extracted further down.
# NOTE(review): the unit of time_threshold is defined by that helper, not here —
# presumably the maximum gap between pellets of one meal; confirm against scripts/meals.py.
time_threshold = 60
# NOTE(review): presumably the minimum number of pellets for a bout to count as a meal — confirm.
pellet_count_threshold = 2
from path import *

In [2]:
def good_meal_by_n_pellets(predictions, categories, pellet_counts=(3, 4, 5)):
    """Percentage of meals predicted as class 0, reported per pellet-count bucket.

    Args:
        predictions: Iterable of predicted class labels, one per meal. A meal
            is counted when its label equals 0.
        categories: Iterable parallel to ``predictions`` giving the bucket
            (pellet count) of each meal. Meals whose bucket is not listed in
            ``pellet_counts`` are ignored.
        pellet_counts: Buckets to report, in output order. Defaults to
            ``(3, 4, 5)``, the buckets this function originally hard-coded.

    Returns:
        A list with one entry per bucket in ``pellet_counts``:
        ``100 * (#class-0 meals) / (#meals)`` for that bucket, or ``0`` when
        the bucket received no meals at all.
    """
    totals = dict.fromkeys(pellet_counts, 0)
    zeros = dict.fromkeys(pellet_counts, 0)

    # zip() stops at the shorter input, so unmatched trailing items are ignored.
    for pred, cat in zip(predictions, categories):
        if cat in totals:
            totals[cat] += 1
            if pred == 0:
                zeros[cat] += 1

    # An empty bucket reports 0 rather than dividing by zero.
    return [
        (zeros[cat] / totals[cat]) * 100 if totals[cat] > 0 else 0
        for cat in pellet_counts
    ]

In [3]:
# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# model = CNNClassifier(num_classes=2, maxlen=4).to(device)
model = RNNClassifier(input_size=1, hidden_size=400, num_classes=2, num_layers=2).to(device)
# map_location remaps the stored tensors onto `device`; without it a checkpoint
# saved from a CUDA model cannot be deserialized on a CPU-only machine.
model.load_state_dict(torch.load('./LSTM_from_CASK.pth', map_location=device))

<All keys matched successfully>

In [4]:
def _pellet_profiles(sheets, workbook_path, classifier):
    """Build one good_meal_by_n_pellets profile per sheet of a workbook.

    Args:
        sheets: Iterable of sheet identifiers passed to read_excel_by_sheet.
        workbook_path: Path of the workbook the sheets belong to.
        classifier: Model handed to predict() to label the extracted meals.

    Returns:
        np.ndarray with one row per sheet; each row is the list returned by
        good_meal_by_n_pellets for that sheet's meals.
    """
    profiles = []
    for sheet in sheets:
        data = read_excel_by_sheet(sheet, workbook_path)
        meals, meals_len = ml.extract_meals_for_model(data, time_threshold, pellet_count_threshold)
        preds = predict(classifier, meals)
        profiles.append(good_meal_by_n_pellets(preds, meals_len))
    return np.array(profiles)


# Both cohorts go through the same pipeline, so the steps cannot drift apart.
ctrl_data = _pellet_profiles(rev_ctrl_sheets, rev_ctrl_path, model)
cask_data = _pellet_profiles(rev_cask_sheets, rev_cask_path, model)

#### Cosine Similarity

In [5]:
# Pairwise comparison: entry [i, j] is the cosine similarity between
# control row i and CASK row j.
similarity_matrix = cosine_similarity(ctrl_data, cask_data)

# Summary statistics taken over every control/CASK pair.
mean_similarity = similarity_matrix.mean()
max_similarity = similarity_matrix.max()
min_similarity = similarity_matrix.min()
variance_similarity = similarity_matrix.var()

for caption, stat in (
    ("Mean Similarity:", mean_similarity),
    ("Max Similarity:", max_similarity),
    ("Min Similarity:", min_similarity),
    ("Variance of Similarity:", variance_similarity),
):
    print(caption, stat)

Mean Similarity: 0.9707122768469617
Max Similarity: 0.9999799078019359
Min Similarity: 0.7226637203153151
Variance of Similarity: 0.001962304382042059


#### Centroids Cosine Similarity

In [6]:
# Group centroids: the column-wise mean over all rows of each cohort.
centroid_M1 = ctrl_data.mean(axis=0)
centroid_M2 = cask_data.mean(axis=0)

# cosine_similarity works on 2-D inputs, so each centroid is passed as a
# single-row matrix; the only entry of the 1x1 result is the similarity.
centroid_similarity = cosine_similarity(
    centroid_M1.reshape(1, -1),
    centroid_M2.reshape(1, -1),
)[0, 0]

print("Cosine Similarity Between Group Centroids:", centroid_similarity)

Cosine Similarity Between Group Centroids: 0.9999996586557744


In [7]:
# Column-wise spread of each cohort. NumPy's default ddof=0 applies, i.e. this
# is the population standard deviation, not the sample one.
std_M1 = ctrl_data.std(axis=0)
std_M2 = cask_data.std(axis=0)

# Report each cohort's centroid next to its spread.
print(f'Control Group Mean: {centroid_M1}; STD: {std_M1}')
print(f'CASK Group Mean: {centroid_M2}; STD: {std_M2}')

Control Group Mean: [54.71109913 52.89117039 53.03310483]; STD: [ 8.3805239  11.39277633 14.44403729]
CASK Group Mean: [48.61574713 47.05912941 47.090181  ]; STD: [10.82321876 11.04119935 15.44495944]


#### Wasserstein Distance

In [8]:
from scipy.stats import wasserstein_distance


def _columnwise_wasserstein(first, second):
    """Return the 1-D Wasserstein distance between matching columns of two 2-D arrays."""
    return [
        wasserstein_distance(first[:, col], second[:, col])
        for col in range(first.shape[1])
    ]


# z-score both cohorts with the column-wise mean/std of the pooled data, so the
# normalized distances are expressed in pooled standard deviations.
combined_data = np.concatenate([ctrl_data, cask_data], axis=0)
mean = combined_data.mean(axis=0)
std = combined_data.std(axis=0)
ctrl_data_normalized = (ctrl_data - mean) / std
cask_data_normalized = (cask_data - mean) / std

# One distance per column, on the normalized scale ...
wasserstein_distances = _columnwise_wasserstein(ctrl_data_normalized, cask_data_normalized)
mean_wasserstein_distance = np.mean(wasserstein_distances)

# ... and on the raw percentage scale.
unnormalized_distance = _columnwise_wasserstein(ctrl_data, cask_data)
unnormalized_mean = np.mean(unnormalized_distance)

print("Wasserstein distances per dimension (normalized):", wasserstein_distances)
print("Mean Wasserstein Distance (normalized):", mean_wasserstein_distance)
print("Mean Wasserstein distance (unnormalized):", unnormalized_mean)

Wasserstein distances per dimension (normalized): [0.6048234134884379, 0.502766919167065, 0.408190295638837]
Mean Wasserstein Distance (normalized): 0.5052602094314466
Mean Wasserstein distance (unnormalized): 6.046053759558366


#### KL Divergence

In [9]:
import numpy as np
from scipy.stats import entropy

# Pool every value of a cohort into one 1-D sample; all columns are mixed
# together, so this compares the overall value distributions of the two groups.
M1_flattened = ctrl_data.flatten()
M2_flattened = cask_data.flatten()

# Shared bin edges so that P and Q are histograms over the same support.
bins = np.histogram_bin_edges(np.concatenate([M1_flattened, M2_flattened]), bins=30)
P, _ = np.histogram(M1_flattened, bins=bins, density=True)
Q, _ = np.histogram(M2_flattened, bins=bins, density=True)

# Smooth empty bins: a zero in the reference distribution would otherwise make
# the divergence infinite.
epsilon = 1e-10
P += epsilon
Q += epsilon

# Renormalize so each histogram sums to 1.
P /= P.sum()
Q /= Q.sum()

# KL divergence is asymmetric, so both directions are computed and kept under
# separate names instead of overwriting a single variable.
kl_ctrl_cask = entropy(P, Q)  # D_KL(P || Q)
print(f"KL Divergence (ctrl || cask): {kl_ctrl_cask}")
kl_cask_ctrl = entropy(Q, P)  # D_KL(Q || P)
print(f"KL Divergence (cask || ctrl): {kl_cask_ctrl}")

# Kept for backward compatibility: this name previously ended up holding the
# cask || ctrl value.
kl_divergence = kl_cask_ctrl


KL Divergence (ctrl || cask): 1.6059875915650688
KL Divergence (cask || ctrl): 2.20475664166971


#### 3D Visualization

In [10]:
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# One 3-D marker trace per cohort; columns 0, 1 and 2 of each data matrix are
# plotted on the x, y and z axes respectively.
fig = go.Figure(data=[
    go.Scatter3d(
        x=points[:, 0],
        y=points[:, 1],
        z=points[:, 2],
        mode='markers',
        marker={'size': 5, 'color': marker_color, 'opacity': 0.7},
        name=legend_name,
    )
    for points, marker_color, legend_name in (
        (ctrl_data, 'blue', 'Control Group'),
        (cask_data, 'red', 'Cask Group'),
    )
])

# Titles, axis captions and a compact fixed-size canvas.
fig.update_layout(
    title="3D Visualization of Control and Cask Groups (Unnormalized)",
    scene={
        'xaxis': {'title': 'Feature 1', 'backgroundcolor': 'rgba(211,211,211)'},
        'yaxis': {'title': 'Feature 2', 'backgroundcolor': 'rgba(211,211,211)'},
        'zaxis': {'title': 'Feature 3', 'backgroundcolor': 'rgba(211,211,211)'},
    },
    margin={'l': 10, 'r': 10, 't': 30, 'b': 10},  # trim the whitespace around the scene
    width=800,
    height=600,
)

# Render the interactive figure in the notebook.
fig.show()

In [11]:
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Same scatter as the unnormalized view, drawn from the z-scored matrices:
# columns 0, 1 and 2 map to the x, y and z axes, one trace per cohort.
fig = go.Figure(data=[
    go.Scatter3d(
        x=points[:, 0],
        y=points[:, 1],
        z=points[:, 2],
        mode='markers',
        marker={'size': 5, 'color': marker_color, 'opacity': 0.7},
        name=legend_name,
    )
    for points, marker_color, legend_name in (
        (ctrl_data_normalized, 'blue', 'Control Group'),
        (cask_data_normalized, 'red', 'Cask Group'),
    )
])

# Titles, axis captions and a compact fixed-size canvas.
fig.update_layout(
    title="3D Visualization of Control and Cask Groups (Normalized)",
    scene={
        'xaxis': {'title': 'Feature 1', 'backgroundcolor': 'rgba(211,211,211)'},
        'yaxis': {'title': 'Feature 2', 'backgroundcolor': 'rgba(211,211,211)'},
        'zaxis': {'title': 'Feature 3', 'backgroundcolor': 'rgba(211,211,211)'},
    },
    margin={'l': 10, 'r': 10, 't': 30, 'b': 10},  # trim the whitespace around the scene
    width=800,
    height=600,
)

# Render the interactive figure in the notebook.
fig.show()

#### Logistic Regression

In [12]:
# Design matrix: control rows first, CASK rows after them.
X = np.concatenate([ctrl_data, cask_data], axis=0)
# Matching labels: 0.0 for every control row, 1.0 for every CASK row.
y = np.concatenate([np.zeros(len(ctrl_data)), np.ones(len(cask_data))])

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score


In [14]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a logistic regression model. It gets its own name so the LSTM bound to
# `model` earlier in the notebook is not overwritten — the meal-extraction cell
# still needs that network if it is re-run.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Predict class probabilities on the held-out rows
y_pred_proba = logreg.predict_proba(X_test)

# Cross-entropy on the held-out rows. Passing the fitted class labels keeps
# log_loss well-defined even if the small test split contains only one class.
loss = log_loss(y_test, y_pred_proba, labels=logreg.classes_)
train_acc = accuracy_score(y_true=y_train, y_pred=logreg.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=logreg.predict(X_test))
print(f"Log Loss: {loss:.4f}; Train Accuracy: {train_acc:.3f}; Test Accuracy: {test_acc:.3f}")

Log Loss: 0.6771; Train Accuracy: 0.625; Test Accuracy: 0.545


#### KMeans

In [15]:
# NOTE(review): KMeans has no explicit import in the visible cells — presumably
# it arrives through one of the star imports at the top; confirm.
kmeans = KMeans(n_clusters=2, random_state=42)
y_pred = kmeans.fit_predict(X)

# Cluster ids are arbitrary, so keep whichever of the two possible
# cluster-to-group assignments agrees better with the true labels.
flipped_pred = 1 - y_pred
if accuracy_score(y, flipped_pred) > accuracy_score(y, y_pred):
    y_pred = flipped_pred

# Agreement between the aligned clusters and the true groups
accuracy = accuracy_score(y, y_pred)
print(f"Clustering Accuracy: {accuracy:.4f}")

Clustering Accuracy: 0.6078
