In [1]:
import sys
sys.path.insert(0, '../scripts')

import torch

import meals as ml
from meal_classifiers import *
from unsupervised_helpers import *
import numpy as np
from preprocessing import read_excel_by_sheet
from sklearn.metrics.pairwise import cosine_similarity
# Meal-extraction settings. Both values are passed unchanged to
# ml.extract_meals_for_model when the per-sheet meals are built further down.
# NOTE(review): the unit of time_threshold is not visible in this notebook
# (presumably seconds between pellets) — confirm in meals.py.
time_threshold = 60
# NOTE(review): presumably the minimum pellet count for an event to qualify
# as a meal — confirm in meals.py.
pellet_count_threshold = 2
from path import *

In [2]:
def good_meal_by_n_pellets(predictions, categories, pellet_counts=(3, 4, 5)):
    """Return, per category, the percentage of meals whose prediction equals 0.

    Parameters
    ----------
    predictions : iterable
        One predicted label per meal. A meal is counted as a "zero" when its
        label compares equal to 0 (presumably the "good meal" class, going by
        the function name — confirm against the classifier's label encoding).
    categories : iterable
        One category per meal, paired positionally with ``predictions``.
        Pairs are consumed with ``zip``, so extra items in the longer input
        are ignored.
    pellet_counts : iterable, optional
        The categories to report, in output order. Defaults to ``(3, 4, 5)``,
        which reproduces the previously hard-coded behaviour. Meals whose
        category is not listed here are skipped.

    Returns
    -------
    list
        One percentage in the range 0-100 per distinct entry of
        ``pellet_counts``, in the same order. A category with no meals
        yields ``0``.
    """
    category_counts = {count: {'total': 0, 'zeros': 0} for count in pellet_counts}

    # Tally every meal that falls into one of the requested categories.
    for pred, cat in zip(predictions, categories):
        if cat in category_counts:
            category_counts[cat]['total'] += 1
            if pred == 0:
                category_counts[cat]['zeros'] += 1

    percentages = []
    for tally in category_counts.values():
        total = tally['total']
        # Guard the division: an unseen category reports 0 rather than raising.
        percentages.append((tally['zeros'] / total) * 100 if total > 0 else 0)
    return percentages

In [3]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# model = CNNClassifier(num_classes=2, maxlen=4).to(device)
model = RNNClassifier(input_size=1, hidden_size=400, num_classes=2, num_layers=2).to(device)
# map_location remaps the stored tensors onto `device`. Without it, a
# checkpoint that was saved from a CUDA session cannot be deserialized on a
# CPU-only machine, even though `device` was already chosen as 'cpu' above.
model.load_state_dict(torch.load('../data/LSTM_from_CASK.pth', map_location=device))

<All keys matched successfully>

In [4]:
def _good_meal_profiles(sheets, workbook_path):
    """Build one good-meal percentage vector per sheet of a workbook.

    For every sheet name in ``sheets`` the sheet is read from
    ``workbook_path``, its meals are extracted with the notebook-level
    ``time_threshold`` / ``pellet_count_threshold`` settings, the loaded
    ``model`` classifies them, and ``good_meal_by_n_pellets`` summarises the
    predictions.

    Returns a NumPy array with one row per sheet; each row is the list
    returned by ``good_meal_by_n_pellets`` for that sheet.
    """
    profiles = []
    for sheet in sheets:
        data = read_excel_by_sheet(sheet, workbook_path)
        meals, meals_len = ml.extract_meals_for_model(data, time_threshold, pellet_count_threshold)
        preds = predict(model, meals)
        profiles.append(good_meal_by_n_pellets(preds, meals_len))
    return np.array(profiles)


# Same pipeline for both groups; only the sheet list and workbook differ.
male_data = _good_meal_profiles(rev_male_sheets, rev_male_path)
female_data = _good_meal_profiles(rev_female_sheets, rev_female_path)

#### Cosine Similarity

In [5]:
# Cosine similarity between every row of male_data and every row of
# female_data (rows of the result follow male_data, columns follow female_data).
similarity_matrix = cosine_similarity(male_data, female_data)

# Reduce the full matrix to scalar summary statistics.
mean_similarity = similarity_matrix.mean()
max_similarity = similarity_matrix.max()
min_similarity = similarity_matrix.min()
variance_similarity = similarity_matrix.var()

# Report each statistic on its own line.
for label, value in (
    ("Mean Similarity:", mean_similarity),
    ("Max Similarity:", max_similarity),
    ("Min Similarity:", min_similarity),
    ("Variance of Similarity:", variance_similarity),
):
    print(label, value)

Mean Similarity: 0.9439150285195235
Max Similarity: 0.9996734811557442
Min Similarity: 0.7099739620540488
Variance of Similarity: 0.003944626953636261


#### Centroids Cosine Similarity

In [6]:
# Group centroids: the per-feature mean over all rows of each group.
centroid_M1 = male_data.mean(axis=0)
centroid_M2 = female_data.mean(axis=0)

# cosine_similarity works on 2-D inputs, so each centroid is passed as a
# single-row matrix and the lone entry of the 1x1 result is extracted.
centroid_similarity = cosine_similarity(
    centroid_M1.reshape(1, -1),
    centroid_M2.reshape(1, -1),
)[0, 0]

print("Cosine Similarity Between Group Centroids:", centroid_similarity)

Cosine Similarity Between Group Centroids: 0.9957519159019085


In [7]:
# Per-feature standard deviation of each group (np.std default, i.e. ddof=0),
# reported next to the group centroids computed in the centroid cell.
std_M1 = np.std(male_data, axis=0)
std_M2 = np.std(female_data, axis=0)
# The labels name the groups that are actually summarised here: the values
# come from male_data and female_data, matching the naming used in the plots.
print(f'Male Group Mean: {centroid_M1}; STD: {std_M1}')
print(f'Female Group Mean: {centroid_M2}; STD: {std_M2}')

Control Group Mean: [55.52307796 50.44314254 48.64749836]; STD: [ 7.71701028  9.95343587 15.99995533]
CASK Group Mean: [50.8160234  38.54519941 36.72416993]; STD: [11.12058559 13.96634426 16.16993982]


#### Wasserstein Distance

In [8]:
from scipy.stats import wasserstein_distance

# Z-score both groups with statistics pooled over all rows of both groups,
# so the per-feature distances below are on a common scale.
combined_data = np.vstack([male_data, female_data])

# Column-wise mean and standard deviation of the pooled data.
mean = np.mean(combined_data, axis=0)
std = np.std(combined_data, axis=0)

# A feature that is constant across every row has std == 0; dividing by it
# would turn that whole column into NaN. Such a column is already all zeros
# after centring, so it is scaled by 1 instead. Columns with a non-zero std
# are normalized exactly as before.
safe_std = np.where(std == 0, 1.0, std)

male_data_normalized = (male_data - mean) / safe_std
female_data_normalized = (female_data - mean) / safe_std

# 1-D Wasserstein distance between the male and female samples, computed
# independently for each feature column (normalized data).
wasserstein_distances = [
    wasserstein_distance(male_data_normalized[:, i], female_data_normalized[:, i])
    for i in range(male_data_normalized.shape[1])
]
# Same per-feature distances on the raw (unnormalized) values.
unnormalized_distance = [
    wasserstein_distance(male_data[:, i], female_data[:, i])
    for i in range(male_data.shape[1])
]
unnormalized_mean = np.mean(unnormalized_distance)

mean_wasserstein_distance = np.mean(wasserstein_distances)

print("Wasserstein distances per dimension (normalized):", wasserstein_distances)
print("Mean Wasserstein Distance (normalized):", mean_wasserstein_distance)
print("Mean Wasserstein distance (unnormalized):", unnormalized_mean)

Wasserstein distances per dimension (normalized): [0.6233642055120513, 0.8950625616648947, 0.6956444714931735]
Mean Wasserstein Distance (normalized): 0.7380237462233733
Mean Wasserstein distance (unnormalized): 9.974332022209543


#### KL Divergence

In [9]:
import numpy as np
from scipy.stats import entropy

# Pool every feature of every row into a single 1-D sample per group, so the
# comparison is made over all dimensions together.
M1_flattened = male_data.flatten()
M2_flattened = female_data.flatten()

# Shared bin edges, derived from both samples, so the two histograms are
# defined over the same 30 bins.
bins = np.histogram_bin_edges(np.concatenate([M1_flattened, M2_flattened]), bins=30)
P, _ = np.histogram(M1_flattened, bins=bins, density=True)
Q, _ = np.histogram(M2_flattened, bins=bins, density=True)

# Add a small value so that empty bins do not produce division by zero
# (or log of zero) inside the KL computation.
epsilon = 1e-10
P += epsilon
Q += epsilon

# Renormalize so each histogram sums to 1 and is a valid probability vector.
P /= P.sum()
Q /= Q.sum()

# KL divergence is asymmetric, so both directions are reported under their
# own names. P is the male histogram, Q the female one.
kl_male_female = entropy(P, Q)  # D_KL(P || Q)
print(f"KL Divergence (male || female): {kl_male_female}")
kl_female_male = entropy(Q, P)  # D_KL(Q || P)
print(f"KL Divergence (female || male): {kl_female_male}")

# Kept for compatibility: this name previously ended the cell bound to the
# female || male value.
kl_divergence = kl_female_male


KL Divergence (male || female): 3.9217157476253384
KL Divergence (female || male): 4.885500643455609


#### 3D Visualization

In [10]:
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Interactive 3-D scatter of the raw feature vectors, one trace per group.
fig = go.Figure()

# Male points are added first, then female points; the three feature columns
# map to the x, y and z axes respectively.
for group_points, marker_color, legend_name in (
    (male_data, 'blue', 'Male Group'),
    (female_data, 'red', 'Female Group'),
):
    fig.add_trace(go.Scatter3d(
        x=group_points[:, 0],
        y=group_points[:, 1],
        z=group_points[:, 2],
        mode='markers',
        marker=dict(size=5, color=marker_color, opacity=0.7),
        name=legend_name,
    ))

# All three axes share the same background colour setting; only the title differs.
scene_axes = {
    axis_key: dict(title=axis_title, backgroundcolor='rgba(211,211,211)')
    for axis_key, axis_title in (
        ('xaxis', 'Feature 1'),
        ('yaxis', 'Feature 2'),
        ('zaxis', 'Feature 3'),
    )
}

fig.update_layout(
    title="3D Visualization of Male and Female Groups (Unnormalized)",
    scene=scene_axes,
    margin=dict(l=10, r=10, t=30, b=10),  # tight margins to reduce whitespace
    width=800,
    height=600,
)

# Render the interactive figure in the notebook.
fig.show()

In [11]:
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Interactive 3-D scatter of the z-scored feature vectors, one trace per group.
fig = go.Figure()

# Male points are added first, then female points; the three feature columns
# map to the x, y and z axes respectively.
for group_points, marker_color, legend_name in (
    (male_data_normalized, 'blue', 'Male Group'),
    (female_data_normalized, 'red', 'Female Group'),
):
    fig.add_trace(go.Scatter3d(
        x=group_points[:, 0],
        y=group_points[:, 1],
        z=group_points[:, 2],
        mode='markers',
        marker=dict(size=5, color=marker_color, opacity=0.7),
        name=legend_name,
    ))

# All three axes share the same background colour setting; only the title differs.
scene_axes = {
    axis_key: dict(title=axis_title, backgroundcolor='rgba(211,211,211)')
    for axis_key, axis_title in (
        ('xaxis', 'Feature 1'),
        ('yaxis', 'Feature 2'),
        ('zaxis', 'Feature 3'),
    )
}

fig.update_layout(
    title="3D Visualization of Male and Female Groups (Normalized)",
    scene=scene_axes,
    margin=dict(l=10, r=10, t=30, b=10),  # tight margins to reduce whitespace
    width=800,
    height=600,
)

# Render the interactive figure in the notebook.
fig.show()

#### Logistic Regression

In [12]:
# Feature matrix: all male rows stacked on top of all female rows.
X = np.concatenate([male_data, female_data], axis=0)
# Labels in the same row order: 0.0 for every male row, 1.0 for every female row.
y = np.repeat([0.0, 1.0], [male_data.shape[0], female_data.shape[0]])

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score


In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a logistic regression model under its own name. `model` already holds
# the PyTorch meal classifier that the data-extraction cell passes to
# predict(); rebinding it here would make that cell fail when re-run.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Predict class probabilities on the held-out rows.
y_pred_proba = logreg.predict_proba(X_test)

# Log loss (cross-entropy) on the test split, plus accuracy on both splits.
loss = log_loss(y_test, y_pred_proba)
train_acc = accuracy_score(y_true=y_train, y_pred=logreg.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=logreg.predict(X_test))
print(f"Log Loss: {loss:.4f}; Train Accuracy: {train_acc:.3f}; Test Accuracy: {test_acc:.3f}")

Log Loss: 0.7773; Train Accuracy: 0.750; Test Accuracy: 0.429


#### KMeans

In [15]:
# Import KMeans explicitly: no cell in the visible notebook imports it by
# name, so it was only reachable through one of the star imports.
from sklearn.cluster import KMeans

# Unsupervised two-cluster split of the combined feature matrix.
kmeans = KMeans(n_clusters=2, random_state=42)
y_pred = kmeans.fit_predict(X)

# Cluster ids are arbitrary, so keep whichever of the two possible labelings
# (as-is or flipped) agrees better with the true group labels.
if accuracy_score(y, y_pred) < accuracy_score(y, 1 - y_pred):
    y_pred = 1 - y_pred

# Share of rows whose cluster matches their true group.
accuracy = accuracy_score(y, y_pred)
print(f"Clustering Accuracy: {accuracy:.4f}")

Clustering Accuracy: 0.7143
