In [1]:
import numpy as np
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import torch 
import torch.nn as nn 

In [2]:
from TextAutoEncoder import TextAutoEncoder
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
AE_MODEL_SAVE_PATH = "C:/Users/Korhan/Desktop/workspace/vsCodeWorkspace/Python_Workspace/mental_health_sentiment_analysis/autoencoder.pt"

model = TextAutoEncoder(input_dim=300, latent_dim=16).to(device=DEVICE)
model.load_state_dict(torch.load(AE_MODEL_SAVE_PATH, map_location=torch.device(DEVICE)))

<All keys matched successfully>

### Calculating the Latent Vectors : 

In [3]:
from TextDataset import TextDataset
from torch.utils.data import Dataset, DataLoader
BATCH_SIZE = 16

X_train_embeddings = np.load("X_train_embeddings.npy")
y_train_encoded = np.load("y_train_encoded.npy")
X_test_embeddings = np.load("X_test_embeddings.npy")
y_test_encoded = np.load("y_test_encoded.npy")

X_train_embeddings = np.array(X_train_embeddings)
X_test_embeddings = np.array(X_test_embeddings)

y_train_encoded = np.array(y_train_encoded)
y_test_encoded = np.array(y_test_encoded)

y_train_encoded = np.reshape(y_train_encoded, (-1,))
y_test_encoded = np.reshape(y_test_encoded, (-1,))


train_dataset = TextDataset(X_train_embeddings, y_train_encoded)
test_dataset = TextDataset(X_test_embeddings, y_test_encoded)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [4]:
model.eval()
latent_embeddings = []
with torch.no_grad():
    for X, y in train_dataloader:
        X, y = X.to(DEVICE), y.to(DEVICE)
        y_pred, latent_embedding_batch = model(X)
        latent_embeddings.append(latent_embedding_batch)

In [5]:
latent_embeddings = torch.cat(latent_embeddings, dim=0)

print("Final latent embeddings shape:", latent_embeddings.shape)

Final latent embeddings shape: torch.Size([84868, 16])


In [6]:
np.save("latent_embeddings.npy", latent_embeddings.cpu().numpy())

### Fitting Mog to the Latent Descriptors : 

In [7]:
from sklearn.mixture import GaussianMixture

latent_embeddings_np = latent_embeddings.cpu().numpy()

num_components = 7

from sklearn.model_selection import GridSearchCV
from sklearn.mixture import GaussianMixture

param_grid = {
    'n_components': [5, 6, 7],
    'covariance_type': ['full', 'tied', 'diag', 'spherical'],
    'max_iter': [100, 200, 500],
}

gmm = GaussianMixture()
grid_search = GridSearchCV(gmm, param_grid, cv=3, n_jobs=-1)
grid_search.fit(latent_embeddings_np)

best_model = grid_search.best_estimator_
print("Best Model: ", best_model)

print("Gaussian Means Shape:", best_model.means_.shape)
print("Covariances Shape:", best_model.covariances_.shape)

Best Model:  GaussianMixture(n_components=7)
Gaussian Means Shape: (7, 16)
Covariances Shape: (7, 16, 16)


In [8]:
true_labels = []
with torch.no_grad():
    for X, y in train_dataloader:
        true_labels.append(y.cpu().numpy())

In [9]:
true_labels = np.concatenate(true_labels, axis=0)

In [10]:
probs = best_model.predict_proba(latent_embeddings_np)
assignments = np.argmax(probs, axis=1)


In [11]:
print("Gerçek Etiketler (İlk 10):", true_labels[:5])
print("GMM Atamaları (İlk 10):", assignments[:5])

Gerçek Etiketler (İlk 10): [[2]
 [5]
 [6]
 [3]
 [3]]
GMM Atamaları (İlk 10): [6 1 1 3 0]


In [12]:
from sklearn.metrics import adjusted_rand_score

ari_score = adjusted_rand_score(true_labels.squeeze(1), assignments)
print("Adjusted Rand Index (ARI):", ari_score)

Adjusted Rand Index (ARI): 0.10220583752979834
