In [1]:
from tqdm import tqdm
import json
import pandas as pd
from pathlib import Path
import numpy as np

from sklearn.cluster import KMeans
from gensim.models import Word2Vec

In [2]:
split_dir = Path('dataset') / 'biobert_split'

In [3]:
X_train = np.load(split_dir / 'X_train.npy')
X_train_sent = np.load(split_dir / 'X_train_sent.npy')
X_test = np.load(split_dir / 'X_test.npy')
X_test_sent = np.load(split_dir / 'X_test_sent.npy')
y_train = np.load(split_dir / 'y_train.npy')
y_test = np.load(split_dir / 'y_test.npy')

len(X_train), len(X_test), len(y_train), len(y_test), len(X_train_sent), len(X_test_sent)

(338, 85, 338, 85, 338, 85)

In [4]:
X_train = [' '.join(row) for row in X_train]
X_test = [' '.join(row) for row in X_test]

X_train[0], X_test[0]

('em ##etophobia flying fly motion over the years husband 7 – 8 hour books 1 hour na husband regressed every day motion fly feel trapped <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>',
 'intense emetophobia freaking out close boba tea stomach ache panic tired few hours later went to sleep woke up house ill fine eat stomach rumble almost 1 in the afternoon 30 minutes sick z ##ofran pep sick <PAD>

In [5]:
skipgram_model = Word2Vec(sentences=X_train, vector_size=64, window=10, min_count=1, sg=1, workers=-1)

In [6]:
train_vec_size = 64

In [7]:
X_train_final = []
X_test_final = []

for i in tqdm(range(len(X_train))):
    text = X_train[i]
    sentiment = X_train_sent[i]

    tokens = text.split()
    vectors = [skipgram_model.wv[token] for token in tokens if token in skipgram_model.wv]

    if len(vectors) > train_vec_size:
        vectors = vectors[:train_vec_size]
    elif len(vectors) < train_vec_size:
        vectors += [np.zeros(64)] * (train_vec_size - len(vectors))

    combined_vector = np.concatenate((np.mean(vectors, axis=0), np.array([sentiment])))
    print(len(combined_vector), sentiment, len(vectors))

    X_train_final.append(combined_vector)

for i in tqdm(range(len(X_test))):
    text = X_test[i]
    sentiment = X_test_sent[i]

    tokens = text.split()
    vectors = [skipgram_model.wv[token] for token in tokens if token in skipgram_model.wv]

    if len(vectors) > train_vec_size:
        vectors = vectors[:train_vec_size]
    elif len(vectors) < train_vec_size:
        vectors += [np.zeros(64)] * (train_vec_size - len(vectors))

    combined_vector = np.concatenate((np.mean(vectors, axis=0), np.array([sentiment])))

    X_test_final.append(combined_vector)

X_train_final = np.array(X_train_final)
X_train_final = X_train_final.astype(np.float64)
X_test_final = np.array(X_test_final)
X_test_final = X_test_final.astype(np.float64)

# len(X_train_final), len(X_test_final), len(y_train), len(y_test), X_train_final[0]
    

100%|██████████| 338/338 [00:00<00:00, 7681.38it/s]


65 -0.7819 64
65 -0.9469 64
65 -0.9586 64
65 -0.7489 64
65 0.4194 64
65 -0.9179 64
65 0.296 64
65 -0.9843 64
65 0.7722 64
65 -0.823 64
65 0.3687 64
65 -0.9581 64
65 -0.98 64
65 -0.8762 64
65 -0.9589 64
65 -0.9511 64
65 -0.981 64
65 -0.8628 64
65 -0.9852 64
65 -0.6124 64
65 -0.8976 64
65 -0.9678 64
65 -0.9442 64
65 -0.962 64
65 -0.5858 64
65 -0.9774 64
65 -0.7619 64
65 0.8733 64
65 -0.9597 64
65 -0.4401 64
65 -0.8182 64
65 0.2444 64
65 -0.7684 64
65 -0.984 64
65 -0.8664 64
65 -0.8534 64
65 0.2991 64
65 -0.6948 64
65 0.6936 64
65 -0.5472 64
65 -0.9194 64
65 -0.8878 64
65 0.4056 64
65 -0.8053 64
65 0.7305 64
65 -0.9588 64
65 -0.6329 64
65 -0.8885 64
65 -0.9166 64
65 -0.9818 64
65 -0.963 64
65 -0.5341 64
65 -0.9152 64
65 -0.8016 64
65 -0.9933 64
65 0.9334 64
65 -0.898 64
65 -0.8844 64
65 -0.9961 64
65 -0.8526 64
65 0.5927 64
65 0.7992 64
65 -0.296 64
65 -0.7269 64
65 0.1027 64
65 -0.9887 64
65 -0.9769 64
65 -0.8335 64
65 -0.9029 64
65 -0.3919 64
65 -0.6261 64
65 -0.1178 64
65 -0.9528 64
65

100%|██████████| 85/85 [00:00<00:00, 9923.89it/s]


In [8]:
n_clusters = len(['Question', 'Needing support - Panic attack', 'Rant', 'Potentially Triggering', 'Does Anyone Else...?', 'Needing support: Just not feeling good'])

In [9]:
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10, max_iter=300, verbose=1)

y_train_pred = kmeans.fit_predict(X_train_final)

Initialization complete
Iteration 0, inertia 4.0297757189224805.
Iteration 1, inertia 2.228868438212303.
Iteration 2, inertia 2.108934177114295.
Iteration 3, inertia 2.089351329429714.
Iteration 4, inertia 2.0808854640302297.
Iteration 5, inertia 2.0739314279404164.
Iteration 6, inertia 2.0630802719314896.
Iteration 7, inertia 2.0574672758099495.
Iteration 8, inertia 2.05199262958622.
Iteration 9, inertia 2.0381306896138787.
Iteration 10, inertia 2.036892043031641.
Iteration 11, inertia 2.0361440763871212.
Iteration 12, inertia 2.0360331203334043.
Converged at iteration 12: strict convergence.
Initialization complete
Iteration 0, inertia 2.3039618526694103.
Iteration 1, inertia 1.9653779013536983.
Iteration 2, inertia 1.8630965923517853.
Iteration 3, inertia 1.8394791970596622.
Iteration 4, inertia 1.837362402185891.
Iteration 5, inertia 1.831513782770773.
Iteration 6, inertia 1.8296351719514456.
Iteration 7, inertia 1.8293381008498193.
Iteration 8, inertia 1.8289248638384588.
Iteratio

In [10]:
from sklearn.metrics import silhouette_score
silhouette_train = silhouette_score(X_train_final, y_train_pred)
print(f'Silhouette score for training set: {silhouette_train}')

Silhouette score for training set: 0.634794811808537
