In [11]:
from tqdm import tqdm
import json
import pandas as pd
from pathlib import Path
import numpy as np

from sklearn.cluster import KMeans
from gensim.models import Word2Vec

In [12]:
split_dir = Path('dataset') / 'scispacy_split'

In [13]:
X_train = np.load(split_dir / 'X_train.npy')
X_train_sent = np.load(split_dir / 'X_train_sent.npy')
X_test = np.load(split_dir / 'X_test.npy')
X_test_sent = np.load(split_dir / 'X_test_sent.npy')
y_train = np.load(split_dir / 'y_train.npy')
y_test = np.load(split_dir / 'y_test.npy')

len(X_train), len(X_test), len(y_train), len(y_test), len(X_train_sent), len(X_test_sent)

(400, 101, 400, 101, 400, 101)

In [14]:
X_train = [' '.join(row) for row in X_train]
X_test = [' '.join(row) for row in X_test]

X_train[0], X_test[0]

('I mildly no choice car windy trip weekend opinion carsickness Meclizine <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>',
 'I sleeping days stomach’s burning pretzels I hungry acid reflux throat drink water I throat I panic <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

In [15]:
skipgram_model = Word2Vec(sentences=X_train, vector_size=64, window=4, min_count=1, sg=1, workers=-1)

In [16]:
train_vec_size = 63

In [17]:
X_train_final = []
X_test_final = []

for i in tqdm(range(len(X_train))):
    text = X_train[i]
    sentiment = X_train_sent[i]

    tokens = text.split()
    vectors = [skipgram_model.wv[token] for token in tokens if token in skipgram_model.wv]

    if len(vectors) > train_vec_size:
        vectors = vectors[:train_vec_size]
    elif len(vectors) < train_vec_size:
        vectors += [np.zeros(64)] * (train_vec_size - len(vectors))

    combined_vector = np.concatenate((np.mean(vectors, axis=0), np.array([sentiment])))

    X_train_final.append(combined_vector)

for i in tqdm(range(len(X_test))):
    text = X_test[i]
    sentiment = X_test_sent[i]

    tokens = text.split()
    vectors = [skipgram_model.wv[token] for token in tokens if token in skipgram_model.wv]

    if len(vectors) > train_vec_size:
        vectors = vectors[:train_vec_size]
    elif len(vectors) < train_vec_size:
        vectors += [np.zeros(64)] * (train_vec_size - len(vectors))

    combined_vector = np.concatenate((np.mean(vectors, axis=0), np.array([sentiment])))

    X_test_final.append(combined_vector)

X_train_final = np.array(X_train_final)
X_train_final = X_train_final.astype(np.float64)
X_test_final = np.array(X_test_final)
X_test_final = X_test_final.astype(np.float64)

len(X_train_final), len(X_test_final), len(y_train), len(y_test)
    

100%|██████████| 400/400 [00:00<00:00, 6507.36it/s]
100%|██████████| 101/101 [00:00<00:00, 6207.50it/s]


(400, 101, 400, 101)

In [18]:
n_clusters = len(['Question', 'Needing support - Panic attack', 'Rant', 'Potentially Triggering', 'Does Anyone Else...?', 'Needing support: Just not feeling good'])

In [19]:
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10, max_iter=300, verbose=1)

y_train_pred = kmeans.fit_predict(X_train_final)

Initialization complete
Iteration 0, inertia 3.1182375028348317.
Iteration 1, inertia 2.515953801631749.
Iteration 2, inertia 2.401292254780998.
Iteration 3, inertia 2.3493745394486156.
Iteration 4, inertia 2.3192778977748323.
Iteration 5, inertia 2.3079034100568427.
Iteration 6, inertia 2.289012988286173.
Iteration 7, inertia 2.2852574958550007.
Converged at iteration 7: strict convergence.
Initialization complete
Iteration 0, inertia 3.0587812727936026.
Iteration 1, inertia 2.6742390171720256.
Iteration 2, inertia 2.5616377033410007.
Iteration 3, inertia 2.4966999687642666.
Iteration 4, inertia 2.4514931813296896.
Iteration 5, inertia 2.406733256802481.
Iteration 6, inertia 2.3449043981141817.
Iteration 7, inertia 2.310661931471672.
Iteration 8, inertia 2.2931318326993324.
Iteration 9, inertia 2.2854704716962226.
Iteration 10, inertia 2.2852574958550007.
Converged at iteration 10: strict convergence.
Initialization complete
Iteration 0, inertia 2.6749994906137156.
Iteration 1, inerti

In [20]:
from sklearn.metrics import silhouette_score
silhouette_train = silhouette_score(X_train_final, y_train_pred)
print(f'Silhouette score for training set: {silhouette_train}')

Silhouette score for training set: 0.644107968693191
