In [1]:
from tqdm import tqdm
import json
import pandas as pd
from pathlib import Path
import numpy as np

from sklearn.model_selection import train_test_split

In [2]:
# Locations used by the rest of the notebook (relative to the working directory):
#   ner_dir   - where the entity-annotated posts are read from
#   split_dir - where the train/test arrays are written
ner_dir = Path('ner')
split_dir = Path('dataset', 'biobert_split')

In [3]:
# Read the posts together with their extracted entities (the file name suggests
# BioBERT NER output). The encoding is pinned to UTF-8 so decoding does not
# depend on the platform's locale default.
with open(ner_dir / 'emetophobia_posts_ner_biobert.json', 'r', encoding='utf-8') as f:
    all_posts = json.load(f)

# Displayed as the cell output: number of posts loaded.
len(all_posts)

986

In [4]:
# Read the sentiment-scored version of the posts; each entry is later accessed
# as entry['sentiment']['compound'].
emotions_dir = Path('sentiments')

# Encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default (the file name hints at emoji content - TODO confirm).
with open(emotions_dir / 'emetophobia_posts_sent_emoji.json', 'r', encoding='utf-8') as f:
    emoji_posts = json.load(f)

# Displayed as the cell output: should match len(all_posts), because the two
# lists are paired by position further down.
len(emoji_posts)

986

In [5]:
# Assemble one feature row per usable post:
#   128 entity tokens (truncated / right-padded with '<PAD>') + the compound sentiment.
# The post's label becomes the target.
accepted_labels = [
    'Question',
    'Needing support - Panic attack',
    'Rant',
    'Potentially Triggering',
    'Does Anyone Else...?',
    'Needing support: Just not feeling good',
]

X, y, num_ents = [], [], []

for i in tqdm(range(len(all_posts))):
    post = all_posts[i]

    # Keep only posts with an accepted label and more than 8 entities.
    if post['labels'] not in accepted_labels or len(post['entities']) <= 8:
        continue

    # First element of every entity record (presumably the entity text - TODO confirm).
    tokens = [e[0] for e in post['entities']]
    num_ents.append(len(tokens))  # count before truncation/padding

    # Fixed-width row: cut to 128 tokens, then pad up to 128 when shorter.
    row = tokens[:128]
    row.extend(['<PAD>'] * (128 - len(tokens)))

    # Sentiment is looked up by position, so both lists must be in the same order.
    sentiment = emoji_posts[i]['sentiment']['compound']
    row.append(sentiment)

    X.append(row)
    y.append(post['labels'])

# NOTE: rows mix str tokens with a numeric sentiment, so NumPy stores the whole
# matrix with a string dtype (the sentiment ends up as text in the last column).
X = np.array(X)
y = np.array(y)
num_ents = np.array(num_ents)

label_counts = pd.Series(y).value_counts()

# Cell output: class balance plus mean/std/min/max of the entity count per kept post.
label_counts, num_ents.mean(), num_ents.std(), num_ents.min(), num_ents.max()

100%|██████████| 986/986 [00:00<00:00, 773729.42it/s]


(Needing support - Panic attack            113
 Rant                                       84
 Question                                   77
 Potentially Triggering                     57
 Needing support: Just not feeling good     50
 Does Anyone Else...?                       42
 Name: count, dtype: int64,
 16.52245862884161,
 7.122955373012269,
 9,
 56)

In [6]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions comparable in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Cell output: sizes of the four partitions.
len(X_train), len(X_test), len(y_train), len(y_test)

(338, 85, 338, 85)

In [7]:
# Separate the sentiment score (last column of every row) from the entity tokens.
#
# The feature matrix has a string dtype, so the last column holds the sentiment
# as text; cast it to float so the saved sentiment arrays are numeric.
# The cast also acts as a guard: if this cell is run a second time, the last
# column is a token or '<PAD>' and astype(float) raises ValueError rather than
# silently stripping another column.
X_train_sent = X_train[:, -1].astype(float)
X_test_sent = X_test[:, -1].astype(float)

# Drop the sentiment column. Rebuilding from a list lets NumPy re-infer the
# string width from the remaining tokens only.
X_train = np.array(X_train[:, :-1].tolist())
X_test = np.array(X_test[:, :-1].tolist())

# Cell output: row counts of the token matrices and their sentiment vectors.
len(X_train), len(X_test), len(X_train_sent), len(X_test_sent)

(338, 85, 338, 85)

In [8]:
# Persist every partition as its own .npy file under split_dir.
# np.save does not create missing directories, so make sure the target exists
# first (no-op when it is already there).
split_dir.mkdir(parents=True, exist_ok=True)

for file_name, array in (
    ('X_train.npy', X_train),
    ('X_train_sent.npy', X_train_sent),
    ('X_test.npy', X_test),
    ('X_test_sent.npy', X_test_sent),
    ('y_train.npy', y_train),
    ('y_test.npy', y_test),
):
    np.save(split_dir / file_name, array)