In [1]:
from tqdm import tqdm
import json
import pandas as pd
from pathlib import Path
import numpy as np
import emoji
import re

In [2]:
posts_folder = Path('dataset') / 'emetophobia_posts'
with open(posts_folder / 'emetophobia_all_posts_one_label_normalized.json', 'r') as f:
    all_posts = json.load(f)

len(all_posts)

986

In [3]:
def minimal_normalziation(text):
    text = text.replace('\n', ' ')
    text = text.replace('\r', ' ')
    return text.strip()

In [4]:
for post in tqdm(all_posts):
    post['content'] = minimal_normalziation(post['content'])

100%|██████████| 986/986 [00:00<00:00, 986071.47it/s]


In [5]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [6]:
analyzer = SentimentIntensityAnalyzer()

In [7]:
for post in tqdm(all_posts):
    post['sentiment'] = analyzer.polarity_scores(post['content'])

  0%|          | 0/986 [00:00<?, ?it/s]

100%|██████████| 986/986 [00:00<00:00, 1436.51it/s]


In [8]:
def extract_emojis(text):
    return [c for c in text if c in emoji.EMOJI_DATA]

In [9]:
for post in tqdm(all_posts):
    post['emojis'] = extract_emojis(post['content'])
    post['emoji_count'] = emoji.emoji_count(post['content'])

100%|██████████| 986/986 [00:00<00:00, 2106.04it/s]


In [10]:
emotions_dir = Path('sentiments')
with open(emotions_dir / 'emetophobia_posts_sent_emoji.json', 'w') as f:
    json.dump(all_posts, f, indent=4)

In [11]:
emoji_dict = {}

for post in tqdm(all_posts):
    for emo in post['emojis']:
        if emo not in emoji_dict:
            emoji_dict[emo] = [post['sentiment']['compound']]
        emoji_dict[emo].append(post['sentiment']['compound'])

len(emoji_dict)

100%|██████████| 986/986 [00:00<00:00, 2879932.97it/s]


82

In [12]:
with open(emotions_dir / 'emetophobia_posts_emoji_sentiment.json', 'w') as f:
    json.dump(emoji_dict, f, indent=4)

In [13]:
for k, v in emoji_dict.items():
    emoji_dict[k] = {
        "min": np.min(v),
        "max": np.max(v),
        "mean": np.mean(v),
        "std": np.std(v),
        'nums': len(v)
    }

In [14]:
emoji_dict

{'😭': {'min': -0.9966,
  'max': 0.9859,
  'mean': -0.6723615384615383,
  'std': 0.5776024064607778,
  'nums': 91},
 '😣': {'min': -0.981,
  'max': -0.624,
  'mean': -0.7576,
  'std': 0.14094314456545945,
  'nums': 6},
 '😔': {'min': -0.9178,
  'max': 0.036,
  'mean': -0.619875,
  'std': 0.3443881521989396,
  'nums': 8},
 '😳': {'min': -0.8317, 'max': -0.8317, 'mean': -0.8317, 'std': 0.0, 'nums': 2},
 '🥹': {'min': -0.9917,
  'max': -0.9433,
  'mean': -0.9755666666666668,
  'std': 0.02281597880628593,
  'nums': 3},
 '♥': {'min': -0.9917, 'max': -0.9917, 'mean': -0.9917, 'std': 0.0, 'nums': 2},
 '😅': {'min': 0.8903,
  'max': 0.9908,
  'mean': 0.935875,
  'std': 0.046052056143021464,
  'nums': 4},
 '\U0001fae9': {'min': -0.8558,
  'max': -0.7171,
  'mean': -0.7825249999999999,
  'std': 0.06566004778402161,
  'nums': 4},
 '😞': {'min': -0.957,
  'max': -0.228,
  'mean': -0.7666600000000001,
  'std': 0.2818976665387637,
  'nums': 5},
 '🫶': {'min': -0.9885,
  'max': 0.9413,
  'mean': 0.6723833333

In [15]:
df_emoji = pd.DataFrame.from_dict(emoji_dict, orient='index')
df_emoji.head()

Unnamed: 0,min,max,mean,std,nums
😭,-0.9966,0.9859,-0.672362,0.577602,91
😣,-0.981,-0.624,-0.7576,0.140943,6
😔,-0.9178,0.036,-0.619875,0.344388,8
😳,-0.8317,-0.8317,-0.8317,0.0,2
🥹,-0.9917,-0.9433,-0.975567,0.022816,3


In [16]:
df_emoji.to_excel(emotions_dir / 'emetophobia_emojis_statistics.xlsx', index=True, index_label='emoji')

In [17]:
from collections import Counter

In [18]:
all_posts[0]

{'title': 'Life-changing website for movie triggers!',
 'date': 1749890934.0,
 'ups': 2,
 'upvote_ratio': 1.0,
 'downvotes': 0,
 'labels': 'Techniques, tips and tricks',
 'sentiment': {'neg': 0.07, 'neu': 0.783, 'pos': 0.147, 'compound': 0.8563},
 'emojis': [],
 'emoji_count': 0}

In [19]:
lbl_counter = Counter()

for post in tqdm(all_posts):
    lbl_counter[post['labels']] += post['emoji_count']

lbl_counter.most_common(20)

100%|██████████| 986/986 [00:00<00:00, 2424140.53it/s]


[('Rant', 54),
 ('Needing support - Panic attack', 31),
 ('Needing support: Just not feeling good', 28),
 ('Question', 26),
 ('Success!', 20),
 ('Does Anyone Else...?', 17),
 ('Potentially Triggering', 17),
 ('Moderator', 11),
 ('None', 10),
 ('Needing Support - N, V, D etc', 10),
 ('Positive Reminder', 8),
 ('Techniques, tips and tricks', 6),
 ('Venting - Advice wanted', 6),
 ('Needing Support - In Acute Crisis (at risk of self injury)', 5),
 ('Venting - No advice please', 5),
 ('Needing Support - Anxious about FP', 4),
 ('It Happened (TW)', 3),
 ('Needing Support - N, V, D etc NO REASSURANCE', 3),
 ('Recovery', 2),
 ('Therapy info!', 0)]