In [1]:
from tqdm import tqdm
import json
import pandas as pd
from pathlib import Path
import numpy as np
import emoji
import re

In [2]:
posts_folder = Path('dataset') / 'emetophobia_posts'
with open(posts_folder / 'emetophobia_all_posts_one_label_normalized.json', 'r') as f:
    all_posts = json.load(f)

for post in tqdm(all_posts):
    post['content_emoji'] = post['content']
len(all_posts)

100%|██████████| 986/986 [00:00<00:00, 2863977.66it/s]


986

In [3]:
def minimal_normalziation_noemoji(text):
    text = text.replace('\n', ' ')
    text = text.replace('\r', ' ')
    text = emoji.replace_emoji(text, replace='')
    return text.strip()

In [4]:
for post in tqdm(all_posts):
    post['content'] = minimal_normalziation_noemoji(post['content'])

100%|██████████| 986/986 [00:00<00:00, 3116.18it/s]


In [5]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [6]:
analyzer = SentimentIntensityAnalyzer()

In [7]:
for post in tqdm(all_posts):
    post['sentiment'] = analyzer.polarity_scores(post['content'])

100%|██████████| 986/986 [00:00<00:00, 1600.57it/s]


In [8]:
def extract_emojis(text):
    return [c for c in text if c in emoji.EMOJI_DATA]

In [9]:
for post in tqdm(all_posts):
    post['emojis'] = extract_emojis(post['content_emoji'])
    post['emoji_count'] = emoji.emoji_count(post['content_emoji'])

100%|██████████| 986/986 [00:00<00:00, 3304.74it/s]


In [10]:
emotions_dir = Path('sentiments')
with open(emotions_dir / 'emetophobia_posts_sent_noemoji.json', 'w') as f:
    json.dump(all_posts, f, indent=4)

In [11]:
emoji_dict = {}

for post in tqdm(all_posts):
    for emo in post['emojis']:
        if emo not in emoji_dict:
            emoji_dict[emo] = [post['sentiment']['compound']]
        emoji_dict[emo].append(post['sentiment']['compound'])

len(emoji_dict)

100%|██████████| 986/986 [00:00<00:00, 2645926.90it/s]


82

In [13]:
with open(emotions_dir / 'emetophobia_posts_noemoji_sentiment.json', 'w') as f:
    json.dump(emoji_dict, f, indent=4)

In [12]:
for k, v in emoji_dict.items():
    emoji_dict[k] = {
        "min": np.min(v),
        "max": np.max(v),
        "mean": np.mean(v),
        "std": np.std(v),
        'nums': len(v)
    }

In [13]:
emoji_dict

{'😭': {'min': -0.9954,
  'max': 0.9876,
  'mean': -0.44498131868131874,
  'std': 0.6646320085535043,
  'nums': 91},
 '😣': {'min': -0.9616,
  'max': -0.624,
  'mean': -0.7598000000000001,
  'std': 0.13975206617435035,
  'nums': 6},
 '😔': {'min': -0.9246,
  'max': -0.0799,
  'mean': -0.5935999999999999,
  'std': 0.3227668508381863,
  'nums': 8},
 '😳': {'min': -0.8317, 'max': -0.8317, 'mean': -0.8317, 'std': 0.0, 'nums': 2},
 '🥹': {'min': -0.9926,
  'max': -0.9433,
  'mean': -0.9761666666666667,
  'std': 0.023240242874997867,
  'nums': 3},
 '♥': {'min': -0.9926, 'max': -0.9926, 'mean': -0.9926, 'std': 0.0, 'nums': 2},
 '😅': {'min': 0.8087,
  'max': 0.9892,
  'mean': 0.891575,
  'std': 0.08352871886363396,
  'nums': 4},
 '\U0001fae9': {'min': -0.8558,
  'max': -0.7171,
  'mean': -0.7825249999999999,
  'std': 0.06566004778402161,
  'nums': 4},
 '😞': {'min': -0.9278,
  'max': 0.2944,
  'mean': -0.58858,
  'std': 0.4718585059104053,
  'nums': 5},
 '🫶': {'min': -0.9851,
  'max': 0.9505,
  'mea

In [14]:
df_emoji = pd.DataFrame.from_dict(emoji_dict, orient='index')
df_emoji.head()

Unnamed: 0,min,max,mean,std,nums
😭,-0.9954,0.9876,-0.444981,0.664632,91
😣,-0.9616,-0.624,-0.7598,0.139752,6
😔,-0.9246,-0.0799,-0.5936,0.322767,8
😳,-0.8317,-0.8317,-0.8317,0.0,2
🥹,-0.9926,-0.9433,-0.976167,0.02324,3


In [15]:
df_emoji.to_excel(emotions_dir / 'emetophobia_noemojis_statistics.xlsx', index=True, index_label='emoji')