In [1]:
import pandas as pd
from pathlib import Path
import json
import numpy as np
from tqdm import tqdm

from scipy.stats import ttest_rel

In [2]:
# Directory that holds every sentiment artefact this notebook reads or writes.
sentiments_dir = Path('sentiments')

# Per-emoji summary tables (presumably computed with emojis kept in the text
# vs. stripped from it, judging by the file names -- confirm with the producer).
emoji_stats_path = sentiments_dir / 'emetophobia_emojis_statistics.xlsx'
no_emoji_stats_path = sentiments_dir / 'emetophobia_noemojis_statistics.xlsx'

df_emoji = pd.read_excel(emoji_stats_path)
df_no_emoji = pd.read_excel(no_emoji_stats_path)

In [3]:
# Preview the first five rows of the per-emoji summary table.
df_emoji.head(n=5)

Unnamed: 0,emoji,min,max,mean,std,nums
0,😭,-0.9966,0.9859,-0.672362,0.577602,91
1,😣,-0.981,-0.624,-0.7576,0.140943,6
2,😔,-0.9178,0.036,-0.619875,0.344388,8
3,😳,-0.8317,-0.8317,-0.8317,0.0,2
4,🥹,-0.9917,-0.9433,-0.975567,0.022816,3


In [4]:
# Load the per-emoji sentiment mappings (emoji -> list of scores, as used by the
# paired t-tests further down).
# The encoding is pinned to UTF-8 because JSON is UTF-8 by specification and the
# keys are emoji; without it, open() falls back to the platform locale codec,
# which can fail or mis-decode on non-UTF-8 systems.
with open(sentiments_dir / 'emetophobia_posts_emoji_sentiment.json', 'r', encoding='utf-8') as f:
    emoji_dict = json.load(f)
with open(sentiments_dir / 'emetophobia_posts_noemoji_sentiment.json', 'r', encoding='utf-8') as f:
    noemoji_dict = json.load(f)

In [5]:
# Join the two per-emoji summary tables on the emoji itself; columns present in
# both tables get a suffix saying which table they came from.
df_diff = df_emoji.merge(
    df_no_emoji, on='emoji', how='inner', suffixes=('_with_emoji', '_without_emoji')
)

df_diff['mean_difference'] = df_diff['mean_with_emoji'] - df_diff['mean_without_emoji']


def _paired_ttest_or_nan(with_scores, without_scores):
    """Run a paired t-test, or return (nan, nan) when the samples cannot be paired.

    Parameters
    ----------
    with_scores, without_scores : sequence of float or None
        The two samples to compare. They are assumed to be aligned
        observation-by-observation (TODO confirm against the code that wrote
        the JSON files).

    Returns
    -------
    tuple
        (t_stat, p_value) from scipy.stats.ttest_rel, or (nan, nan) if either
        sample is missing, the lengths differ, or there are fewer than two pairs.
    """
    if with_scores is None or without_scores is None:
        return np.nan, np.nan
    # ttest_rel raises ValueError on unequal lengths, and fewer than two pairs
    # only yields NaN plus a warning, so short-circuit both cases.
    if len(with_scores) != len(without_scores) or len(with_scores) < 2:
        return np.nan, np.nan
    t_stat, p_value = ttest_rel(with_scores, without_scores)
    return t_stat, p_value


# Collect the results in row order and assign each column once, rather than
# mixing positional (.iloc) reads with label-based (.at) writes.
t_stats = []
p_values = []
for emoji in tqdm(df_diff['emoji']):
    t_stat, p_value = _paired_ttest_or_nan(emoji_dict.get(emoji), noemoji_dict.get(emoji))
    t_stats.append(t_stat)
    p_values.append(p_value)

df_diff['t_stat'] = t_stats
df_diff['p_value'] = p_values

# Most frequently used emoji first.
df_diff = df_diff.sort_values(by='nums_with_emoji', ascending=False)


  0%|          | 0/82 [00:00<?, ?it/s]

  res = hypotest_fun_out(*samples, **kwds)
100%|██████████| 82/82 [00:00<00:00, 622.95it/s]


In [6]:
# Preview the five emoji with the highest 'nums_with_emoji' after the sort above.
df_diff.head(n=5)

Unnamed: 0,emoji,min_with_emoji,max_with_emoji,mean_with_emoji,std_with_emoji,nums_with_emoji,min_without_emoji,max_without_emoji,mean_without_emoji,std_without_emoji,nums_without_emoji,mean_difference,t_stat,p_value
0,😭,-0.9966,0.9859,-0.672362,0.577602,91,-0.9954,0.9876,-0.444981,0.664632,91,-0.22738,-6.269718,1.228922e-08
9,🫶,-0.9885,0.9413,0.672383,0.536214,12,-0.9851,0.9505,0.599492,0.614665,12,0.072892,1.206261,0.2530206
25,❤,-0.9965,0.8932,-0.21249,0.780557,10,-0.9956,0.8932,-0.21226,0.780327,10,-0.00023,-1.943084,0.0838931
14,🥲,-0.9838,-0.7619,-0.883778,0.070573,9,-0.9867,-0.7619,-0.8841,0.071034,9,0.000322,1.0,0.3465935
36,💔,-0.9195,0.0377,-0.6074,0.395321,8,-0.9456,-0.4659,-0.72555,0.188329,8,0.11815,1.238247,0.2555378


In [7]:
# Persist the per-emoji comparison table; the row index is not written.
diff_stats_path = sentiments_dir / 'emetophobia_diff_statistics.xlsx'
df_diff.to_excel(diff_stats_path, index=False)

In [8]:
# Pool the per-emoji scores into two flat samples for one overall paired t-test.
# A paired test compares element i of one sample with element i of the other, so
# both lists are built from the SAME key iteration. Emoji that are absent from
# the second mapping, or whose two score lists differ in length, are left out
# because including them would shift every later pair out of alignment.
all_sents_emoji = []
all_sents_noemoji = []
for emoji, with_scores in emoji_dict.items():
    without_scores = noemoji_dict.get(emoji)
    if without_scores is None or len(without_scores) != len(with_scores):
        continue
    all_sents_emoji.extend(with_scores)
    all_sents_noemoji.extend(without_scores)

t_stat, p_value = ttest_rel(all_sents_emoji, all_sents_noemoji)

t_stat, p_value

(-4.255487935548932, 2.660058870348046e-05)

In [9]:
# Load the post-level records; each one is a dict carrying, among other fields,
# 'labels' and a 'sentiment' dict with a 'compound' score (see the preview below).
# The encoding is pinned to UTF-8 because JSON is UTF-8 by specification and the
# post text may contain emoji; the platform default codec is not reliable here.
with open(sentiments_dir / 'emetophobia_posts_sent_emoji.json', 'r', encoding='utf-8') as f:
    posts_emoji = json.load(f)

with open(sentiments_dir / 'emetophobia_posts_sent_noemoji.json', 'r', encoding='utf-8') as f:
    posts_noemoji = json.load(f)

# Show the first record of each list side by side.
posts_emoji[0], posts_noemoji[0]

({'title': 'Life-changing website for movie triggers!',
  'date': 1749890934.0,
  'ups': 2,
  'upvote_ratio': 1.0,
  'downvotes': 0,
  'labels': 'Techniques, tips and tricks',
  'sentiment': {'neg': 0.07, 'neu': 0.783, 'pos': 0.147, 'compound': 0.8563},
  'emojis': [],
  'emoji_count': 0},
 {'title': 'Life-changing website for movie triggers!',
  'date': 1749890934.0,
  'ups': 2,
  'upvote_ratio': 1.0,
  'downvotes': 0,
  'labels': 'Techniques, tips and tricks',
  'sentiment': {'neg': 0.07, 'neu': 0.783, 'pos': 0.147, 'compound': 0.8563},
  'emojis': [],
  'emoji_count': 0})

In [10]:
# Gather the compound sentiment of every post twice over: once in a flat list
# and once bucketed by the post's 'labels' value. This is done separately for
# the two post collections.
all_sents_emoji, lbl_sents_emoji = [], {}
all_sents_noemoji, lbl_sents_noemoji = [], {}

for post in tqdm(posts_emoji):
    score = post['sentiment']['compound']
    all_sents_emoji.append(score)
    # setdefault creates the bucket the first time a label is seen.
    lbl_sents_emoji.setdefault(post['labels'], []).append(score)

for post in tqdm(posts_noemoji):
    score = post['sentiment']['compound']
    all_sents_noemoji.append(score)
    lbl_sents_noemoji.setdefault(post['labels'], []).append(score)

100%|██████████| 986/986 [00:00<00:00, 1758326.42it/s]
100%|██████████| 986/986 [00:00<00:00, 1717435.11it/s]


In [11]:
# Summary statistics of the compound scores over all posts, for both collections.
# np.std is used with its default settings (population standard deviation).
scores_with = np.asarray(all_sents_emoji)
scores_without = np.asarray(all_sents_noemoji)

all_sents = {
    'max_with_emoji': scores_with.max(),
    'min_with_emoji': scores_with.min(),
    'mean_with_emoji': scores_with.mean(),
    'std_with_emoji': scores_with.std(),
    'max_without_emoji': scores_without.max(),
    'min_without_emoji': scores_without.min(),
    'mean_without_emoji': scores_without.mean(),
    'std_without_emoji': scores_without.std(),
}

# Paired t-test across posts; element i of each list is treated as one pair.
t_stat_all, p_value_all = ttest_rel(all_sents_emoji, all_sents_noemoji)
all_sents.update(t_value=t_stat_all, p_value=p_value_all)

# One row per metric, with a single 'value' column.
df_all_sents = pd.DataFrame.from_dict(all_sents, orient='index', columns=['value'])

t_stat_all, p_value_all

(-3.1540444865214092, 0.0016590158848331148)

In [12]:
# Persist the overall statistics; the index holds the metric names, so it is
# written out under the header 'metric'.
all_sent_stats_path = sentiments_dir / 'emetophobia_all_sent_statistics.xlsx'
df_all_sents.to_excel(all_sent_stats_path, index_label='metric', index=True)

In [13]:
# Per-label summary: mean/std of the compound score in both collections plus a
# paired t-test between them.
lbl_sents = []
for lbl, with_scores in lbl_sents_emoji.items():
    # A label with a single post cannot be t-tested, so it is skipped entirely.
    if len(with_scores) <= 1:
        continue

    # A label may be absent from the second collection; treat it as an empty
    # sample instead of indexing the dict directly, which would raise KeyError.
    without_scores = lbl_sents_noemoji.get(lbl, [])

    # ttest_rel needs equally long samples; anything else cannot be paired and
    # is reported as NaN rather than aborting the whole table.
    if len(without_scores) == len(with_scores):
        t_stat, p_value = ttest_rel(with_scores, without_scores)
    else:
        t_stat, p_value = np.nan, np.nan

    lbl_sents.append({
        'label': lbl,
        'mean_with_emoji': np.mean(with_scores),
        'std_with_emoji': np.std(with_scores),
        'mean_without_emoji': np.mean(without_scores) if without_scores else np.nan,
        'std_without_emoji': np.std(without_scores) if without_scores else np.nan,
        't_stat': t_stat,
        'p_value': p_value
    })

df_lbl_sents = pd.DataFrame.from_records(lbl_sents)

df_lbl_sents.to_excel(sentiments_dir / 'emetophobia_label_sent_statistics.xlsx', index=False)