In [1]:
import numpy as np
import pandas as pd 

import re

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation / SettingWithCopy style warnings that
# later cells may raise — confirm the blanket suppression is intended.
warnings.filterwarnings('ignore')
# Show all columns when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [2]:
# The raw annotations ship as three CSV parts; read each one.
raw_paths = (
    '../data/goemotions_1.csv',
    '../data/goemotions_2.csv',
    '../data/goemotions_3.csv',
)
df1, df2, df3 = map(pd.read_csv, raw_paths)

# Stack the parts into a single frame with a fresh 0..n-1 row index.
df = pd.concat([df1, df2, df3], ignore_index=True)

In [3]:
# Comment id followed by the 28 label columns; this is the column subset that
# gets grouped by id when rater selections are summed.
useful_cols = [
    'id',
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]

In [4]:
# Preview the first rows of the combined frame to check the columns loaded as expected.
df.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


- text: The text of the comment (with masked tokens, as described in the paper).
- id: The unique id of the comment.
- author: The Reddit username of the comment's author.
- subreddit: The subreddit that the comment belongs to.
- link_id: The link id of the comment.
- parent_id: The parent id of the comment.
- created_utc: The timestamp of the comment.
- rater_id: The unique id of the annotator.
- example_very_unclear: Whether the annotator marked the example as being very unclear or difficult to label (in this case they did not choose any emotion labels).

In [5]:
# Count how many annotation rows are flagged as very unclear versus not.
df.example_very_unclear.value_counts()

example_very_unclear
False    207814
True       3411
Name: count, dtype: int64

In [6]:
# Headline sizes of the raw annotation table.
n_ids = df['id'].nunique()
n_raters = df['rater_id'].nunique()
n_texts = df['text'].nunique()
unclear_pct = np.round((df['example_very_unclear'].sum()*100 / len(df)), 2)

print('no of tweet ids:', n_ids)
print('no of unique rater ids:', n_raters)
print('no of unique tweets:', n_texts)
print('unclear/ difficult to label tweets (from df):', unclear_pct, '%')

# Sum the label columns over all annotation rows that share an id, then count
# the ids on which some label reaches a total of at least 2 (resp. 3).
aggregated = df[useful_cols].groupby('id').sum()
raters_2, raters_3 = (
    (aggregated >= threshold).any(axis=1).sum() for threshold in (2, 3)
)

print("no of tweets where at least 2+ raters agree upon atleast 1 label:", raters_2)
print("no of tweets where at least 3+ raters agree upon atleast 1 label:", raters_3)

# Share (in percent) of ids by the number of distinct raters that annotated them.
raters_per_id = df.groupby('id')['rater_id'].nunique()
prop = raters_per_id.value_counts(normalize=True) * 100
print("\n no of raters per tweet (id): \n", prop)

no of tweet ids: 58011
no of unique rater ids: 82
no of unique tweets: 57732
unclear/ difficult to label tweets (from df): 1.61 %
no of tweets where at least 2+ raters agree upon atleast 1 label: 54263
no of tweets where at least 3+ raters agree upon atleast 1 label: 17763

 no of raters per tweet (id): 
 rater_id
3    64.358484
5    30.873455
4     3.626898
2     1.020496
1     0.120667
Name: proportion, dtype: float64


In [7]:
# The 28 label columns, in the order they appear in the raw frame.
emotion_cols = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]

# Column layout for the cleaned frame: identifier and text first, labels after.
reset_cols = ['id', 'text', *emotion_cols]

In [8]:
# Punctuation marks that get normalised to "<mark><space>". Backslashes are written
# as '\\' so the literals contain no invalid escape sequences (the previous '\,'
# and '\<' spellings raise SyntaxWarning on recent Python versions).
_SPACED_PUNCTUATION = '!-{}\\,<>./?@#$%^&*_~`|()'
# Subset of marks that is deleted from the text altogether.
_DROPPED_PUNCTUATION = '{}\\<>/@#$%^&*_~`|()'

# One listed mark, optional whitespace, then any immediate repeats of that same mark.
_PUNCT_RUN = re.compile('([' + re.escape(_SPACED_PUNCTUATION) + r'])\s*\1*')
_WHITESPACE_RUN = re.compile(r'\s+')
_DROP_TABLE = str.maketrans('', '', _DROPPED_PUNCTUATION)


def first_preprocessing(text):
    """Normalise punctuation and whitespace in a single comment.

    Steps, in order:
      1. Every mark in ``_SPACED_PUNCTUATION`` (together with any whitespace and
         immediate repeats of the same mark that follow it) is replaced by the
         mark plus one space, e.g. ``'Hello!!!'`` -> ``'Hello! '``.
         Note this also separates marks inside tokens (``'3.5'`` -> ``'3. 5'``).
      2. Every mark in ``_DROPPED_PUNCTUATION`` is removed.
      3. Runs of whitespace collapse to one space and the ends are stripped.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing but whitespace and
        dropped punctuation was present.

    Raises
    ------
    TypeError
        If ``text`` is not a string (raised by the regex substitution).
    """
    spaced = _PUNCT_RUN.sub(r'\1 ', text)
    kept = spaced.translate(_DROP_TABLE)
    # strip() already removes any trailing space, so no extra end-of-string check is needed.
    return _WHITESPACE_RUN.sub(' ', kept).strip()

In [9]:
# Build the cleaned, text-deduplicated frame `final_df`.

# One (id, text) row per comment id, taken from annotation rows that were not
# flagged as very unclear (first occurrence of each id wins).
comments = (
    df.loc[df['example_very_unclear'] == False, ['id', 'text']]
    .drop_duplicates('id')
)

# Per id, a label counts as agreed when its column sums to at least 2 over the
# annotation rows of that id.
agreed = df[useful_cols].groupby('id').sum() >= 2
new_df = pd.merge(agreed, comments, on='id')
# Keep only ids that have at least one agreed label.
new_df = new_df[new_df[emotion_cols].any(axis=1)]

# .copy() so the column assignments below write to an independent frame
# rather than to a slice of the merged one.
new_df = new_df[reset_cols].copy()
new_df['text'] = new_df['text'].apply(first_preprocessing)
new_df[emotion_cols] = new_df[emotion_cols].astype(int)

# Different ids can end up with the same cleaned text. Rows whose text is
# unique are kept as they are; rows sharing a text are merged into one row
# where a label is set when at least 2 of the sharing rows carry it.
# (Separate names are used here so the raw df1/df2 frames loaded earlier are
# not overwritten.)
shares_text = new_df.duplicated('text', keep=False)
unique_text_df = new_df[~shares_text].drop(columns=['id'])
shared_text_df = (
    new_df[shares_text]
    .groupby('text')[emotion_cols]
    .sum()
    .ge(2)
    # Cast back to 0/1: concatenating boolean columns with the integer columns
    # of unique_text_df would yield object columns, and the saved CSV would mix
    # 1/0 with True/False.
    .astype(int)
    .reset_index()
)
# NOTE(review): a merged row can end up with no label at all when the sharing
# rows disagree; such rows are still kept here, as before — confirm whether
# they should be dropped.

final_df = pd.concat([unique_text_df, shared_text_df], ignore_index=True)
print('No of examples after preprocessing:', len(final_df))

No of examples after preprocessing: 53951


In [10]:
# Number of whitespace-separated words in every cleaned text, in row order.
word_counts = [len(text.split()) for text in final_df['text']]

# Row positions of the very short (< 3 words) and very long (> 30 words) texts.
l_index = [pos for pos, n_words in enumerate(word_counts) if n_words < 3]
u_index = [pos for pos, n_words in enumerate(word_counts) if n_words > 30]

print('No of texts with less than 3 words:', len(l_index))
print('No of texts with more than 30 words:', len(u_index))

No of texts with less than 3 words: 1117
No of texts with more than 30 words: 9


In [11]:
# Row count of the final cleaned frame.
len(final_df)

53951

In [12]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
final_df.to_csv('../data/cleaned_data.csv', index=False)