In [1]:
import json
from tqdm import tqdm
def dump_jsonl(output_path, data, append=False, progress=False):
    """
    Write list of objects to a JSON lines file.
    """
    mode = 'a+' if append else 'w'
    with open(output_path, mode, encoding='utf-8') as f:
        if progress:
            data = tqdm(data)
            
        for line in data:
            json_record = json.dumps(line, ensure_ascii=False)
            f.write(json_record + '\n')
    print('Wrote {} records to {}'.format(len(data), output_path))

def load_jsonl(input_path, verbose=True, progress=False) -> list:
    """
    Read list of objects from a JSON lines file.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        if progress:
            f = tqdm(f)
            
        for line in f:
                data.append(json.loads(line.rstrip('\n|\r')))
    
    if verbose:
        print('Loaded {} records from {}'.format(len(data), input_path))
        
    return data

In [2]:
emojis_list = {
    "thai-emojification": "❤️😄😞🍴⚾", # https://github.com/kobkrit/thai-emojification
    "top10-2023": "😂😍😘👌😭😒😊😩😁😏", # top 10 2023  https://blog.emojipedia.org/10-years-of-emojipedia-10-years-of-record-breaking-emoji-popularity/
    "top10-2013": "😂🤣❤️🙏😭😍✨🔥😊🥰", # top 10 2013  https://blog.emojipedia.org/10-years-of-emojipedia-10-years-of-record-breaking-emoji-popularity/
    "semeval2018-task2": "❤😍😂💕😊😘💪😉👌🇪🇸😎💙💜😜💞✨🎶💘", # SemEval-2018 Task 2, Multilingual Emoji Prediction
    "emotion": "😤😡😰😱☺😆😢😭", # grief, fear, rage, and love: https://psycnet.apa.org/record/2005-04422-003; James, W. (1922). The emotions. In C. G. Lange & W. James (Eds.), The emotions, Vol. 1, pp. 93–135). Williams & Wilkins Co. https://doi.org/10.1037/10735-003
}

In [3]:
# https://aclanthology.org/2020.emnlp-main.720/
# EmoTag1200: Understanding the Association between Emojis and Emotions
import pandas as pd
scores = pd.read_csv("./_private/EmoTag1200-scores.csv")

emojis_list["EmoTag1200"] = ""
emo = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']
for e in emo:
    rows = scores.sort_values(e, ascending = False).head(3).to_dict('records')
    print(e, [r["emoji"] for r in rows])
    emojis_list["EmoTag1200"] += "".join([r["emoji"] for r in rows])

anger ['😠', '😡', '😤']
anticipation ['👀', '💭', '💰']
disgust ['👎', '😖', '😣']
fear ['😨', '😱', '😰']
joy ['☺', '😆', '😹']
sadness ['😢', '😭', '💔']
surprise ['‼', '❗', '😱']
trust ['😙', '💕', '🌹']


In [4]:
# Generate twitter search query urls from 2020-2024

import emoji
import random

urls = []
emojis_unique_list = {}
emojis_str = ""
for corpus_name in emojis_list:
    emojis_str += emojis_list[corpus_name]

for e in emoji.emoji_list(emojis_str):
    if e['emoji'] in emojis_unique_list:
        continue
    
    emojis_unique_list[e['emoji']] = e
    for year in [2020, 2021, 2022, 2023, 2024]:
        for m in range(1, 13):
            mm = str(m)
            if m < 10:
                mm = f"0{m}"
            url = f"https://twitter.com/search?q=({e['emoji']})%20lang%3Ath%20until%3A{year}-{mm}-28%20since%3A{year}-{mm}-01%20-filter%3Areplies"
            urls.append(url)
            
# random.shuffle(urls)
# import pandas as pd
# pd.DataFrame({"url": urls}).to_csv("urls.csv", index=False)

## Load Tweet Data

In [5]:
tweets = []

In [6]:
tweetsv1 = load_jsonl("_private/tweets.jsonl")
for tw in tweetsv1:
    if "id" not in tw:
        continue
        
    tweets.append({
        "id": tw["id"],
        "text": tw["text"],
        "created_at": tw["created_at"],
        "author_id": tw["author_id"],
        "source": "v1",
    })

Loaded 185750 records from _private/tweets.jsonl


In [7]:
tweetsv2 = load_jsonl("_private/tweets_with_keywords.jsonl")

from datetime import datetime
for tw in tweetsv2:
    d = datetime.strptime(tw["timestamp"], '%b %d, %Y · %H:%M %p %Z')
    tweets.append({
        "id": tw["id"],
        "text": tw["text"],
        "created_at": d.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
        "author": tw["username"],
        "source": "v2",
    })

Loaded 18481 records from _private/tweets_with_keywords.jsonl


In [8]:
import pandas as pd

from os import listdir
from os.path import isfile, join

tweetsv3 = []
for f in listdir("_private/Data"):
    if isfile(join("_private/Data", f)):
        d = pd.read_csv(join("_private/Data", f))
        tweetsv3 += d.to_dict('records')
        

for tw in tweetsv3:
    try:
        d = datetime.strptime(tw["published_at"], '%Y-%m-%d %H:%M:%S+00')

        tweets.append({
            "id": tw["internal_unique_id"],
            "text": tw["content"],
            "created_at": d.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
            "author_id": tw["user_id"],
            "source": "v3",
        })
    except:
        continue

In [34]:
from pythainlp.util import countthai
from ekphrasis.classes.preprocessor import TextPreProcessor
from tqdm import tqdm
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'money', 'phone', 'user', 'time', 'url', 'date', 'number'],
    # NB: we do not annotate terms
    annotate={"hashtag"},
    fix_html=True, 
    segmenter="twitter",
    corrector="twitter",
    tokenizer=None,
)

filteredTweets = {}
for tw in tqdm(tweets):
    if countthai(tw["text"]) < 50:
        tw["removed"] = True
        continue
        
    tw["preprocessed"] = text_processor.pre_process_doc(tw["text"])
    
    if tw["id"] in filteredTweets:
        tw["removed"] = True
        continue
    
    if tw["preprocessed"] in filteredTweets:
        tw["removed"] = True
        continue
    
    tw["removed"] = False
    filteredTweets[tw["id"]] = True
    filteredTweets[tw["preprocessed"]] = True

In [10]:
text_processor.pre_process_doc("แมวววววว")

'แมวววววว'

In [28]:
df = pd.DataFrame(tweets)

In [42]:
import emoji

# emoji in emojis_str is not unique
tweetsByEmojiDict = {}
for e in emoji.emoji_list(emojis_str):
    tweetsByEmojiDict[e["emoji"]] = {}

dd = df[~df["removed"]]
for idx, row in tqdm(dd.iterrows(), total=len(dd)):
    for e in emoji.emoji_list(emojis_str):
        if e["emoji"] in row["text"]:
            tweetsByEmojiDict[e["emoji"]][row["id"]] = row.to_dict()

100%|███████████████████████████████████████████████| 205897/205897 [01:23<00:00, 2469.80it/s]


In [48]:
len(df), len(dd), len(dd)*100/len(df)

(321818, 205897, 63.97932993182482)

In [49]:
tweetsByEmoji = {}
for e in emoji.emoji_list(emojis_str):
    tweetsByEmoji[e["emoji"]] = list(tweetsByEmojiDict[e["emoji"]].values())

#### Top numbers

In [50]:
tweetsByEmoji = {k: v for k, v in sorted(tweetsByEmoji.items(), key=lambda item: -len(item[1]))}

cc = 0
for e in tweetsByEmoji:
    print(e, len(tweetsByEmoji[e]), emoji.emoji_list(e))
    cc+= 1
    if cc >= 10:
        break

😂 6983 [{'match_start': 0, 'match_end': 1, 'emoji': '😂'}]
😭 6307 [{'match_start': 0, 'match_end': 1, 'emoji': '😭'}]
❤ 6238 [{'match_start': 0, 'match_end': 1, 'emoji': '❤'}]
❤️ 4587 [{'match_start': 0, 'match_end': 2, 'emoji': '❤️'}]
😰 4477 [{'match_start': 0, 'match_end': 1, 'emoji': '😰'}]
😤 4041 [{'match_start': 0, 'match_end': 1, 'emoji': '😤'}]
🥰 3998 [{'match_start': 0, 'match_end': 1, 'emoji': '🥰'}]
🤣 3813 [{'match_start': 0, 'match_end': 1, 'emoji': '🤣'}]
🙏 3793 [{'match_start': 0, 'match_end': 1, 'emoji': '🙏'}]
✨ 3634 [{'match_start': 0, 'match_end': 1, 'emoji': '✨'}]


## Export them into a file

In [53]:
from collections import Counter

for corpus_name in emojis_list:
    print(corpus_name)
    
    corpus = []
    labels = {}
    for e in emoji.emoji_list(emojis_list[corpus_name]):
        for row in tweetsByEmoji[e["emoji"]]:
            if row["id"] in labels:
                labels[row["id"]].add(e["emoji"])
            else:
                labels[row["id"]] = set([e["emoji"]])
    
    count = Counter()
    for e in emoji.emoji_list(emojis_list[corpus_name]):
        for row in tweetsByEmoji[e["emoji"]]:
            corpus.append({**row, "label": list(labels[row["id"]])})
            
            count[len(labels[row["id"]])] += 1
    
    
    print("Total records:", len(corpus))
    print("Count #labels:", count)
    
    dump_jsonl(f"annotated/tweets_{corpus_name}.jsonl", corpus)
    print()

thai-emojification
Total records: 9132
Count #labels: Counter({1: 8978, 2: 154})
Wrote 9132 records to annotated/tweets_thai-emojification.jsonl

top10-2023
Total records: 26620
Count #labels: Counter({1: 23352, 2: 2782, 3: 390, 4: 60, 5: 30, 6: 6})
Wrote 26620 records to annotated/tweets_top10-2023.jsonl

top10-2013
Total records: 40775
Count #labels: Counter({1: 29547, 2: 8838, 3: 1971, 4: 332, 5: 50, 6: 30, 7: 7})
Wrote 40775 records to annotated/tweets_top10-2013.jsonl

semeval2018-task2
Total records: 39468
Count #labels: Counter({1: 26808, 2: 9074, 3: 2583, 4: 640, 5: 175, 6: 144, 7: 28, 8: 16})
Wrote 39468 records to annotated/tweets_semeval2018-task2.jsonl

emotion
Total records: 29083
Count #labels: Counter({1: 26109, 2: 2596, 3: 291, 4: 72, 5: 15})
Wrote 29083 records to annotated/tweets_emotion.jsonl

EmoTag1200
Total records: 48874
Count #labels: Counter({1: 38914, 2: 8420, 3: 1179, 4: 222, 5: 90, 6: 26, 7: 23})
Wrote 48874 records to annotated/tweets_EmoTag1200.jsonl



In [70]:
# corpus[0]

## Emotion Corpus

In [54]:
emotions = {"grief": "😢😭", "fear": "😰😱", "rage": "😤😡", "love": "☺😆"}
feelings = {}
for emo in emotions:
    n = 0
    for e in emoji.emoji_list(emotions[emo]):
        n += len(tweetsByEmoji[e["emoji"]])
        feelings[e["emoji"]] = emo
        
    print(emo, emotions[emo], n)

grief 😢😭 7660
fear 😰😱 7971
rage 😤😡 6963
love ☺😆 6489


In [55]:
nEmoji = []
dd = df[~df["removed"]]
selectedSentences = []
for idx, row in tqdm(dd.iterrows(), total=len(dd)):
    cnt = 0
    emojiInSentence = None
    for e in emoji.emoji_list(emojis_list["emotion"]):
        if e["emoji"] in row["text"]:
            cnt += 1
            emojiInSentence = e
            
    nEmoji.append(cnt)
    if cnt==1:
        r = {**row.to_dict(), **emojiInSentence, "feeling": feelings[emojiInSentence["emoji"]]}
        selectedSentences.append(r)

dd.loc[:, "nEmoji"] = nEmoji
dd[["id", "nEmoji"]].groupby("nEmoji").count()

100%|██████████████████████████████████████████████| 205897/205897 [00:17<00:00, 11694.70it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dd.loc[:, "nEmoji"] = nEmoji


Unnamed: 0_level_0,id
nEmoji,Unnamed: 1_level_1
0,178372
1,26109
2,1298
3,97
4,18
5,3


In [56]:
nd = pd.DataFrame(selectedSentences)

In [65]:
nd.groupby(["feeling"]).count()

Unnamed: 0_level_0,id,text,created_at,author_id,source,preprocessed,removed,author,match_start,match_end,emoji
feeling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fear,7055,7055,7055,7028,7055,7055,7055,27,7055,7055,7055
grief,6746,6746,6746,6296,6746,6746,6746,450,6746,6746,6746
love,6105,6105,6105,5944,6105,6105,6105,161,6105,6105,6105
rage,6203,6203,6203,6186,6203,6203,6203,17,6203,6203,6203


In [66]:
# nd.to_csv("annotated_misp/unannotated_raw.csv", index=False)

In [122]:
nd_balanced = []
for f in nd["feeling"].unique():
    rows = nd[nd["feeling"]==f].sample(n=2500)
    nd_balanced.append(rows)

nd_balanced = pd.concat(nd_balanced)

In [123]:
nd_balanced.groupby(["feeling"]).count()

Unnamed: 0_level_0,id,text,created_at,author_id,source,preprocessed,removed,author,match_start,match_end,emoji
feeling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fear,2500,2500,2500,2488,2500,2500,2500,12,2500,2500,2500
grief,2500,2500,2500,2107,2500,2500,2500,393,2500,2500,2500
love,2500,2500,2500,2352,2500,2500,2500,148,2500,2500,2500
rage,2500,2500,2500,2491,2500,2500,2500,9,2500,2500,2500


In [129]:
nd_balanced = nd_balanced.sample(frac=1)
nd_balanced.reset_index(inplace=True)
# nd_balanced.to_csv("annotated_misp/unannotated_balanced.csv", index=False)

In [131]:
# nd_balanced

In [67]:
from docx import Document 

nFile = 10
perFile = len(nd_balanced)//nFile
for i in range(nFile):

    startIdx = i*perFile
    endIdx = (i+1)*perFile
    
    document = Document()
    table = document.add_table(rows = 1 , cols = 1)
#     table.style = 'Table Grid'
    hdr_cells = table.rows[0].cells
    hdr_cells[0].text = 'ประโยค'     

    cc = 0
    for idx, row in tqdm(nd_balanced.iterrows(), total=nd_balanced.shape[0]):
        if idx < startIdx:
            continue
        if idx >= endIdx:
            continue
        
        cc += 1
        row_cells = table.add_row().cells 
        row_cells[0].text = row["preprocessed"]
#     document.save(f'unannotated_balanced_p{i}.docx')

In [138]:
nd_balanced.iloc[0:1000].groupby("feeling").count()

Unnamed: 0_level_0,level_0,index,id,text,created_at,author_id,source,preprocessed,removed,author,match_start,match_end,emoji
feeling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
fear,280,280,280,280,280,277,280,280,280,3,280,280,280
grief,232,232,232,232,232,197,232,232,232,35,232,232,232
love,263,263,263,263,263,249,263,263,263,14,263,263,263
rage,225,225,225,225,225,224,225,225,225,1,225,225,225
