In [1]:
import numpy as np
import pandas as pd
import csv

In [2]:
# Load data and inspection

df = pd.read_csv(r"D:\SLU\AI MSc\Fall 22\NLP\train-v2.tsv", sep="\t", header=None, quoting = csv.QUOTE_NONE)
df.head()

Unnamed: 0,0,1
0,0,@USER @USER a sicrhau bod mwy o arian poced 'd...
1,1,Parti Dolig da gyda tim swyddfa canolog @USER ...
2,0,@USER yeaah ma fe yn wir. ( oh well.
3,1,@USER hahaha idk. 3am oedd y bws ti?
4,0,@USER dwim yn gal llun ohoni?


In [3]:
# Rename columns

df.rename(columns={0:"Class", 1:"Document"}, inplace=True)
df.head()

Unnamed: 0,Class,Document
0,0,@USER @USER a sicrhau bod mwy o arian poced 'd...
1,1,Parti Dolig da gyda tim swyddfa canolog @USER ...
2,0,@USER yeaah ma fe yn wir. ( oh well.
3,1,@USER hahaha idk. 3am oedd y bws ti?
4,0,@USER dwim yn gal llun ohoni?


In [4]:
# Add columns for having sad or happy emoji

df["Sad_emoji"] = None
df["Happy_emoji"] = None
df.head()

Unnamed: 0,Class,Document,Sad_emoji,Happy_emoji
0,0,@USER @USER a sicrhau bod mwy o arian poced 'd...,,
1,1,Parti Dolig da gyda tim swyddfa canolog @USER ...,,
2,0,@USER yeaah ma fe yn wir. ( oh well.,,
3,1,@USER hahaha idk. 3am oedd y bws ti?,,
4,0,@USER dwim yn gal llun ohoni?,,


In [5]:
# Defining happy and sad emojis lists

happy_emojis = ["😀", "😃", "😄", "😁", "😆", "😅", "🤣", "😂", "🙂", "😉", "😊", "😇", "🥰", "😍", "🤩", "😘", "😗", "☺", "😚", "😙", "😋", "😛", "😜", 
"🤪", "😝", "🤗", "🤠", "🥳", "😎", "😺", "😸", "😹", "😻", "♥", "❤", "💕", "💖", "💙", "💛", "🧡", "💜", "💝", "👌", "🤟", "👊", '🍺', '💗'
"👍", "🤝", "🙏", "😬", "🤤", "💋", "❣", "✌", '<3', ':)', "👏", "🎉", '🎁', '💏', '🍰', '😏', '🙈', '👋', '🍸', '🎂', '💃', '💪🏻', '🌺', '🍷', '🍦']
sad_emojis = ["😐", "😶", "😑", "😒", "😔", "🤢", "🤮", "😕", "😖", "🥺", "😫", "😞", "😭", "😟", "😓", "😩", "😢", "🙁", "😥", 
"😣", "☹", "😰", "😤", "😡", "🤬", "😠", "💩", "😿", "😾", "🖤", "💔", "🖕", "👎", "🤦", '🚫', '💀', '🔫', ':(', '🙃']

In [6]:
# Defining two classes of "Happy" and "Sad" by emojis

Classes = {"Happy": happy_emojis, "Sad": sad_emojis}

In [7]:
# Finding all "Happy" and "Sad" emojis in documents

for i in range(len(df)):
    num_sad, num_happy = 0, 0   # Setting emoji counters to 0
    for h in range(len(Classes['Happy'])):  # check how many "Happy" emojis are in document
        if Classes['Happy'][h] in df.loc[i, 'Document']:
            num_happy += 1
    for s in range(len(Classes['Sad'])):    # check how many "Sad" emojis are in document
        if Classes['Sad'][s] in df.loc[i, 'Document']:
            num_sad += 1
    df.loc[i, 'Happy_emoji'] = num_happy
    df.loc[i, 'Sad_emoji'] = num_sad
df.head()

Unnamed: 0,Class,Document,Sad_emoji,Happy_emoji
0,0,@USER @USER a sicrhau bod mwy o arian poced 'd...,0,0
1,1,Parti Dolig da gyda tim swyddfa canolog @USER ...,0,0
2,0,@USER yeaah ma fe yn wir. ( oh well.,0,0
3,1,@USER hahaha idk. 3am oedd y bws ti?,0,0
4,0,@USER dwim yn gal llun ohoni?,0,0


In [8]:
df.to_csv('dataframe', sep='\t', header=True, index=False, encoding='utf-8')

In [14]:
# Check the number of observations per class

print('Number of observation with sad emoji:', (len(df[df['Sad_emoji']>0])))
print('Number of observation with happy emoji:', (len(df[df['Happy_emoji']>0])))

Number of observation with sad emoji: 996
Number of observation with happy emoji: 8809


In [15]:
# Dropping rows without emojis

df_clean = df.drop(df[(df.Sad_emoji == 0) & (df.Happy_emoji == 0)].index)
df_clean

Unnamed: 0,Class,Document,Sad_emoji,Happy_emoji
8,1,"@USER mor browd, fe fyddai yn dangos ti faint ...",0,2
13,0,@USER hm dwi'r un peth! genai cur yn pen 24/7 ...,0,1
33,0,@USER oh my days can't wait for next week <3 n...,0,1
43,1,@USER Will do gobeithio aeth heddi'n good nos ...,0,2
46,1,Penblwydd Hapus @USER gobeithio geidi ddiwrnod...,0,3
...,...,...,...,...
79974,1,@USER chi'n dod laawr boxing day? Mam-gu yn do...,0,2
79980,0,"@USER yeeep boo, guesso ti ddim yn ysgol fyd?xx",0,1
79994,1,Newyddion ffug 😉 #yagym {URL},0,1
79997,0,mor hungover heddiw 🔫,1,0


In [16]:
# We then define a tweet to have positive sentiment if it contains at least one happy emoji/emoticon, 
# and no sad emoji/emoticon, and we define negative sentiment analogously.
# Therefore, we have to drop any rows which contain both happy and sad emojis.

conf_obs = list(df_clean[(df_clean['Happy_emoji'] > 0) & (df_clean['Sad_emoji'] > 0)].index)
print('The number of rows with both sad and happy emojis:', len(conf_obs))
data = df_clean.drop(conf_obs)
data

The number of rows with both sad and happy emojis: 116


Unnamed: 0,Class,Document,Sad_emoji,Happy_emoji
8,1,"@USER mor browd, fe fyddai yn dangos ti faint ...",0,2
13,0,@USER hm dwi'r un peth! genai cur yn pen 24/7 ...,0,1
33,0,@USER oh my days can't wait for next week <3 n...,0,1
43,1,@USER Will do gobeithio aeth heddi'n good nos ...,0,2
46,1,Penblwydd Hapus @USER gobeithio geidi ddiwrnod...,0,3
...,...,...,...,...
79974,1,@USER chi'n dod laawr boxing day? Mam-gu yn do...,0,2
79980,0,"@USER yeeep boo, guesso ti ddim yn ysgol fyd?xx",0,1
79994,1,Newyddion ffug 😉 #yagym {URL},0,1
79997,0,mor hungover heddiw 🔫,1,0


In [17]:
# Splitting data to train and test

training_data = data.sample(frac=0.8, random_state=42)
testing_data = data.drop(training_data.index)

In [18]:
training_data

Unnamed: 0,Class,Document,Sad_emoji,Happy_emoji
51656,1,@USER hyna brilliant omg na dim wimp o gwbwl!!...,0,2
12194,1,@USER yeaaa fod yn au pair am flwyddyn sut ma ...,0,2
41110,0,@USER shwt ma pethe mas na? Met anyone famous ...,0,2
70473,0,@USER misso ti fyd rhaid mi neud rhywbeeeth! o...,0,1
74721,1,@USER neshi ddeutha hedd Wyn yn cae Gyna fysa ...,0,1
...,...,...,...,...
3224,0,Exam cynta' fory a DIM motivation i refeiso he...,2,0
142,1,@USER inooooo dwin yn gwbod sut I :P <3<3<3,0,1
7812,0,@USER just byta yn iach byth yn para efo fi ha...,0,1
64845,0,@USER aww babes dwi adra ers 6! Tisho dod lawr...,0,2


In [19]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df , test_size=10000, random_state=42)
train, val = train_test_split(train, test_size=10000, random_state=42)

29982

10000

20
20


40