In [None]:
# Data cleaning and getting ready for modeling
# Mohsen Eedloo

In [1]:
# Load libraries

import numpy as np
import pandas as pd
import csv

In [2]:
# Load data and inspection

df = pd.read_csv(r"D:\SLU\AI MSc\Fall 22\NLP\train-v2.tsv", sep="\t", header=None, quoting = csv.QUOTE_NONE)
df.head()

Unnamed: 0,0,1
0,0,@USER @USER a sicrhau bod mwy o arian poced 'd...
1,1,Parti Dolig da gyda tim swyddfa canolog @USER ...
2,0,@USER yeaah ma fe yn wir. ( oh well.
3,1,@USER hahaha idk. 3am oedd y bws ti?
4,0,@USER dwim yn gal llun ohoni?


In [4]:
# Rename columns

df.rename(columns={0:"Class", 1:"Document"}, inplace=True)
df.head()

Unnamed: 0,Class,Document
0,0,@USER @USER a sicrhau bod mwy o arian poced 'd...
1,1,Parti Dolig da gyda tim swyddfa canolog @USER ...
2,0,@USER yeaah ma fe yn wir. ( oh well.
3,1,@USER hahaha idk. 3am oedd y bws ti?
4,0,@USER dwim yn gal llun ohoni?


In [5]:
# Add columns for having sad or happy emoji, and Label

df["Sad_emoji"] = None
df["Happy_emoji"] = None
df['Label'] = None
df.head()

Unnamed: 0,Class,Document,Sad_emoji,Happy_emoji,Label
0,0,@USER @USER a sicrhau bod mwy o arian poced 'd...,,,
1,1,Parti Dolig da gyda tim swyddfa canolog @USER ...,,,
2,0,@USER yeaah ma fe yn wir. ( oh well.,,,
3,1,@USER hahaha idk. 3am oedd y bws ti?,,,
4,0,@USER dwim yn gal llun ohoni?,,,


In [6]:
# Defining happy and sad emojis lists

happy_emojis = ["😀", "😃", "😄", "😁", "😆", "😅", "🤣", "😂", "🙂", "😉", "😊", "😇", "🥰", "😍", "🤩", "😘", "😗", "☺", "😚", "😙", "😋", "😛", "😜", 
"🤪", "😝", "🤗", "🤠", "🥳", "😎", "😺", "😸", "😹", "😻", "♥", "❤", "💕", "💖", "💙", "💛", "🧡", "💜", "💝", "👌", "🤟", "👊", '🍺', '💗'
"👍", "🤝", "🙏", "😬", "🤤", "💋", "❣", "✌", '<3', ':)', "👏", "🎉", '🎁', '💏', '🍰', '😏', '🙈', '👋', '🍸', '🎂', '💃', '💪🏻', '🌺', '🍷', '🍦']
sad_emojis = ["😐", "😶", "😑", "😒", "😔", "🤢", "🤮", "😕", "😖", "🥺", "😫", "😞", "😭", "😟", "😓", "😩", "😢", "🙁", "😥", 
"😣", "☹", "😰", "😤", "😡", "🤬", "😠", "💩", "😿", "😾", "🖤", "💔", "🖕", "👎", "🤦", '🚫', '💀', '🔫', ':(', '🙃']
print('Number of defined happy emojis:', len(happy_emojis))
print('Number of defined sad emojis:', len(sad_emojis))

Number of defined happy emojis: 71
Number of defined sad emojis: 39


In [7]:
# Defining two classes of "Happy" and "Sad" by emojis

Classes = {"Happy": happy_emojis, "Sad": sad_emojis}

In [8]:
len(df)

80000

In [9]:
# Finding all "Happy" and "Sad" emojis in documents

for i in range(len(df)):
    num_sad, num_happy = 0, 0   # Setting emoji counters to 0
    for h in range(len(Classes['Happy'])):  # check how many "Happy" emojis are in document
        if Classes['Happy'][h] in df.loc[i, 'Document']:
            num_happy += 1
    for s in range(len(Classes['Sad'])):    # check how many "Sad" emojis are in document
        if Classes['Sad'][s] in df.loc[i, 'Document']:
            num_sad += 1
    df.loc[i, 'Happy_emoji'] = num_happy
    df.loc[i, 'Sad_emoji'] = num_sad
df.head()

Unnamed: 0,Class,Document,Sad_emoji,Happy_emoji,Label
0,0,@USER @USER a sicrhau bod mwy o arian poced 'd...,0,0,
1,1,Parti Dolig da gyda tim swyddfa canolog @USER ...,0,0,
2,0,@USER yeaah ma fe yn wir. ( oh well.,0,0,
3,1,@USER hahaha idk. 3am oedd y bws ti?,0,0,
4,0,@USER dwim yn gal llun ohoni?,0,0,


In [10]:
len(df)

80000

In [11]:
# Save dataframe

df.to_csv('dataframe', sep='\t', header=True, index=False, encoding='utf-8')

In [12]:
# Check the number of observations per class

print('Number of observation with sad emoji:', (len(df[df['Sad_emoji']>0])))
print('Number of observation with happy emoji:', (len(df[df['Happy_emoji']>0])))

Number of observation with sad emoji: 996
Number of observation with happy emoji: 3183


In [16]:
# Dropping rows without emojis

df_clean = df.drop(df[(df['Sad_emoji'] == 0) & (df['Happy_emoji'] == 0)].index)
df_clean

Unnamed: 0,Class,Document,Sad_emoji,Happy_emoji,Label
33,0,@USER oh my days can't wait for next week <3 n...,0,1,
46,1,Penblwydd Hapus @USER gobeithio geidi ddiwrnod...,0,3,
55,0,Angen byclo fyny a sortio'n hyn allan. Failure 👎,1,0,
64,0,@USER dwi'n shwr bona'm six nations bl nesa ac...,0,3,
67,0,@USER pwy gai rhedag at efo fo mhroblema i gyd...,0,1,
...,...,...,...,...,...
79892,0,❤ @USER 🎤🎶 voice of an 😇. 👏👏. Lews weth ei fod...,0,3,
79907,1,Blwyddyn newydd dda! Happy new year! Bliadhna ...,0,2,
79994,1,Newyddion ffug 😉 #yagym {URL},0,1,
79997,0,mor hungover heddiw 🔫,1,0,


In [21]:
# We then define a tweet to have positive sentiment if it contains at least one happy emoji/emoticon, 
# and no sad emoji/emoticon, and we define negative sentiment analogously.
# Therefore, we have to drop any rows which contain both happy and sad emojis.
# So, each observation should only has value over 0 on "Happy_emoji" or "Sad_emoji", but not in both at the same time.

conf_obs = list(df_clean[(df_clean['Happy_emoji'] > 0) & (df_clean['Sad_emoji'] > 0)].index)
print('The number of rows with both sad and happy emojis:', len(conf_obs))
data = df_clean.drop(conf_obs)
data

The number of rows with both sad and happy emojis: 94


Unnamed: 0,Class,Document,Sad_emoji,Happy_emoji,Label
33,0,@USER oh my days can't wait for next week <3 n...,0,1,
46,1,Penblwydd Hapus @USER gobeithio geidi ddiwrnod...,0,3,
55,0,Angen byclo fyny a sortio'n hyn allan. Failure 👎,1,0,
64,0,@USER dwi'n shwr bona'm six nations bl nesa ac...,0,3,
67,0,@USER pwy gai rhedag at efo fo mhroblema i gyd...,0,1,
...,...,...,...,...,...
79892,0,❤ @USER 🎤🎶 voice of an 😇. 👏👏. Lews weth ei fod...,0,3,
79907,1,Blwyddyn newydd dda! Happy new year! Bliadhna ...,0,2,
79994,1,Newyddion ffug 😉 #yagym {URL},0,1,
79997,0,mor hungover heddiw 🔫,1,0,


In [23]:
# We are not going to use Class column as label. Labeling should be based on emojis.
# Setting Label column based on happy and sad emojis.
# In Label column, we set 1 if document contains happy emoji and 0 vice versa.

index_list = list(data.index)
for i in range(len(index_list)):
    if data.loc[index_list[i], 'Happy_emoji'] > data.loc[index_list[i], 'Sad_emoji']:
        data.loc[index_list[i], 'Label'] = 1
    else:
        data.loc[index_list[i], 'Label'] = 0
data

Unnamed: 0,Class,Document,Sad_emoji,Happy_emoji,Label
33,0,@USER oh my days can't wait for next week <3 n...,0,1,1
46,1,Penblwydd Hapus @USER gobeithio geidi ddiwrnod...,0,3,1
55,0,Angen byclo fyny a sortio'n hyn allan. Failure 👎,1,0,0
64,0,@USER dwi'n shwr bona'm six nations bl nesa ac...,0,3,1
67,0,@USER pwy gai rhedag at efo fo mhroblema i gyd...,0,1,1
...,...,...,...,...,...
79892,0,❤ @USER 🎤🎶 voice of an 😇. 👏👏. Lews weth ei fod...,0,3,1
79907,1,Blwyddyn newydd dda! Happy new year! Bliadhna ...,0,2,1
79994,1,Newyddion ffug 😉 #yagym {URL},0,1,1
79997,0,mor hungover heddiw 🔫,1,0,0


In [24]:
# Splitting data to train and test

training_data = data.sample(frac=0.8, random_state=42)
testing_data = data.drop(training_data.index)

In [27]:
# Saving data and train and test

training_data.to_csv('train_data', sep='\t', header=True, index=False, encoding='utf-8')
testing_data.to_csv('test_data', sep='\t', header=True, index=False, encoding='utf-8')
data.to_csv('data', sep='\t', header=True, index=False, encoding='utf-8')
