### **ENVIRONMENT SETUP**

In [1]:
! pip install -q emoji transformers

[K     |████████████████████████████████| 240 kB 23.1 MB/s 
[K     |████████████████████████████████| 5.5 MB 83.2 MB/s 
[K     |████████████████████████████████| 7.6 MB 64.4 MB/s 
[K     |████████████████████████████████| 182 kB 93.0 MB/s 
[?25h  Building wheel for emoji (setup.py) ... [?25l[?25hdone


In [2]:
%cd /content/
! git clone https://github.com/srivarshan-s/understanding-emojis-in-tamil-emotion-detection.git
%cd understanding-emojis-in-tamil-emotion-detection/

/content
Cloning into 'understanding-emojis-in-tamil-emotion-detection'...
remote: Enumerating objects: 138, done.[K
remote: Counting objects: 100% (138/138), done.[K
remote: Compressing objects: 100% (84/84), done.[K
remote: Total 138 (delta 40), reused 132 (delta 39), pack-reused 0[K
Receiving objects: 100% (138/138), 1.77 MiB | 22.69 MiB/s, done.
Resolving deltas: 100% (40/40), done.
/content/understanding-emojis-in-tamil-emotion-detection


### **IMPORT LIBRARIES**

In [3]:
import numpy as np
import pandas as pd
import emoji
import re
from tqdm import tqdm

import xgboost as xgb

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from transformers import AutoTokenizer, AutoModel

from sklearn.metrics import classification_report

### **STEMMER**

In [4]:
! rm input.txt
! rm output.txt
%cd snowball-with-tamil/
! make
%cd ..

rm: cannot remove 'input.txt': No such file or directory
rm: cannot remove 'output.txt': No such file or directory
/content/understanding-emojis-in-tamil-emotion-detection/snowball-with-tamil
cc -Iinclude -O2 -W -Wall -Wmissing-prototypes -Wmissing-declarations  -c -o compiler/space.o compiler/space.c
cc -Iinclude -O2 -W -Wall -Wmissing-prototypes -Wmissing-declarations  -c -o compiler/tokeniser.o compiler/tokeniser.c
In file included from [01m[Kcompiler/tokeniser.c:6:0[m[K:
[01m[Kcompiler/tokeniser.c:[m[K In function ‘[01m[Kread_token[m[K’:
 #define unless(C) if[01;35m[K([m[K!(C))
                     [01;35m[K^[m[K
[01m[Kcompiler/tokeniser.c:390:16:[m[K [01;36m[Knote: [m[Kin expansion of macro ‘[01m[Kunless[m[K’
                [01;36m[Kunless[m[K (t->next == 0) {
                [01;36m[K^~~~~~[m[K
[01m[Kcompiler/tokeniser.c:401:13:[m[K [01;36m[Knote: [m[Khere
             [01;36m[Kdefault[m[K:
             [01;36m[K^~~~~~~[m[K

### **IMPORT DATASET**

In [5]:
header_names = ["emotion", "text"]
df_train = pd.read_csv("data/ta-emotion10-train.csv", sep="\t", names=header_names)
df_dev = pd.read_csv("data/ta-emotion10-dev.csv", sep="\t", names=header_names)
df_test = pd.read_csv("data/task_a_test.csv", sep="\t", names=header_names)

In [6]:
df = pd.concat([df_train, df_dev, df_test])

In [7]:
df.head()

Unnamed: 0,emotion,text
0,Neutral,நாளைக்கு அரிசிக்கு இந்த நிலமை வந்தா 🙂
1,Anger,மானம் கேட்ட அன்புமணி
2,Neutral,தவறு இஸ்ரேல் இருக்காது இதை நான் கூறவில்லை ஹமாஸ...
3,Joy,கொங்கு நாட்டு சிங்கம் உன்மையும் நேர்மையும் உலை...
4,Neutral,இவர் யார்? ஒவ்வொரு வார்த்தையும் முன்னுக்கு பின...


In [8]:
df.describe()

Unnamed: 0,emotion,text
count,22200,22200
unique,11,22200
top,Neutral,நாளைக்கு அரிசிக்கு இந்த நிலமை வந்தா 🙂
freq,7601,1


In [9]:
df.emotion.unique()

array(['Neutral', 'Anger', 'Joy', 'Disguist', 'Trust', 'Anticipation',
       'Ambiguous', 'Love', 'Surprise', 'Sadness', 'Fear'], dtype=object)

In [10]:
pd.value_counts(df.emotion)

Neutral         7601
Joy             3394
Ambiguous       2626
Trust           1903
Disguist        1397
Anticipation    1312
Anger           1262
Sadness         1127
Love            1060
Surprise         362
Fear             156
Name: emotion, dtype: int64

### **DATA CLEANING**

In [11]:
# Remove rows without emojis

drop_idx = []

for text, idx in zip(df.text, df.index):
    if len(emoji.distinct_emoji_list(text)) == 0:
        drop_idx.append(idx)

df.drop(df.index[drop_idx], inplace=True)

df.reset_index(inplace=True, drop=True)

In [12]:
df.describe()

Unnamed: 0,emotion,text
count,1818,1818
unique,11,1818
top,Joy,அண்ணே இங்கேயும் வந்துட்டீங்களா🤣🤣🤣😂😂
freq,585,1


In [13]:
pd.value_counts(df.emotion)

Joy             585
Neutral         401
Trust           183
Love            143
Ambiguous       139
Sadness         120
Anticipation     73
Disguist         69
Anger            59
Surprise         34
Fear             12
Name: emotion, dtype: int64

In [14]:
df.head()

Unnamed: 0,emotion,text
0,Surprise,அண்ணே இங்கேயும் வந்துட்டீங்களா🤣🤣🤣😂😂
1,Ambiguous,யாருக்கு தெரியும் பொண்ணு பார்க்க கூட குடும்பத்...
2,Ambiguous,அது என்ன 🔥பனியிடை நீக்கம் பனி நீக்கம் தான் செறி 👍
3,Neutral,தி மு க விற்க்கு எனது 7 கோடி நன்றிகள்... அந்த ...
4,Love,கணவன் அமைவதெல்லாம் இறைவன் கொடுத்த வரம் ❤️


In [15]:
# Remove emoji from text

def remove_emoji(text):
    return emoji.replace_emoji(string=text, replace="")

df["text"] = df["text"].apply(remove_emoji)

In [16]:
df.head()

Unnamed: 0,emotion,text
0,Surprise,அண்ணே இங்கேயும் வந்துட்டீங்களா
1,Ambiguous,யாருக்கு தெரியும் பொண்ணு பார்க்க கூட குடும்பத்...
2,Ambiguous,அது என்ன பனியிடை நீக்கம் பனி நீக்கம் தான் செறி
3,Neutral,தி மு க விற்க்கு எனது 7 கோடி நன்றிகள்... அந்த ...
4,Love,கணவன் அமைவதெல்லாம் இறைவன் கொடுத்த வரம்


### **DATA PREPROCESSING**

In [17]:
text = df['text']
label = df['emotion']

In [18]:
le = LabelEncoder()
label = le.fit_transform(label)

In [19]:
text = text.str.replace(r"[+/#@&*$%:]", '', regex=True)
text = text.to_numpy()

In [20]:
text = text.tolist()

! rm input.txt
! rm output.txt

for i in text:
    text_file = open("input.txt", "a")
    text_file.write(i + '\n')
    text_file.close()

rm: cannot remove 'input.txt': No such file or directory
rm: cannot remove 'output.txt': No such file or directory


In [21]:
! chmod +x snowball-with-tamil/stemwords
! ./snowball-with-tamil/stemwords -l ta -i input.txt -o output.txt

In [22]:
text_file = open("output.txt", "r")

text = []

for line in text_file:
    text.append(line.strip())

In [23]:
with open('stopwords/tamil_stopwords.txt', encoding = 'utf-8') as f:
    tamil_stopwords = f.readlines()
    for i in range(len(tamil_stopwords)):
        tamil_stopwords[i] = re.sub('\n','',tamil_stopwords[i])
stopwords = tamil_stopwords

In [24]:
# Function for removing stop words
def stopwords_remove(text):
    # Appending words which are not stop words
    text = text.split(" ")
    removed = [s for s in text if s not in stopwords]  
    return removed

text = [stopwords_remove(s) for s in text]
for i in range(len(text)):
  text[i] = (" ".join(text[i]))

text = np.array(text)

### **FEATURE EXTRACTION**

In [25]:
y = label
print(y.shape)

(1818,)


In [26]:
# Load Transformer Model

tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
model = AutoModel.from_pretrained("google/muril-base-cased")

Downloading:   0%|          | 0.00/206 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/113 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/muril-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
# Sample Output

tokenized_input = tokenizer(
        text[0],
        padding=True,
        truncation=False,
        return_tensors='pt'
    )

input_ids = tokenized_input["input_ids"]
token_type_ids = tokenized_input["token_type_ids"]
attention_mask = tokenized_input["attention_mask"]

# _, features = model(input_ids, attention_mask, token_type_ids)
_, features = model(**tokenized_input, return_dict=False)

In [28]:
features.cpu().detach().numpy().shape

(1, 768)

In [29]:
# Tokenizing Input Data

input = []

for i in text:
    tokenized_input = tokenizer(
        i,
        padding=True,
        truncation=False,
        return_tensors='pt'
    )
    input.append(tokenized_input)

In [30]:
# Embedding the Input Data

X = []

for x in tqdm(input):
    
    _, model_output = model(**x, return_dict=False)
    X.append(model_output.cpu().detach().numpy())

X = np.array(X)
X = X.reshape(X.shape[0], X.shape[2])

100%|██████████| 1818/1818 [04:38<00:00,  6.54it/s]


### **LOGISTIC REGRESSION**

In [31]:
kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = LogisticRegression()
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       139
           1       0.00      0.00      0.00        59
           2       0.00      0.00      0.00        73
           3       0.00      0.00      0.00        69
           4       0.00      0.00      0.00        12
           5       0.32      1.00      0.49       585
           6       0.00      0.00      0.00       143
           7       0.00      0.00      0.00       401
           8       0.00      0.00      0.00       120
           9       0.00      0.00      0.00        34
          10       0.00      0.00      0.00       183

    accuracy                           0.32      1818
   macro avg       0.03      0.09      0.04      1818
weighted avg       0.10      0.32      0.16      1818



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### **RANDOM FOREST**

In [32]:
kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = RandomForestClassifier()
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.07      0.01      0.01       139
           1       0.00      0.00      0.00        59
           2       0.00      0.00      0.00        73
           3       0.00      0.00      0.00        69
           4       0.00      0.00      0.00        12
           5       0.34      0.76      0.47       585
           6       0.12      0.01      0.03       143
           7       0.20      0.23      0.22       401
           8       1.00      0.01      0.02       120
           9       0.00      0.00      0.00        34
          10       0.25      0.03      0.06       183

    accuracy                           0.30      1818
   macro avg       0.18      0.10      0.07      1818
weighted avg       0.26      0.30      0.21      1818



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### **XGBOOST**

In [33]:
kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.20      0.07      0.11       139
           1       0.00      0.00      0.00        59
           2       0.00      0.00      0.00        73
           3       0.25      0.06      0.09        69
           4       0.00      0.00      0.00        12
           5       0.36      0.69      0.47       585
           6       0.23      0.06      0.09       143
           7       0.25      0.31      0.28       401
           8       0.13      0.03      0.04       120
           9       0.00      0.00      0.00        34
          10       0.25      0.09      0.13       183

    accuracy                           0.31      1818
   macro avg       0.15      0.12      0.11      1818
weighted avg       0.25      0.31      0.25      1818



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
