### **ENVIRONMENT SETUP**

In [23]:
! pip install -q emoji transformers

[K     |████████████████████████████████| 5.5 MB 5.2 MB/s 
[K     |████████████████████████████████| 182 kB 66.5 MB/s 
[K     |████████████████████████████████| 7.6 MB 40.7 MB/s 
[?25h

In [2]:
%cd /content/
! git clone https://github.com/srivarshan-s/understanding-emojis-in-tamil-emotion-detection.git
%cd understanding-emojis-in-tamil-emotion-detection/

/content
Cloning into 'understanding-emojis-in-tamil-emotion-detection'...
remote: Enumerating objects: 138, done.[K
remote: Counting objects: 100% (138/138), done.[K
remote: Compressing objects: 100% (84/84), done.[K
remote: Total 138 (delta 40), reused 132 (delta 39), pack-reused 0[K
Receiving objects: 100% (138/138), 1.77 MiB | 11.14 MiB/s, done.
Resolving deltas: 100% (40/40), done.
/content/understanding-emojis-in-tamil-emotion-detection


### **IMPORT LIBRARIES**

In [52]:
import numpy as np
import pandas as pd
import emoji
import re
from tqdm import tqdm

import xgboost as xgb

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from transformers import AutoTokenizer, AutoModel

from sklearn.metrics import classification_report

### **STEMMER**

In [None]:
! rm input.txt
! rm output.txt
%cd snowball-with-tamil/
! make
%cd ..

### **IMPORT DATASET**

In [5]:
header_names = ["emotion", "text"]
df_train = pd.read_csv("data/ta-emotion10-train.csv", sep="\t", names=header_names)
df_dev = pd.read_csv("data/ta-emotion10-dev.csv", sep="\t", names=header_names)
df_test = pd.read_csv("data/task_a_test.csv", sep="\t", names=header_names)

In [6]:
df = pd.concat([df_train, df_dev, df_test])

In [7]:
df.head()

Unnamed: 0,emotion,text
0,Neutral,நாளைக்கு அரிசிக்கு இந்த நிலமை வந்தா 🙂
1,Anger,மானம் கேட்ட அன்புமணி
2,Neutral,தவறு இஸ்ரேல் இருக்காது இதை நான் கூறவில்லை ஹமாஸ...
3,Joy,கொங்கு நாட்டு சிங்கம் உன்மையும் நேர்மையும் உலை...
4,Neutral,இவர் யார்? ஒவ்வொரு வார்த்தையும் முன்னுக்கு பின...


In [8]:
df.describe()

Unnamed: 0,emotion,text
count,22200,22200
unique,11,22200
top,Neutral,நாளைக்கு அரிசிக்கு இந்த நிலமை வந்தா 🙂
freq,7601,1


In [9]:
df.emotion.unique()

array(['Neutral', 'Anger', 'Joy', 'Disguist', 'Trust', 'Anticipation',
       'Ambiguous', 'Love', 'Surprise', 'Sadness', 'Fear'], dtype=object)

In [10]:
pd.value_counts(df.emotion)

Neutral         7601
Joy             3394
Ambiguous       2626
Trust           1903
Disguist        1397
Anticipation    1312
Anger           1262
Sadness         1127
Love            1060
Surprise         362
Fear             156
Name: emotion, dtype: int64

### **DATA CLEANING**

In [11]:
# Remove rows without emojis

drop_idx = []

for text, idx in zip(df.text, df.index):
    if len(emoji.distinct_emoji_list(text)) == 0:
        drop_idx.append(idx)

df.drop(df.index[drop_idx], inplace=True)

df.reset_index(inplace=True, drop=True)

In [12]:
df.describe()

Unnamed: 0,emotion,text
count,1818,1818
unique,11,1818
top,Joy,அண்ணே இங்கேயும் வந்துட்டீங்களா🤣🤣🤣😂😂
freq,585,1


In [13]:
pd.value_counts(df.emotion)

Joy             585
Neutral         401
Trust           183
Love            143
Ambiguous       139
Sadness         120
Anticipation     73
Disguist         69
Anger            59
Surprise         34
Fear             12
Name: emotion, dtype: int64

In [14]:
df.head()

Unnamed: 0,emotion,text
0,Surprise,அண்ணே இங்கேயும் வந்துட்டீங்களா🤣🤣🤣😂😂
1,Ambiguous,யாருக்கு தெரியும் பொண்ணு பார்க்க கூட குடும்பத்...
2,Ambiguous,அது என்ன 🔥பனியிடை நீக்கம் பனி நீக்கம் தான் செறி 👍
3,Neutral,தி மு க விற்க்கு எனது 7 கோடி நன்றிகள்... அந்த ...
4,Love,கணவன் அமைவதெல்லாம் இறைவன் கொடுத்த வரம் ❤️


### **DATA PREPROCESSING**

In [15]:
text = df['text']
label = df['emotion']

In [16]:
le = LabelEncoder()
label = le.fit_transform(label)

In [17]:
text = text.str.replace(r"[+/#@&*$%:]", '', regex=True)
text = text.to_numpy()

In [18]:
text = text.tolist()

! rm input.txt
! rm output.txt

for i in text:
    text_file = open("input.txt", "a")
    text_file.write(i + '\n')
    text_file.close()

rm: cannot remove 'input.txt': No such file or directory
rm: cannot remove 'output.txt': No such file or directory


In [19]:
! chmod +x snowball-with-tamil/stemwords
! ./snowball-with-tamil/stemwords -l ta -i input.txt -o output.txt

In [20]:
text_file = open("output.txt", "r")

text = []

for line in text_file:
    text.append(line.strip())

In [21]:
with open('stopwords/tamil_stopwords.txt', encoding = 'utf-8') as f:
    tamil_stopwords = f.readlines()
    for i in range(len(tamil_stopwords)):
        tamil_stopwords[i] = re.sub('\n','',tamil_stopwords[i])
stopwords = tamil_stopwords

In [22]:
# Function for removing stop words
def stopwords_remove(text):
    # Appending words which are not stop words
    text = text.split(" ")
    removed = [s for s in text if s not in stopwords]  
    return removed

text = [stopwords_remove(s) for s in text]
for i in range(len(text)):
  text[i] = (" ".join(text[i]))

text = np.array(text)

### **FEATURE EXTRACTION**

In [65]:
y = label
print(y.shape)

(1818,)


In [None]:
# Load Transformer Model

tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
model = AutoModel.from_pretrained("google/muril-base-cased")

In [67]:
# Sample Output

tokenized_input = tokenizer(
        text[0],
        padding=True,
        truncation=False,
        return_tensors='pt'
    )

input_ids = tokenized_input["input_ids"]
token_type_ids = tokenized_input["token_type_ids"]
attention_mask = tokenized_input["attention_mask"]

# _, features = model(input_ids, attention_mask, token_type_ids)
_, features = model(**tokenized_input, return_dict=False)

In [68]:
features.cpu().detach().numpy().shape

(1, 768)

In [69]:
# Tokenizing Input Data

input = []

for i in text:
    tokenized_input = tokenizer(
        i,
        padding=True,
        truncation=False,
        return_tensors='pt'
    )
    input.append(tokenized_input)

In [70]:
# Embedding the Input Data

X = []

for x in tqdm(input):
    
    _, model_output = model(**x, return_dict=False)
    X.append(model_output.cpu().detach().numpy())

X = np.array(X)
X = X.reshape(X.shape[0], X.shape[2])

100%|██████████| 1818/1818 [03:38<00:00,  8.33it/s]


### **LOGISTIC REGRESSION**

In [75]:
kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = LogisticRegression()
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       139
           1       0.00      0.00      0.00        59
           2       0.00      0.00      0.00        73
           3       0.00      0.00      0.00        69
           4       0.00      0.00      0.00        12
           5       0.32      1.00      0.49       585
           6       0.00      0.00      0.00       143
           7       0.00      0.00      0.00       401
           8       0.00      0.00      0.00       120
           9       0.00      0.00      0.00        34
          10       0.00      0.00      0.00       183

    accuracy                           0.32      1818
   macro avg       0.03      0.09      0.04      1818
weighted avg       0.10      0.32      0.16      1818



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### **RANDOM FOREST**

In [76]:
kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = RandomForestClassifier()
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.23      0.02      0.04       139
           1       0.00      0.00      0.00        59
           2       0.00      0.00      0.00        73
           3       0.00      0.00      0.00        69
           4       0.00      0.00      0.00        12
           5       0.36      0.77      0.49       585
           6       0.09      0.01      0.01       143
           7       0.22      0.29      0.25       401
           8       0.67      0.02      0.03       120
           9       0.00      0.00      0.00        34
          10       0.23      0.03      0.05       183

    accuracy                           0.32      1818
   macro avg       0.16      0.10      0.08      1818
weighted avg       0.26      0.32      0.23      1818



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### **XGBOOST**

In [77]:
kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.24      0.09      0.13       139
           1       0.00      0.00      0.00        59
           2       0.00      0.00      0.00        73
           3       0.20      0.04      0.07        69
           4       0.00      0.00      0.00        12
           5       0.36      0.68      0.47       585
           6       0.09      0.01      0.02       143
           7       0.22      0.27      0.24       401
           8       0.15      0.03      0.05       120
           9       0.00      0.00      0.00        34
          10       0.19      0.09      0.12       183

    accuracy                           0.30      1818
   macro avg       0.13      0.11      0.10      1818
weighted avg       0.23      0.30      0.24      1818



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
