### **ENVIRONMENT SETUP**

In [1]:
! pip install -q emoji

[?25l[K     |█▍                              | 10 kB 16.0 MB/s eta 0:00:01[K     |██▊                             | 20 kB 9.2 MB/s eta 0:00:01[K     |████                            | 30 kB 12.3 MB/s eta 0:00:01[K     |█████▍                          | 40 kB 4.8 MB/s eta 0:00:01[K     |██████▉                         | 51 kB 4.8 MB/s eta 0:00:01[K     |████████▏                       | 61 kB 5.6 MB/s eta 0:00:01[K     |█████████▌                      | 71 kB 5.7 MB/s eta 0:00:01[K     |██████████▉                     | 81 kB 5.6 MB/s eta 0:00:01[K     |████████████▎                   | 92 kB 6.1 MB/s eta 0:00:01[K     |█████████████▋                  | 102 kB 5.3 MB/s eta 0:00:01[K     |███████████████                 | 112 kB 5.3 MB/s eta 0:00:01[K     |████████████████▎               | 122 kB 5.3 MB/s eta 0:00:01[K     |█████████████████▊              | 133 kB 5.3 MB/s eta 0:00:01[K     |███████████████████             | 143 kB 5.3 MB/s eta 0:00:01[K   

In [2]:
%cd /content/
! git clone https://github.com/srivarshan-s/understanding-emojis-in-tamil-emotion-detection.git
%cd understanding-emojis-in-tamil-emotion-detection/

/content
Cloning into 'understanding-emojis-in-tamil-emotion-detection'...
remote: Enumerating objects: 109, done.[K
remote: Counting objects: 100% (109/109), done.[K
remote: Compressing objects: 100% (72/72), done.[K
remote: Total 109 (delta 24), reused 104 (delta 23), pack-reused 0[K
Receiving objects: 100% (109/109), 1.74 MiB | 9.37 MiB/s, done.
Resolving deltas: 100% (24/24), done.
/content/understanding-emojis-in-tamil-emotion-detection


### **IMPORT LIBRARIES**

In [3]:
import numpy as np
import pandas as pd
import emoji
import re

import xgboost as xgb

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report

### **STEMMER**

In [4]:
! rm input.txt
! rm output.txt
%cd snowball-with-tamil/
! make
%cd ..

rm: cannot remove 'input.txt': No such file or directory
rm: cannot remove 'output.txt': No such file or directory
/content/understanding-emojis-in-tamil-emotion-detection/snowball-with-tamil
cc -Iinclude -O2 -W -Wall -Wmissing-prototypes -Wmissing-declarations  -c -o compiler/space.o compiler/space.c
cc -Iinclude -O2 -W -Wall -Wmissing-prototypes -Wmissing-declarations  -c -o compiler/tokeniser.o compiler/tokeniser.c
In file included from [01m[Kcompiler/tokeniser.c:6:0[m[K:
[01m[Kcompiler/tokeniser.c:[m[K In function ‘[01m[Kread_token[m[K’:
 #define unless(C) if[01;35m[K([m[K!(C))
                     [01;35m[K^[m[K
[01m[Kcompiler/tokeniser.c:390:16:[m[K [01;36m[Knote: [m[Kin expansion of macro ‘[01m[Kunless[m[K’
                [01;36m[Kunless[m[K (t->next == 0) {
                [01;36m[K^~~~~~[m[K
[01m[Kcompiler/tokeniser.c:401:13:[m[K [01;36m[Knote: [m[Khere
             [01;36m[Kdefault[m[K:
             [01;36m[K^~~~~~~[m[K

### **IMPORT DATASET**

In [5]:
header_names = ["emotion", "text"]
df_train = pd.read_csv("data/ta-emotion10-train.csv", sep="\t", names=header_names)
df_dev = pd.read_csv("data/ta-emotion10-dev.csv", sep="\t", names=header_names)
df_test = pd.read_csv("data/task_a_test.csv", sep="\t", names=header_names)

In [6]:
df = pd.concat([df_train, df_dev, df_test])

In [7]:
df.head()

Unnamed: 0,emotion,text
0,Neutral,நாளைக்கு அரிசிக்கு இந்த நிலமை வந்தா 🙂
1,Anger,மானம் கேட்ட அன்புமணி
2,Neutral,தவறு இஸ்ரேல் இருக்காது இதை நான் கூறவில்லை ஹமாஸ...
3,Joy,கொங்கு நாட்டு சிங்கம் உன்மையும் நேர்மையும் உலை...
4,Neutral,இவர் யார்? ஒவ்வொரு வார்த்தையும் முன்னுக்கு பின...


In [8]:
df.describe()

Unnamed: 0,emotion,text
count,22200,22200
unique,11,22200
top,Neutral,நாளைக்கு அரிசிக்கு இந்த நிலமை வந்தா 🙂
freq,7601,1


In [9]:
df.emotion.unique()

array(['Neutral', 'Anger', 'Joy', 'Disguist', 'Trust', 'Anticipation',
       'Ambiguous', 'Love', 'Surprise', 'Sadness', 'Fear'], dtype=object)

In [10]:
pd.value_counts(df.emotion)

Neutral         7601
Joy             3394
Ambiguous       2626
Trust           1903
Disguist        1397
Anticipation    1312
Anger           1262
Sadness         1127
Love            1060
Surprise         362
Fear             156
Name: emotion, dtype: int64

### **DATA CLEANING**

In [11]:
# Remove rows without emojis

drop_idx = []

for text, idx in zip(df.text, df.index):
    if len(emoji.distinct_emoji_list(text)) == 0:
        drop_idx.append(idx)

df.drop(df.index[drop_idx], inplace=True)

df.reset_index(inplace=True, drop=True)

In [12]:
df.describe()

Unnamed: 0,emotion,text
count,1818,1818
unique,11,1818
top,Joy,அண்ணே இங்கேயும் வந்துட்டீங்களா🤣🤣🤣😂😂
freq,585,1


In [13]:
pd.value_counts(df.emotion)

Joy             585
Neutral         401
Trust           183
Love            143
Ambiguous       139
Sadness         120
Anticipation     73
Disguist         69
Anger            59
Surprise         34
Fear             12
Name: emotion, dtype: int64

In [14]:
df.head()

Unnamed: 0,emotion,text
0,Surprise,அண்ணே இங்கேயும் வந்துட்டீங்களா🤣🤣🤣😂😂
1,Ambiguous,யாருக்கு தெரியும் பொண்ணு பார்க்க கூட குடும்பத்...
2,Ambiguous,அது என்ன 🔥பனியிடை நீக்கம் பனி நீக்கம் தான் செறி 👍
3,Neutral,தி மு க விற்க்கு எனது 7 கோடி நன்றிகள்... அந்த ...
4,Love,கணவன் அமைவதெல்லாம் இறைவன் கொடுத்த வரம் ❤️


### **DATA PREPROCESSING**

In [15]:
text = df['text']
label = df['emotion']

In [16]:
le = LabelEncoder()
label = le.fit_transform(label)

In [17]:
text = text.str.replace(r"[+/#@&*$%:]", '', regex=True)
text = text.to_numpy()

In [18]:
text = text.tolist()

! rm input.txt
! rm output.txt

for i in text:
    text_file = open("input.txt", "a")
    text_file.write(i + '\n')
    text_file.close()

rm: cannot remove 'input.txt': No such file or directory
rm: cannot remove 'output.txt': No such file or directory


In [19]:
! chmod +x snowball-with-tamil/stemwords
! ./snowball-with-tamil/stemwords -l ta -i input.txt -o output.txt

In [20]:
text_file = open("output.txt", "r")

text = []

for line in text_file:
    text.append(line.strip())

In [21]:
with open('stopwords/tamil_stopwords.txt', encoding = 'utf-8') as f:
    tamil_stopwords = f.readlines()
    for i in range(len(tamil_stopwords)):
        tamil_stopwords[i] = re.sub('\n','',tamil_stopwords[i])
stopwords = tamil_stopwords

In [22]:
# Function for removing stop words
def stopwords_remove(text):
    # Appending words which are not stop words
    text = text.split(" ")
    removed = [s for s in text if s not in stopwords]  
    return removed

text = [stopwords_remove(s) for s in text]
for i in range(len(text)):
  text[i] = (" ".join(text[i]))

text = np.array(text)

### **FEATURE EXTRACTION**

In [23]:
vectorizer = TfidfVectorizer(min_df = 5)
X = vectorizer.fit_transform(text)
X = X.toarray()
print(X.shape)

y = label
print(y.shape)

(1818, 318)
(1818,)


### **LOGISTIC REGRESSION**

In [24]:
# # Gridsearch

# parameters = {
#     "penalty": ["l1", "l2", "elasticnet", "none"],
#     "dual": [True, False],
#     "C": [1, 0.1, 0.01],
#     "fit_intercept": [True, False],
#     "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
# }

# model = LogisticRegression()

# grid_search = GridSearchCV(model, parameters, n_jobs=-1)
# grid_search.fit(X, y)

# grid_search.best_params_

In [25]:
kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = LogisticRegression()
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       139
           1       0.00      0.00      0.00        59
           2       0.00      0.00      0.00        73
           3       0.00      0.00      0.00        69
           4       0.00      0.00      0.00        12
           5       0.35      0.70      0.47       585
           6       0.17      0.02      0.04       143
           7       0.21      0.28      0.24       401
           8       0.28      0.06      0.10       120
           9       0.00      0.00      0.00        34
          10       0.34      0.13      0.19       183

    accuracy                           0.31      1818
   macro avg       0.12      0.11      0.09      1818
weighted avg       0.23      0.31      0.23      1818



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### **SUPPORT VECTOR MACHINE**

In [26]:
kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = SVC()
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       139
           1       0.00      0.00      0.00        59
           2       0.00      0.00      0.00        73
           3       0.00      0.00      0.00        69
           4       0.00      0.00      0.00        12
           5       0.34      0.77      0.47       585
           6       0.15      0.01      0.03       143
           7       0.22      0.23      0.22       401
           8       0.42      0.04      0.08       120
           9       0.00      0.00      0.00        34
          10       0.28      0.07      0.11       183

    accuracy                           0.31      1818
   macro avg       0.13      0.10      0.08      1818
weighted avg       0.23      0.31      0.22      1818



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### **NAIVE BAYES**

In [27]:
kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = GaussianNB()
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.11      0.09      0.10       139
           1       0.06      0.22      0.10        59
           2       0.07      0.19      0.10        73
           3       0.08      0.16      0.11        69
           4       0.01      0.08      0.02        12
           5       0.24      0.05      0.08       585
           6       0.14      0.34      0.19       143
           7       0.19      0.07      0.10       401
           8       0.20      0.20      0.20       120
           9       0.06      0.29      0.10        34
          10       0.17      0.13      0.15       183

    accuracy                           0.12      1818
   macro avg       0.12      0.17      0.11      1818
weighted avg       0.18      0.12      0.11      1818



### **STOCHASTIC GRADIENT DESCENT**

In [28]:
kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = SGDClassifier()
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.10      0.06      0.08       139
           1       0.11      0.08      0.10        59
           2       0.00      0.00      0.00        73
           3       0.13      0.14      0.14        69
           4       0.08      0.08      0.08        12
           5       0.39      0.56      0.46       585
           6       0.07      0.03      0.05       143
           7       0.23      0.19      0.21       401
           8       0.24      0.23      0.23       120
           9       0.08      0.06      0.07        34
          10       0.21      0.20      0.21       183

    accuracy                           0.27      1818
   macro avg       0.15      0.15      0.15      1818
weighted avg       0.24      0.27      0.25      1818



### **K NEAREST NEIGHBOURS**

In [29]:
kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = KNeighborsClassifier()
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.07      0.11      0.08       139
           1       0.08      0.08      0.08        59
           2       0.08      0.01      0.02        73
           3       0.22      0.06      0.09        69
           4       0.00      0.00      0.00        12
           5       0.37      0.55      0.44       585
           6       0.19      0.07      0.10       143
           7       0.23      0.25      0.24       401
           8       0.19      0.04      0.07       120
           9       0.00      0.00      0.00        34
          10       0.18      0.07      0.10       183

    accuracy                           0.26      1818
   macro avg       0.15      0.11      0.11      1818
weighted avg       0.23      0.26      0.23      1818



### **DECISION TREE**

In [30]:
kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.15      0.13      0.14       139
           1       0.08      0.07      0.07        59
           2       0.02      0.01      0.01        73
           3       0.06      0.04      0.05        69
           4       0.10      0.08      0.09        12
           5       0.39      0.49      0.43       585
           6       0.10      0.08      0.09       143
           7       0.22      0.24      0.23       401
           8       0.12      0.10      0.11       120
           9       0.10      0.06      0.07        34
          10       0.18      0.14      0.16       183

    accuracy                           0.25      1818
   macro avg       0.14      0.13      0.13      1818
weighted avg       0.23      0.25      0.24      1818



### **RANDOM FOREST**

In [31]:
kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = RandomForestClassifier()
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.15      0.06      0.09       139
           1       0.05      0.02      0.02        59
           2       0.00      0.00      0.00        73
           3       0.07      0.01      0.02        69
           4       0.00      0.00      0.00        12
           5       0.36      0.62      0.46       585
           6       0.08      0.03      0.04       143
           7       0.23      0.29      0.25       401
           8       0.30      0.15      0.20       120
           9       0.00      0.00      0.00        34
          10       0.17      0.08      0.10       183

    accuracy                           0.29      1818
   macro avg       0.13      0.11      0.11      1818
weighted avg       0.23      0.29      0.24      1818



### **XGBOOST**

In [32]:
kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.14      0.04      0.07       139
           1       0.10      0.02      0.03        59
           2       0.00      0.00      0.00        73
           3       0.20      0.01      0.03        69
           4       0.00      0.00      0.00        12
           5       0.35      0.78      0.48       585
           6       0.09      0.02      0.03       143
           7       0.26      0.20      0.22       401
           8       0.29      0.07      0.11       120
           9       0.00      0.00      0.00        34
          10       0.23      0.09      0.13       183

    accuracy                           0.31      1818
   macro avg       0.15      0.11      0.10      1818
weighted avg       0.24      0.31      0.23      1818

