### **ENVIRONMENT SETUP**

In [1]:
! pip install -q emoji

[?25l[K     |█▍                              | 10 kB 20.4 MB/s eta 0:00:01[K     |██▊                             | 20 kB 7.1 MB/s eta 0:00:01[K     |████                            | 30 kB 9.9 MB/s eta 0:00:01[K     |█████▍                          | 40 kB 4.6 MB/s eta 0:00:01[K     |██████▉                         | 51 kB 4.8 MB/s eta 0:00:01[K     |████████▏                       | 61 kB 5.7 MB/s eta 0:00:01[K     |█████████▌                      | 71 kB 5.7 MB/s eta 0:00:01[K     |██████████▉                     | 81 kB 5.6 MB/s eta 0:00:01[K     |████████████▎                   | 92 kB 6.2 MB/s eta 0:00:01[K     |█████████████▋                  | 102 kB 5.3 MB/s eta 0:00:01[K     |███████████████                 | 112 kB 5.3 MB/s eta 0:00:01[K     |████████████████▎               | 122 kB 5.3 MB/s eta 0:00:01[K     |█████████████████▊              | 133 kB 5.3 MB/s eta 0:00:01[K     |███████████████████             | 143 kB 5.3 MB/s eta 0:00:01[K    

In [2]:
%cd /content/
! git clone https://github.com/srivarshan-s/understanding-emojis-in-tamil-emotion-detection.git
%cd understanding-emojis-in-tamil-emotion-detection/

/content
Cloning into 'understanding-emojis-in-tamil-emotion-detection'...
remote: Enumerating objects: 122, done.[K
remote: Counting objects: 100% (122/122), done.[K
remote: Compressing objects: 100% (79/79), done.[K
remote: Total 122 (delta 29), reused 116 (delta 28), pack-reused 0[K
Receiving objects: 100% (122/122), 1.76 MiB | 2.35 MiB/s, done.
Resolving deltas: 100% (29/29), done.
/content/understanding-emojis-in-tamil-emotion-detection


### **IMPORT LIBRARIES**

In [3]:
import numpy as np
import pandas as pd
import emoji
import re

import xgboost as xgb

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report

### **STEMMER**

In [4]:
! rm input.txt
! rm output.txt
%cd snowball-with-tamil/
! make
%cd ..

rm: cannot remove 'input.txt': No such file or directory
rm: cannot remove 'output.txt': No such file or directory
/content/understanding-emojis-in-tamil-emotion-detection/snowball-with-tamil
cc -Iinclude -O2 -W -Wall -Wmissing-prototypes -Wmissing-declarations  -c -o compiler/space.o compiler/space.c
cc -Iinclude -O2 -W -Wall -Wmissing-prototypes -Wmissing-declarations  -c -o compiler/tokeniser.o compiler/tokeniser.c
In file included from [01m[Kcompiler/tokeniser.c:6:0[m[K:
[01m[Kcompiler/tokeniser.c:[m[K In function ‘[01m[Kread_token[m[K’:
 #define unless(C) if[01;35m[K([m[K!(C))
                     [01;35m[K^[m[K
[01m[Kcompiler/tokeniser.c:390:16:[m[K [01;36m[Knote: [m[Kin expansion of macro ‘[01m[Kunless[m[K’
                [01;36m[Kunless[m[K (t->next == 0) {
                [01;36m[K^~~~~~[m[K
[01m[Kcompiler/tokeniser.c:401:13:[m[K [01;36m[Knote: [m[Khere
             [01;36m[Kdefault[m[K:
             [01;36m[K^~~~~~~[m[K

### **IMPORT DATASET**

In [5]:
header_names = ["emotion", "text"]
df_train = pd.read_csv("data/ta-emotion10-train.csv", sep="\t", names=header_names)
df_dev = pd.read_csv("data/ta-emotion10-dev.csv", sep="\t", names=header_names)
df_test = pd.read_csv("data/task_a_test.csv", sep="\t", names=header_names)

In [6]:
# 17760
df = pd.concat([df_train, df_dev, df_test])

In [7]:
df.head()

Unnamed: 0,emotion,text
0,Neutral,நாளைக்கு அரிசிக்கு இந்த நிலமை வந்தா 🙂
1,Anger,மானம் கேட்ட அன்புமணி
2,Neutral,தவறு இஸ்ரேல் இருக்காது இதை நான் கூறவில்லை ஹமாஸ...
3,Joy,கொங்கு நாட்டு சிங்கம் உன்மையும் நேர்மையும் உலை...
4,Neutral,இவர் யார்? ஒவ்வொரு வார்த்தையும் முன்னுக்கு பின...


In [8]:
df.describe()

Unnamed: 0,emotion,text
count,22200,22200
unique,11,22200
top,Neutral,நாளைக்கு அரிசிக்கு இந்த நிலமை வந்தா 🙂
freq,7601,1


In [9]:
df.emotion.unique()

array(['Neutral', 'Anger', 'Joy', 'Disguist', 'Trust', 'Anticipation',
       'Ambiguous', 'Love', 'Surprise', 'Sadness', 'Fear'], dtype=object)

In [10]:
pd.value_counts(df.emotion)

Neutral         7601
Joy             3394
Ambiguous       2626
Trust           1903
Disguist        1397
Anticipation    1312
Anger           1262
Sadness         1127
Love            1060
Surprise         362
Fear             156
Name: emotion, dtype: int64

### **DATA CLEANING**

In [11]:
# # Remove rows without emojis

# drop_idx = []

# for text, idx in zip(df.text, df.index):
#     if len(emoji.distinct_emoji_list(text)) == 0:
#         drop_idx.append(idx)

# df.drop(df.index[drop_idx], inplace=True)

# df.reset_index(inplace=True, drop=True)

In [12]:
df.describe()

Unnamed: 0,emotion,text
count,22200,22200
unique,11,22200
top,Neutral,நாளைக்கு அரிசிக்கு இந்த நிலமை வந்தா 🙂
freq,7601,1


In [13]:
pd.value_counts(df.emotion)

Neutral         7601
Joy             3394
Ambiguous       2626
Trust           1903
Disguist        1397
Anticipation    1312
Anger           1262
Sadness         1127
Love            1060
Surprise         362
Fear             156
Name: emotion, dtype: int64

In [14]:
df.head()

Unnamed: 0,emotion,text
0,Neutral,நாளைக்கு அரிசிக்கு இந்த நிலமை வந்தா 🙂
1,Anger,மானம் கேட்ட அன்புமணி
2,Neutral,தவறு இஸ்ரேல் இருக்காது இதை நான் கூறவில்லை ஹமாஸ...
3,Joy,கொங்கு நாட்டு சிங்கம் உன்மையும் நேர்மையும் உலை...
4,Neutral,இவர் யார்? ஒவ்வொரு வார்த்தையும் முன்னுக்கு பின...


### **DATA PREPROCESSING**

In [15]:
text = df['text']
label = df['emotion']

In [16]:
le = LabelEncoder()
label = le.fit_transform(label)

In [17]:
text = text.str.replace(r"[+/#@&*$%:]", '', regex=True)
text = text.to_numpy()

In [18]:
text = text.tolist()

! rm input.txt
! rm output.txt

for i in text:
    text_file = open("input.txt", "a")
    text_file.write(i + '\n')
    text_file.close()

rm: cannot remove 'input.txt': No such file or directory
rm: cannot remove 'output.txt': No such file or directory


In [19]:
! chmod +x snowball-with-tamil/stemwords
! ./snowball-with-tamil/stemwords -l ta -i input.txt -o output.txt

In [20]:
text_file = open("output.txt", "r")

text = []

for line in text_file:
    text.append(line.strip())

In [21]:
with open('stopwords/tamil_stopwords.txt', encoding = 'utf-8') as f:
    tamil_stopwords = f.readlines()
    for i in range(len(tamil_stopwords)):
        tamil_stopwords[i] = re.sub('\n','',tamil_stopwords[i])
stopwords = tamil_stopwords

In [22]:
# Function for removing stop words
def stopwords_remove(text):
    # Appending words which are not stop words
    text = text.split(" ")
    removed = [s for s in text if s not in stopwords]  
    return removed

text = [stopwords_remove(s) for s in text]
for i in range(len(text)):
  text[i] = (" ".join(text[i]))

text = np.array(text)

### **FEATURE EXTRACTION**

In [23]:
vectorizer = TfidfVectorizer(min_df = 5)
X = vectorizer.fit_transform(text)
X = X.toarray()
print(X.shape)

y = label
print(y.shape)

(22200, 1111)
(22200,)


### **TRAIN-TEST SPLIT**

In [24]:
X_train = X[:17760]
X_test = X[17760:]

y_train = y[:17760]
y_test = y[17760:]

### **LOGISTIC REGRESSION**

In [25]:
# {'C': 1,
#  'dual': False,
#  'fit_intercept': False,
#  'penalty': 'l2',
#  'solver': 'newton-cg'}

# model = LogisticRegression()
model = LogisticRegression(C=1, dual=False, fit_intercept=False, 
                            penalty="l2", solver="newton-cg")
model.fit(X_train, y_train)

pred = model.predict(X_test).tolist()

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.25      0.20      0.22       500
           1       0.20      0.08      0.11       244
           2       0.15      0.03      0.04       271
           3       0.18      0.05      0.08       277
           4       0.50      0.06      0.11        33
           5       0.48      0.50      0.49       702
           6       0.09      0.03      0.04       196
           7       0.38      0.66      0.48      1538
           8       0.27      0.10      0.15       241
           9       0.00      0.00      0.00        61
          10       0.24      0.14      0.18       377

    accuracy                           0.36      4440
   macro avg       0.25      0.17      0.17      4440
weighted avg       0.31      0.36      0.31      4440



### **SUPPORT VECTOR MACHINE**

In [26]:
# # Gridsearch

# parameters = {
#     "C": [1, 0.1, 0.01],
#     "kernel": ["linear", "poly", "rbf", "sigmoid"],
#     "degree": [2, 3, 4],
#     "gamma": ["scale", "auto"],
#     "shrinking": [True, False],
#     "probability": [True, False],
#     "decision_function_shape": ["ovo", "ovr"],
#     "break_ties": [True, False],
# }

# model = SVC()

# grid_search = GridSearchCV(model, parameters, n_jobs=-1, scoring="f1_weighted")
# grid_search.fit(X, y)

# grid_search.best_params_

In [27]:
# {'C': 1,
#  'break_ties': True,
#  'decision_function_shape': 'ovr',
#  'degree': 2,
#  'gamma': 'scale',
#  'kernel': 'linear',
#  'probability': True,
#  'shrinking': True}

# model = SVC()
model = SVC(
    C=1, break_ties=True, decision_function_shape="ovr", degree=2,
    gamma="scale", kernel="linear", probability=True, shrinking=True
)
model.fit(X_train, y_train)

pred = model.predict(X_test).tolist()

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.33      0.03      0.06       500
           1       0.50      0.01      0.02       244
           2       0.50      0.00      0.01       271
           3       0.41      0.03      0.05       277
           4       0.36      0.12      0.18        33
           5       0.55      0.43      0.49       702
           6       0.11      0.01      0.01       196
           7       0.37      0.90      0.53      1538
           8       0.47      0.03      0.06       241
           9       0.00      0.00      0.00        61
          10       0.40      0.06      0.11       377

    accuracy                           0.39      4440
   macro avg       0.36      0.15      0.14      4440
weighted avg       0.40      0.39      0.28      4440



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### **NAIVE BAYES**

In [28]:
model = GaussianNB()
model.fit(X_train, y_train)

pred = model.predict(X_test).tolist()

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.11      0.01      0.02       500
           1       0.09      0.13      0.10       244
           2       0.08      0.03      0.05       271
           3       0.09      0.04      0.05       277
           4       0.01      0.55      0.02        33
           5       0.12      0.01      0.02       702
           6       0.08      0.24      0.12       196
           7       0.41      0.01      0.03      1538
           8       0.07      0.07      0.07       241
           9       0.02      0.30      0.03        61
          10       0.16      0.05      0.07       377

    accuracy                           0.05      4440
   macro avg       0.11      0.13      0.05      4440
weighted avg       0.21      0.05      0.04      4440



### **STOCHASTIC GRADIENT DESCENT**

In [29]:
model = SGDClassifier()
model.fit(X_train, y_train)

pred = model.predict(X_test).tolist()

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.23      0.05      0.08       500
           1       0.33      0.02      0.04       244
           2       0.09      0.07      0.08       271
           3       0.26      0.04      0.06       277
           4       0.35      0.18      0.24        33
           5       0.49      0.51      0.50       702
           6       0.09      0.04      0.05       196
           7       0.39      0.74      0.51      1538
           8       0.17      0.09      0.11       241
           9       0.00      0.00      0.00        61
          10       0.20      0.07      0.11       377

    accuracy                           0.36      4440
   macro avg       0.24      0.16      0.16      4440
weighted avg       0.31      0.36      0.29      4440



### **K NEAREST NEIGHBOURS**

In [30]:
model = KNeighborsClassifier(algorithm="ball_tree", leaf_size=25, n_neighbors=5, p=2, weights="distance")
model.fit(X_train, y_train)

pred = model.predict(X_test).tolist()

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.15      0.13      0.14       500
           1       0.16      0.07      0.09       244
           2       0.19      0.05      0.08       271
           3       0.14      0.05      0.07       277
           4       0.30      0.09      0.14        33
           5       0.39      0.37      0.38       702
           6       0.14      0.09      0.11       196
           7       0.36      0.63      0.46      1538
           8       0.18      0.04      0.06       241
           9       0.00      0.00      0.00        61
          10       0.19      0.08      0.12       377

    accuracy                           0.31      4440
   macro avg       0.20      0.14      0.15      4440
weighted avg       0.27      0.31      0.27      4440



### **DECISION TREE**

In [31]:
model = DecisionTreeClassifier(criterion="gini", max_features=None, splitter="best")
model.fit(X_train, y_train)

pred = model.predict(X_test).tolist()

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.16      0.15      0.16       500
           1       0.10      0.08      0.09       244
           2       0.07      0.05      0.06       271
           3       0.09      0.08      0.08       277
           4       0.10      0.06      0.08        33
           5       0.37      0.39      0.38       702
           6       0.08      0.06      0.06       196
           7       0.37      0.46      0.41      1538
           8       0.14      0.10      0.11       241
           9       0.02      0.02      0.02        61
          10       0.16      0.13      0.15       377

    accuracy                           0.27      4440
   macro avg       0.15      0.14      0.14      4440
weighted avg       0.25      0.27      0.26      4440



### **RANDOM FOREST**

In [32]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

pred = model.predict(X_test).tolist()

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.23      0.09      0.13       500
           1       0.16      0.02      0.04       244
           2       0.33      0.02      0.03       271
           3       0.11      0.02      0.03       277
           4       0.44      0.12      0.19        33
           5       0.46      0.41      0.43       702
           6       0.12      0.02      0.03       196
           7       0.37      0.81      0.51      1538
           8       0.43      0.05      0.10       241
           9       0.00      0.00      0.00        61
          10       0.31      0.07      0.12       377

    accuracy                           0.37      4440
   macro avg       0.27      0.15      0.15      4440
weighted avg       0.32      0.37      0.28      4440



### **XGBOOST**

In [33]:
model = xgb.XGBClassifier(booster="gbtree", grow_policy="depthwise", learning_rate=0.1, max_depth=6,
                              sampling_method=6, tree_method="hist")
model.fit(X_train, y_train)

pred = model.predict(X_test).tolist()

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.31      0.05      0.09       500
           1       0.29      0.02      0.04       244
           2       0.00      0.00      0.00       271
           3       0.20      0.01      0.02       277
           4       0.44      0.12      0.19        33
           5       0.55      0.43      0.49       702
           6       0.23      0.02      0.03       196
           7       0.37      0.88      0.52      1538
           8       0.34      0.04      0.07       241
           9       0.00      0.00      0.00        61
          10       0.41      0.08      0.14       377

    accuracy                           0.39      4440
   macro avg       0.29      0.15      0.14      4440
weighted avg       0.35      0.39      0.29      4440



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
