### **ENVIRONMENT SETUP**

In [1]:
! pip install -q emoji

[?25l[K     |█▍                              | 10 kB 14.9 MB/s eta 0:00:01[K     |██▊                             | 20 kB 18.2 MB/s eta 0:00:01[K     |████                            | 30 kB 8.8 MB/s eta 0:00:01[K     |█████▍                          | 40 kB 4.5 MB/s eta 0:00:01[K     |██████▉                         | 51 kB 5.4 MB/s eta 0:00:01[K     |████████▏                       | 61 kB 5.7 MB/s eta 0:00:01[K     |█████████▌                      | 71 kB 5.4 MB/s eta 0:00:01[K     |██████████▉                     | 81 kB 6.0 MB/s eta 0:00:01[K     |████████████▎                   | 92 kB 6.1 MB/s eta 0:00:01[K     |█████████████▋                  | 102 kB 5.8 MB/s eta 0:00:01[K     |███████████████                 | 112 kB 5.8 MB/s eta 0:00:01[K     |████████████████▎               | 122 kB 5.8 MB/s eta 0:00:01[K     |█████████████████▊              | 133 kB 5.8 MB/s eta 0:00:01[K     |███████████████████             | 143 kB 5.8 MB/s eta 0:00:01[K   

In [2]:
%cd /content/
! git clone https://github.com/srivarshan-s/understanding-emojis-in-tamil-emotion-detection.git
%cd understanding-emojis-in-tamil-emotion-detection/

/content
Cloning into 'understanding-emojis-in-tamil-emotion-detection'...
remote: Enumerating objects: 122, done.[K
remote: Counting objects: 100% (122/122), done.[K
remote: Compressing objects: 100% (79/79), done.[K
remote: Total 122 (delta 29), reused 116 (delta 28), pack-reused 0[K
Receiving objects: 100% (122/122), 1.76 MiB | 12.76 MiB/s, done.
Resolving deltas: 100% (29/29), done.
/content/understanding-emojis-in-tamil-emotion-detection


### **IMPORT LIBRARIES**

In [3]:
import numpy as np
import pandas as pd
import emoji
import re

import xgboost as xgb

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report

### **STEMMER**

In [None]:
! rm input.txt
! rm output.txt
%cd snowball-with-tamil/
! make
%cd ..

### **IMPORT DATASET**

In [5]:
header_names = ["emotion", "text"]
df_train = pd.read_csv("data/ta-emotion10-train.csv", sep="\t", names=header_names)
df_dev = pd.read_csv("data/ta-emotion10-dev.csv", sep="\t", names=header_names)
df_test = pd.read_csv("data/task_a_test.csv", sep="\t", names=header_names)

In [6]:
df = pd.concat([df_train, df_dev, df_test])

In [7]:
df.head()

Unnamed: 0,emotion,text
0,Neutral,நாளைக்கு அரிசிக்கு இந்த நிலமை வந்தா 🙂
1,Anger,மானம் கேட்ட அன்புமணி
2,Neutral,தவறு இஸ்ரேல் இருக்காது இதை நான் கூறவில்லை ஹமாஸ...
3,Joy,கொங்கு நாட்டு சிங்கம் உன்மையும் நேர்மையும் உலை...
4,Neutral,இவர் யார்? ஒவ்வொரு வார்த்தையும் முன்னுக்கு பின...


In [8]:
df.describe()

Unnamed: 0,emotion,text
count,22200,22200
unique,11,22200
top,Neutral,நாளைக்கு அரிசிக்கு இந்த நிலமை வந்தா 🙂
freq,7601,1


In [9]:
df.emotion.unique()

array(['Neutral', 'Anger', 'Joy', 'Disguist', 'Trust', 'Anticipation',
       'Ambiguous', 'Love', 'Surprise', 'Sadness', 'Fear'], dtype=object)

In [10]:
pd.value_counts(df.emotion)

Neutral         7601
Joy             3394
Ambiguous       2626
Trust           1903
Disguist        1397
Anticipation    1312
Anger           1262
Sadness         1127
Love            1060
Surprise         362
Fear             156
Name: emotion, dtype: int64

### **DATA CLEANING**

In [11]:
# Remove rows without emojis

drop_idx = []

for text, idx in zip(df.text, df.index):
    if len(emoji.distinct_emoji_list(text)) == 0:
        drop_idx.append(idx)

df.drop(df.index[drop_idx], inplace=True)

df.reset_index(inplace=True, drop=True)

In [12]:
df.describe()

Unnamed: 0,emotion,text
count,1818,1818
unique,11,1818
top,Joy,அண்ணே இங்கேயும் வந்துட்டீங்களா🤣🤣🤣😂😂
freq,585,1


In [13]:
pd.value_counts(df.emotion)

Joy             585
Neutral         401
Trust           183
Love            143
Ambiguous       139
Sadness         120
Anticipation     73
Disguist         69
Anger            59
Surprise         34
Fear             12
Name: emotion, dtype: int64

In [14]:
df.head()

Unnamed: 0,emotion,text
0,Surprise,அண்ணே இங்கேயும் வந்துட்டீங்களா🤣🤣🤣😂😂
1,Ambiguous,யாருக்கு தெரியும் பொண்ணு பார்க்க கூட குடும்பத்...
2,Ambiguous,அது என்ன 🔥பனியிடை நீக்கம் பனி நீக்கம் தான் செறி 👍
3,Neutral,தி மு க விற்க்கு எனது 7 கோடி நன்றிகள்... அந்த ...
4,Love,கணவன் அமைவதெல்லாம் இறைவன் கொடுத்த வரம் ❤️


### **DATA PREPROCESSING**

In [15]:
text = df['text']
label = df['emotion']

In [16]:
le = LabelEncoder()
label = le.fit_transform(label)

In [17]:
text = text.str.replace(r"[+/#@&*$%:]", '', regex=True)
text = text.to_numpy()

In [None]:
text = text.tolist()

! rm input.txt
! rm output.txt

for i in text:
    text_file = open("input.txt", "a")
    text_file.write(i + '\n')
    text_file.close()

In [19]:
! chmod +x snowball-with-tamil/stemwords
! ./snowball-with-tamil/stemwords -l ta -i input.txt -o output.txt

In [20]:
text_file = open("output.txt", "r")

text = []

for line in text_file:
    text.append(line.strip())

In [21]:
with open('stopwords/tamil_stopwords.txt', encoding = 'utf-8') as f:
    tamil_stopwords = f.readlines()
    for i in range(len(tamil_stopwords)):
        tamil_stopwords[i] = re.sub('\n','',tamil_stopwords[i])
stopwords = tamil_stopwords

In [22]:
# Function for removing stop words
def stopwords_remove(text):
    # Appending words which are not stop words
    text = text.split(" ")
    removed = [s for s in text if s not in stopwords]  
    return removed

text = [stopwords_remove(s) for s in text]
for i in range(len(text)):
  text[i] = (" ".join(text[i]))

text = np.array(text)

### **FEATURE EXTRACTION**

In [23]:
vectorizer = TfidfVectorizer(min_df = 5)
X = vectorizer.fit_transform(text)
X = X.toarray()
print(X.shape)

y = label
print(y.shape)

(1818, 318)
(1818,)


### **LOGISTIC REGRESSION**

In [24]:
# # Gridsearch

# parameters = {
#     "penalty": ["l1", "l2", "elasticnet", "none"],
#     "dual": [True, False],
#     "C": [1, 0.1, 0.01],
#     "fit_intercept": [True, False],
#     "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
# }

# model = LogisticRegression()

# grid_search = GridSearchCV(model, parameters, n_jobs=-1, scoring="f1_weighted")
# grid_search.fit(X, y)

# grid_search.best_params_

In [25]:
# {'C': 1,
#  'dual': False,
#  'fit_intercept': False,
#  'penalty': 'l2',
#  'solver': 'newton-cg'}

kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # model = LogisticRegression()
    model = LogisticRegression(C=1, dual=False, fit_intercept=False, 
                               penalty="l2", solver="newton-cg")
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.13      0.12      0.12       139
           1       0.10      0.03      0.05        59
           2       0.00      0.00      0.00        73
           3       0.17      0.06      0.09        69
           4       0.00      0.00      0.00        12
           5       0.38      0.63      0.48       585
           6       0.15      0.03      0.06       143
           7       0.23      0.25      0.24       401
           8       0.27      0.16      0.20       120
           9       0.00      0.00      0.00        34
          10       0.24      0.16      0.20       183

    accuracy                           0.30      1818
   macro avg       0.15      0.13      0.13      1818
weighted avg       0.25      0.30      0.26      1818



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### **SUPPORT VECTOR MACHINE**

In [27]:
# # Gridsearch

# parameters = {
#     "C": [1, 0.1, 0.01],
#     "kernel": ["linear", "poly", "rbf", "sigmoid"],
#     "degree": [2, 3, 4],
#     "gamma": ["scale", "auto"],
#     "shrinking": [True, False],
#     "probability": [True, False],
#     "decision_function_shape": ["ovo", "ovr"],
#     "break_ties": [True, False],
# }

# model = SVC()

# grid_search = GridSearchCV(model, parameters, n_jobs=-1, scoring="f1_weighted")
# grid_search.fit(X[:100], y[:100])

# grid_search.best_params_

In [28]:
# {'C': 1,
#  'break_ties': True,
#  'decision_function_shape': 'ovr',
#  'degree': 2,
#  'gamma': 'scale',
#  'kernel': 'linear',
#  'probability': True,
#  'shrinking': True}

kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # model = SVC()
    model = SVC(C=1, break_ties=True, decision_function_shape="ovr", degree=2, 
                gamma="scale", kernel="linear", probability=True, 
                shrinking=True)
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.05      0.01      0.01       139
           1       0.00      0.00      0.00        59
           2       0.00      0.00      0.00        73
           3       0.00      0.00      0.00        69
           4       0.00      0.00      0.00        12
           5       0.35      0.73      0.47       585
           6       0.25      0.01      0.03       143
           7       0.22      0.25      0.23       401
           8       0.27      0.06      0.10       120
           9       0.00      0.00      0.00        34
          10       0.29      0.13      0.18       183

    accuracy                           0.31      1818
   macro avg       0.13      0.11      0.09      1818
weighted avg       0.23      0.31      0.23      1818



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### **NAIVE BAYES**

In [29]:
kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = GaussianNB()
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.11      0.09      0.10       139
           1       0.06      0.22      0.10        59
           2       0.07      0.19      0.10        73
           3       0.08      0.16      0.11        69
           4       0.01      0.08      0.02        12
           5       0.24      0.05      0.08       585
           6       0.14      0.34      0.19       143
           7       0.19      0.07      0.10       401
           8       0.20      0.20      0.20       120
           9       0.06      0.29      0.10        34
          10       0.17      0.13      0.15       183

    accuracy                           0.12      1818
   macro avg       0.12      0.17      0.11      1818
weighted avg       0.18      0.12      0.11      1818



### **STOCHASTIC GRADIENT DESCENT**

In [33]:
# # Gridsearch

# parameters = {
#     "loss": ["hinge", "log_loss", "log", "modified_huber", "squared_hinge", 
#              "perceptron", "squared_error", "huber", "epsilon_insensitive", 
#              "squared_epsilon_insensitive"],
#     "penalty": ["l2", "l1", "elasticnet"],
#     "alpha": [0.0001, 0.00001, 0.001],
#     "l1_ratio": [0.15, 0.20, 0.10],
#     "fit_intercept": [True, False],
#     "shuffle": [True, False],
#     "n_jobs": [-1],
#     "early_stopping": [True, False],
#     "warm_start": [True, False],
# }

# model = SGDClassifier()

# grid_search = GridSearchCV(model, parameters, n_jobs=-1, scoring="f1_weighted")
# grid_search.fit(X[:100], y[:100])

# grid_search.best_params_

In [38]:
# {'alpha': 0.0001,
#  'early_stopping': False,
#  'fit_intercept': True,
#  'l1_ratio': 0.15,
#  'loss': 'perceptron',
#  'n_jobs': -1,
#  'penalty': 'l2',
#  'shuffle': True,
#  'warm_start': False}

kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # model = SGDClassifier(alpha=0.0001, early_stopping=False, fit_intercept=True, l1_ratio=0.15, loss="perceptron",
    #                       n_jobs=-1, penalty="l2", shuffle=True, warm_start=False)
    model = SGDClassifier()
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.10      0.08      0.09       139
           1       0.08      0.08      0.08        59
           2       0.05      0.03      0.04        73
           3       0.14      0.12      0.13        69
           4       0.20      0.17      0.18        12
           5       0.40      0.56      0.46       585
           6       0.06      0.03      0.04       143
           7       0.23      0.14      0.18       401
           8       0.18      0.23      0.20       120
           9       0.04      0.03      0.03        34
          10       0.23      0.24      0.23       183

    accuracy                           0.27      1818
   macro avg       0.16      0.16      0.15      1818
weighted avg       0.24      0.27      0.25      1818



### **K NEAREST NEIGHBOURS**

In [43]:
# # Gridsearch

# parameters = {
#     "n_neighbors": [3, 5, 7],
#     "weights": ["uniform", "distance"],
#     "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
#     "leaf_size": [25, 30, 35],
#     "p": [1, 2],
# }

# model = KNeighborsClassifier()

# grid_search = GridSearchCV(model, parameters, n_jobs=-1, scoring="f1_weighted")
# grid_search.fit(X[:100], y[:100])

# grid_search.best_params_

In [49]:
# {'algorithm': 'ball_tree',
#  'leaf_size': 25,
#  'n_neighbors': 5,
#  'p': 2,
#  'weights': 'distance'}

kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = KNeighborsClassifier(algorithm="ball_tree", leaf_size=25, n_neighbors=5, p=2, weights="distance")
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.10      0.33      0.15       139
           1       0.03      0.02      0.02        59
           2       0.09      0.04      0.06        73
           3       0.07      0.04      0.05        69
           4       0.00      0.00      0.00        12
           5       0.41      0.45      0.43       585
           6       0.15      0.07      0.10       143
           7       0.22      0.18      0.20       401
           8       0.19      0.08      0.11       120
           9       0.04      0.03      0.03        34
          10       0.27      0.19      0.23       183

    accuracy                           0.24      1818
   macro avg       0.14      0.13      0.13      1818
weighted avg       0.25      0.24      0.24      1818



### **DECISION TREE**

In [58]:
# # Gridsearch

# parameters = {
#     "criterion": ["gini", "entropy", "log_loss"],
#     "splitter": ["best", "random"],
#     "max_features": ["auto", "sqrt", "log2", None],
# }

# model = DecisionTreeClassifier()

# grid_search = GridSearchCV(model, parameters, n_jobs=-1, scoring="f1_weighted")
# grid_search.fit(X, y)

# grid_search.best_params_

In [59]:
# {'criterion': 'gini', 'max_features': None, 'splitter': 'best'}

kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = DecisionTreeClassifier(criterion="gini", max_features=None, splitter="best")
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.14      0.13      0.14       139
           1       0.06      0.05      0.05        59
           2       0.06      0.04      0.05        73
           3       0.07      0.06      0.06        69
           4       0.17      0.08      0.11        12
           5       0.40      0.50      0.44       585
           6       0.11      0.10      0.11       143
           7       0.25      0.24      0.25       401
           8       0.15      0.13      0.14       120
           9       0.12      0.06      0.08        34
          10       0.17      0.14      0.15       183

    accuracy                           0.26      1818
   macro avg       0.15      0.14      0.14      1818
weighted avg       0.24      0.26      0.25      1818



### **RANDOM FOREST**

In [61]:
# # Gridsearch

# parameters = {
#     "n_estimators": [100, 200, 300],
#     "criterion": ["gini", "entropy", "log_loss"],
#     "max_features": ["auto", "sqrt", "log2", None],
#     "bootstrap": [True, False],
#     "oob_score": [True, False],
#     "warm_start": [True, False],
#     "class_weight": ["balanced", "balanced_subsample", None],
# }

# model = RandomForestClassifier()

# grid_search = GridSearchCV(model, parameters, n_jobs=-1, scoring="f1_weighted")
# grid_search.fit(X[:100], y[:100])

# grid_search.best_params_

In [65]:
# {'bootstrap': True,
#  'class_weight': None,
#  'criterion': 'entropy',
#  'max_features': 'log2',
#  'n_estimators': 100,
#  'oob_score': False,
#  'warm_start': False}

kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion="entropy", max_features="log2",
    #                                n_estimators=100, oob_score=False, warm_start=False)
    model = RandomForestClassifier()
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.17      0.07      0.10       139
           1       0.05      0.02      0.02        59
           2       0.00      0.00      0.00        73
           3       0.05      0.01      0.02        69
           4       0.00      0.00      0.00        12
           5       0.37      0.63      0.46       585
           6       0.14      0.04      0.06       143
           7       0.23      0.29      0.26       401
           8       0.28      0.12      0.17       120
           9       0.00      0.00      0.00        34
          10       0.23      0.11      0.15       183

    accuracy                           0.30      1818
   macro avg       0.14      0.12      0.11      1818
weighted avg       0.24      0.30      0.25      1818



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### **XGBOOST**

In [None]:
# # Gridsearch

# parameters = {
#     "booster": ["gbtree", "gblinear", "dart"],
#     "learning_rate": [0.3, 0.1, 0.01],
#     "max_depth": [6, 10],
#     "sampling_method": ["uniform", "gradient_based"],
#     "tree_method": ["auto", "exact", "approx", "hist", "gpu_hist"],
#     "grow_policy": ["depthwise", "lossguide"],
# }

# model = xgb.XGBClassifier()

# grid_search = GridSearchCV(model, parameters, n_jobs=-1, scoring="f1_weighted")
# grid_search.fit(X[:100], y[:100])

# grid_search.best_params_

In [68]:
# {'booster': 'gbtree',
#  'grow_policy': 'depthwise',
#  'learning_rate': 0.1,
#  'max_depth': 6,
#  'sampling_method': 'uniform',
#  'tree_method': 'hist'}

kf = KFold(n_splits=5)

pred = []

for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = xgb.XGBClassifier(booster="gbtree", grow_policy="depthwise", learning_rate=0.1, max_depth=6,
                              sampling_method=6, tree_method="hist")
    model.fit(X_train, y_train)

    pred_test = model.predict(X_test).tolist()
    pred += pred_test

print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.21      0.09      0.13       139
           1       0.00      0.00      0.00        59
           2       0.00      0.00      0.00        73
           3       0.12      0.03      0.05        69
           4       0.00      0.00      0.00        12
           5       0.36      0.70      0.48       585
           6       0.14      0.05      0.07       143
           7       0.25      0.26      0.26       401
           8       0.38      0.12      0.19       120
           9       0.00      0.00      0.00        34
          10       0.27      0.14      0.18       183

    accuracy                           0.32      1818
   macro avg       0.16      0.13      0.12      1818
weighted avg       0.26      0.32      0.26      1818



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
