### Import the dataset (Youtube) checkout the column names and shape

In [49]:
import pandas as pd

df_train = pd.read_csv("dataset/train_csv.csv")
df_test = pd.read_csv('dataset/test_csv.csv')

df_train.shape, df_train.shape, df_train.columns


((1471, 3),
 (1471, 3),
 Index(['sentiment', 'label', 'label_cat'], dtype='object'))

## Character level normalization
since amharic many characters with the same sound we should make the data consistent

In [50]:
import re


def normalize_char_level_missmatch(input_token):
    rep1 = re.sub('[ሃኅኃሐሓኻ]', 'ሀ', input_token)
    rep2 = re.sub('[ሑኁዅ]', 'ሁ', rep1)
    rep3 = re.sub('[ኂሒኺ]', 'ሂ', rep2)
    rep4 = re.sub('[ኌሔዄ]', 'ሄ', rep3)
    rep5 = re.sub('[ሕኅ]', 'ህ', rep4)
    rep6 = re.sub('[ኆሖኾ]', 'ሆ', rep5)
    rep7 = re.sub('[ሠ]', 'ሰ', rep6)
    rep8 = re.sub('[ሡ]', 'ሱ', rep7)
    rep9 = re.sub('[ሢ]', 'ሲ', rep8)
    rep10 = re.sub('[ሣ]', 'ሳ', rep9)
    rep11 = re.sub('[ሤ]', 'ሴ', rep10)
    rep12 = re.sub('[ሥ]', 'ስ', rep11)
    rep13 = re.sub('[ሦ]', 'ሶ', rep12)
    rep14 = re.sub('[ዓኣዐ]', 'አ', rep13)
    rep15 = re.sub('[ዑ]', 'ኡ', rep14)
    rep16 = re.sub('[ዒ]', 'ኢ', rep15)
    rep17 = re.sub('[ዔ]', 'ኤ', rep16)
    rep18 = re.sub('[ዕ]', 'እ', rep17)
    rep19 = re.sub('[ዖ]', 'ኦ', rep18)
    rep20 = re.sub('[ጸ]', 'ፀ', rep19)
    rep21 = re.sub('[ጹ]', 'ፁ', rep20)
    rep22 = re.sub('[ጺ]', 'ፂ', rep21)
    rep23 = re.sub('[ጻ]', 'ፃ', rep22)
    rep24 = re.sub('[ጼ]', 'ፄ', rep23)
    rep25 = re.sub('[ጽ]', 'ፅ', rep24)
    rep26 = re.sub('[ጾ]', 'ፆ', rep25)
    # Normalizing words with Labialized Amharic characters such as በልቱዋል or  በልቱአል to  በልቷል
    rep27 = re.sub('(ሉ[ዋአ])', 'ሏ', rep26)
    rep28 = re.sub('(ሙ[ዋአ])', 'ሟ', rep27)
    rep29 = re.sub('(ቱ[ዋአ])', 'ቷ', rep28)
    rep30 = re.sub('(ሩ[ዋአ])', 'ሯ', rep29)
    rep31 = re.sub('(ሱ[ዋአ])', 'ሷ', rep30)
    rep32 = re.sub('(ሹ[ዋአ])', 'ሿ', rep31)
    rep33 = re.sub('(ቁ[ዋአ])', 'ቋ', rep32)
    rep34 = re.sub('(ቡ[ዋአ])', 'ቧ', rep33)
    rep35 = re.sub('(ቹ[ዋአ])', 'ቿ', rep34)
    rep36 = re.sub('(ሁ[ዋአ])', 'ኋ', rep35)
    rep37 = re.sub('(ኑ[ዋአ])', 'ኗ', rep36)
    rep38 = re.sub('(ኙ[ዋአ])', 'ኟ', rep37)
    rep39 = re.sub('(ኩ[ዋአ])', 'ኳ', rep38)
    rep40 = re.sub('(ዙ[ዋአ])', 'ዟ', rep39)
    rep41 = re.sub('(ጉ[ዋአ])', 'ጓ', rep40)
    rep42 = re.sub('(ደ[ዋአ])', 'ዷ', rep41)
    rep43 = re.sub('(ጡ[ዋአ])', 'ጧ', rep42)
    rep44 = re.sub('(ጩ[ዋአ])', 'ጯ', rep43)
    rep45 = re.sub('(ጹ[ዋአ])', 'ጿ', rep44)
    rep46 = re.sub('(ፉ[ዋአ])', 'ፏ', rep45)
    rep47 = re.sub('[ቊ]', 'ቁ', rep46)  # ቁ can be written as ቊ
    rep48 = re.sub('[ኵ]', 'ኩ', rep47)  # ኩ can be also written as ኵ
    return rep48


df_test['sentiment'] = df_test['sentiment'].apply(
    lambda x: normalize_char_level_missmatch(x))
df_train['sentiment'] = df_train['sentiment'].apply(
    lambda x: normalize_char_level_missmatch(x))


## Remove unnecessary symbols and emojis

In [51]:
def clean_text(row, options):
    """Removes url, mentions, emoji and uppercase from tweets"""

    if options['remove_url']:
        row = re.sub(r"(?:\@|https?\://)\S+", "", row)

    if options['remove_mentions']:
        row = re.sub("@[A-Za-z0-9_]+", "", row)

    if options['demojify']:
        emoj = re.compile("["
                          u"\U0001F600-\U0001F64F"  # emoticons
                          u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                          u"\U0001F680-\U0001F6FF"  # transport & map symbols
                          u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                          u"\U00002500-\U00002BEF"  # chinese char
                          u"\U00002702-\U000027B0"
                          u"\U00002702-\U000027B0"
                          u"\U000024C2-\U0001F251"
                          u"\U0001f926-\U0001f937"
                          u"\U00010000-\U0010ffff"
                          u"\u2640-\u2642"
                          u"\u2600-\u2B55"
                          u"\u200d"
                          u"\u23cf"
                          u"\u23e9"
                          u"\u231a"
                          u"\ufe0f"  # dingbats
                          u"\u3030"
                          "]+", re.UNICODE)
        row = re.sub(emoj, '', row)

    return row


clean_config = {
    'remove_url': True,
    'remove_mentions': True,
    'lowercase': True,
    'demojify': True
}

df_test['sentiment'] = df_test['sentiment'].apply(
    clean_text, args=(clean_config,))
df_train['sentiment'] = df_train['sentiment'].apply(
    clean_text, args=(clean_config,))


In [52]:
df_test.tail(10)


Unnamed: 0,sentiment,label,label_cat
358,ወይኔ ደስ ሲል,negative,0
359,ዋውውውው በጣም አርፊ ነው,positive,2
360,ምን ለማስተማር እንደ ፈለጋችሁ አልገባኝም ?ምን የሚሉት አጨራረስ ነው,negative,0
361,እማየ ትሙት አናዳችሁኛ,negative,0
362,ምችት ይበላችው,neutral,1
363,ወይ ስተስጠሉ,negative,0
364,በለው ጭስ ውስጤ ነው,neutral,1
365,መጀመሪያ ላይ ደሜ ፈልቶ ነበር አሁን ግን ደስ አይልለኝ,positive,2
366,መሬት ጠብ እሚል ነገር አላየነም ያው ሙስናው ቀጥሏል ስራ አጡ ኑሮ ውድ...,negative,0
367,ይሄ ሰውዬ ደሞ ሰው አግቶ አለማጠን አያቅም እንዴ,negative,0


## Model 1 - Neural Networks

Extract the tokens from the dataset

In [53]:
import tensorflow as tf

tokenizer_train = tf.keras.preprocessing.text.Tokenizer(
    num_words=100, oov_token='<OOV>')
tokenizer_train.fit_on_texts(df_train['sentiment'])
word_index_train = tokenizer_train.word_index

tokenizer_test = tf.keras.preprocessing.text.Tokenizer(
    num_words=100, oov_token='<OOV>')
tokenizer_test.fit_on_texts(df_train['sentiment'])
word_index_test = tokenizer_test.word_index


- Represent the string in numbers
- Append 0s to the short strings to make them consistent

In [54]:
sequences_train = tokenizer_train.texts_to_sequences(df_train['sentiment'])
padded_train = tf.keras.preprocessing.sequence.pad_sequences(
    sequences_train, padding='post', maxlen=100)

sequences_test = tokenizer_test.texts_to_sequences(df_test['sentiment'])
padded_test = tf.keras.preprocessing.sequence.pad_sequences(
    sequences_test, padding='post', maxlen=100)

padded_train


array([[ 4,  6,  1, ...,  0,  0,  0],
       [ 3,  1,  1, ...,  0,  0,  0],
       [ 1,  1,  1, ...,  0,  0,  0],
       ...,
       [79, 79,  1, ...,  0,  0,  0],
       [ 1, 29,  1, ...,  0,  0,  0],
       [ 1,  1,  1, ...,  0,  0,  0]])

In [55]:
padded_train.shape, padded_test.shape


((1471, 100), (368, 100))

model creation

In [56]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1, activation='softmax')
])


In [57]:
model.compile(
    loss=tf.losses.categorical_crossentropy,
    optimizer=tf.optimizers.SGD(),
    metrics=['accuracy']
)


In [58]:
model.fit(padded_train, df_train['label_cat'], epochs=50, validation_data=(
    padded_test, df_test['label_cat']))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x28675388040>

## Model 2 - Naive Bayes

In [59]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()

nb_model.fit(padded_train, df_train['label_cat'])


In [60]:
from sklearn.metrics import accuracy_score, confusion_matrix

predictions = nb_model.predict(padded_test)
accuracy_score(df_test['label_cat'], predictions), confusion_matrix(
    df_test['label_cat'], predictions)


(0.22826086956521738,
 array([[  0, 102,   1],
        [  0,  77,   0],
        [  0, 181,   7]], dtype=int64))

## Model 3 - SVM

In [61]:
from sklearn.svm import SVC

svm_model = SVC()
svm_model.fit(padded_train, df_train['label_cat'])


In [62]:
predictions = svm_model.predict(padded_test)
accuracy_score(df_test['label_cat'], predictions), confusion_matrix(
    df_test['label_cat'], predictions)


(0.5163043478260869,
 array([[  8,   3,  92],
        [  1,   3,  73],
        [  4,   5, 179]], dtype=int64))

## Export the model for external use

In [69]:
from joblib import Parallel, delayed
import joblib

joblib.dump(svm_model, "svm_model.pkl")


['svm_model.pkl']

In [68]:
text = "የቆሸሸ ሟች ለመደበቅ እዚያ እንደቆሙ አውቃለሁ"
output = {
    0: "negative",
    1: "neutral",
    2: "positive"
}

model = joblib.load("svm_model.pkl")

text = normalize_char_level_missmatch(text)

tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts([text])
sequences = tokenizer.texts_to_sequences([text])
padded = tf.keras.preprocessing.sequence.pad_sequences(
    sequences, padding='post', maxlen=100)

output[model.predict(padded)[0]]


'positive'