# Importing required libraries

In [1]:
import pandas as pd
import numpy as np
import re
from numpy import array
from tensorflow.keras.utils import to_categorical
import string
from re import sub
from sklearn.model_selection import train_test_split
import tensorflow as tf

import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding
from keras.layers import Dropout
import keras.optimizers

# Defining helper functions

In [2]:
def check_list(list, word):
    if word in list:
        return True
    else:
        return False

In [3]:
def open_list(filename):
    with open(filename, "r", encoding='utf-8') as f:
        mylist = f.readlines()

    word_list = []
    for line in mylist:
        if word_list:
            word_list.extend(line.split(", "))
        else:
            word_list = line.split(", ")

    return word_list

In [4]:
def clean_text(text):
    out = text.translate(str.maketrans('','',sub('\-', '', string.punctuation)))
    lowered = out.lower()
    return lowered.strip()


In [5]:
def generate_vector(list,vocabulary):
    bag_of_words = []
    for sentence_tokens in list:
        sent_vec = []
        for token in vocabulary:
            if token in sentence_tokens:
                sent_vec.append(1)
            else:
                sent_vec.append(0)
        bag_of_words.append(sent_vec)
    return np.asarray(bag_of_words)

In [6]:
def clean_tokens(list,vocab,expected_len):
    clean_list = []
    for sentence in list:
        cleaned_tokens = [token if token in vocab else '<UNK>' for token in sentence]
        padded_tokens = cleaned_tokens + ['<PAD>']*(expected_len - len(cleaned_tokens))
        clean_list.append(padded_tokens)
    
    return clean_list

def indexed_tok_seq(list,vocab):
    return [[vocab.index(token) for token in sent] for sent in list]

In [7]:
def max_tokens(list):
    total_tokens_in_text = []
    for sent in list:
        num_of_tokens = 0
        for token in sent:
            num_of_tokens+=1
        total_tokens_in_text.append(num_of_tokens)
    return total_tokens_in_text

In [8]:
def generate_frequency(list):
    freq_table = {}
    for sentence in list:
        for token in sentence:
            if token in freq_table:
                freq_table[token] += 1
            else:
                freq_table[token] = 1
    return freq_table

In [9]:
def generate_vocab(dict):
    vocabulary = ['<UNK>','<PAD>']
    for key in dict:
        if dict[key]>4:
            vocabulary.append(key)
    return vocabulary

In [10]:
def print_class_pred(predictions):
    for pred in predictions:
        if pred[0]>pred[1]:
            print('Formal Maltese')
        elif pred[0]<pred[1]:
            print('Informal Maltese')


In [11]:
def get_class_pred(predictions):
    for pred in predictions:
        if pred[0]>pred[1]:
            return 'Formal Maltese'
        elif pred[0]<pred[1]:
            return 'Informal Maltese'


# Reading Dataset

In [12]:
df_formal = pd.read_csv('Datasets/Annotated Comments Full.csv')
formal_text = df_formal['message'].tolist()
annotation_1 = df_formal['annotation1'].tolist()
annotation_2 = df_formal['annotation2'].tolist()
annotation_3 = df_formal['annotation3'].tolist()
index = df_formal['index'].tolist()

df_informal = pd.read_csv('Output Files/database.csv')
informal = df_informal['message'].tolist()[0:len(formal_text)]

informal = []
formal = []

for (text, anno_1,anno_2,anno_3,i) in zip(formal_text,annotation_1,annotation_2, annotation_3,index):
    if anno_1 != 'TO-DELETE' and anno_2 != 'TO-DELETE' and anno_3 != 'TO-DELETE' and anno_1 != 'TO-REMOVE':
        formal.append(text)
    elif anno_1 != 'TO-REMOVE':
        informal.append(text)
    
    if i == 4836:
        break
        
comments = []
labels = []

for text in formal:
    if type(text) == str:
        comments.append(text)
        labels.append('Formal Maltese')
    

for text in informal:
    if type(text) == str:
        comments.append(text)
        labels.append('Informal Maltese')
    

# Pre-Processing

In [13]:
x,y = np.asarray(comments),np.asarray(labels)
label_map = {cat:index for index,cat in enumerate(np.unique(y))}
Y = np.asarray([label_map[l] for l in y])

X = [clean_text(t).split(' ') for t in x]
X[:4]

[['u', 'ddaħkux', 'mela', 'ħsibtuhom', 'ċwieċ', 'lin-nies', 'jew'],
 ['qed', 'tara', 'fejn', 'ħa', 'tpoġġi', 'erba', 'pjanti'],
 ['minjaf', 'kemm', 'hemm', 'eżempji', 'oħra'],
 ['bla', 'rispett', 'flus', 'biss', 'jaraw', 'tal-mistħija']]

# Generating vocabulary from correct words

In [14]:
frequency = generate_frequency(X)
vocab = generate_vocab(frequency)

print('Length of vocabulary: ',len(vocab))

Length of vocabulary:  881


In [15]:
vocab = list(vocab)
vocab.append('<PAD>')
vocab.append('<UNK>')

max_tok = max(max_tokens(X))
X_cleaned_tokens = clean_tokens(X,vocab,max_tok)
indexed_X = indexed_tok_seq(X_cleaned_tokens,vocab)


In [16]:
# Train Test Split Function
X_train, X_test, Y_train, Y_test = train_test_split(indexed_X,Y,shuffle=42,test_size=0.10,random_state=15)

In [17]:
Y_train = to_categorical(Y_train, 2)
Y_test = to_categorical(Y_test, 2)

In [18]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=len(vocab),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(2,activation="sigmoid")
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [19]:
history = model.fit(array(X_train),array(Y_train), epochs=20,validation_split=0.1,verbose = 2)

Epoch 1/20
111/111 - 10s - loss: 0.6164 - accuracy: 0.7747 - val_loss: 0.5053 - val_accuracy: 0.8117 - 10s/epoch - 92ms/step
Epoch 2/20
111/111 - 4s - loss: 0.5294 - accuracy: 0.7803 - val_loss: 0.4822 - val_accuracy: 0.8117 - 4s/epoch - 37ms/step
Epoch 3/20
111/111 - 4s - loss: 0.5200 - accuracy: 0.7803 - val_loss: 0.4888 - val_accuracy: 0.8117 - 4s/epoch - 39ms/step
Epoch 4/20
111/111 - 4s - loss: 0.5083 - accuracy: 0.7803 - val_loss: 0.4596 - val_accuracy: 0.8117 - 4s/epoch - 38ms/step
Epoch 5/20
111/111 - 4s - loss: 0.4789 - accuracy: 0.7803 - val_loss: 0.4210 - val_accuracy: 0.8117 - 4s/epoch - 38ms/step
Epoch 6/20
111/111 - 4s - loss: 0.4452 - accuracy: 0.7826 - val_loss: 0.3956 - val_accuracy: 0.8168 - 4s/epoch - 38ms/step
Epoch 7/20
111/111 - 4s - loss: 0.4169 - accuracy: 0.7920 - val_loss: 0.3759 - val_accuracy: 0.8219 - 4s/epoch - 38ms/step
Epoch 8/20
111/111 - 4s - loss: 0.3911 - accuracy: 0.8084 - val_loss: 0.3573 - val_accuracy: 0.8575 - 4s/epoch - 38ms/step
Epoch 9/20
111

In [20]:
test_loss, test_acc = model.evaluate(array(X_test),array(Y_test))

print('Test Loss:', test_loss)
print('Test Accuracy:', round(test_acc*100,2))

Test Loss: 0.34470707178115845
Test Accuracy: 86.24


In [21]:
sample_text = 'Daniel muscat  passed'
text = clean_text(sample_text)
text = text.split(' ')


cleaned_tokens = [token if token in vocab else '<UNK>' for token in text]
padded_tokens = cleaned_tokens + ['<PAD>']*(max_tok - len(cleaned_tokens))

index_pred = [vocab.index(token) for token in padded_tokens]

In [22]:
predictions = model.predict([index_pred])
print_class_pred(predictions)

Informal Maltese


In [23]:
predictions

array([[0.04096895, 0.95714784]], dtype=float32)

# Generating Predictions

In [25]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=len(vocab),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(2,activation="sigmoid")
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

Y_ = to_categorical(Y, 2)
history = model.fit(array(indexed_X),array(Y_), epochs=20,verbose = 2)

Epoch 1/20
137/137 - 9s - loss: 0.5876 - accuracy: 0.7836 - 9s/epoch - 67ms/step
Epoch 2/20
137/137 - 5s - loss: 0.5179 - accuracy: 0.7836 - 5s/epoch - 36ms/step
Epoch 3/20
137/137 - 5s - loss: 0.5091 - accuracy: 0.7836 - 5s/epoch - 36ms/step
Epoch 4/20
137/137 - 5s - loss: 0.4881 - accuracy: 0.7836 - 5s/epoch - 36ms/step
Epoch 5/20
137/137 - 5s - loss: 0.4469 - accuracy: 0.7877 - 5s/epoch - 36ms/step
Epoch 6/20
137/137 - 5s - loss: 0.4120 - accuracy: 0.8077 - 5s/epoch - 36ms/step
Epoch 7/20
137/137 - 5s - loss: 0.3761 - accuracy: 0.8416 - 5s/epoch - 35ms/step
Epoch 8/20
137/137 - 5s - loss: 0.3388 - accuracy: 0.8671 - 5s/epoch - 35ms/step
Epoch 9/20
137/137 - 5s - loss: 0.3139 - accuracy: 0.8761 - 5s/epoch - 35ms/step
Epoch 10/20
137/137 - 5s - loss: 0.2883 - accuracy: 0.8875 - 5s/epoch - 35ms/step
Epoch 11/20
137/137 - 5s - loss: 0.2701 - accuracy: 0.8910 - 5s/epoch - 35ms/step
Epoch 12/20
137/137 - 5s - loss: 0.2504 - accuracy: 0.8997 - 5s/epoch - 35ms/step
Epoch 13/20
137/137 - 5s 

In [42]:
df = pd.read_csv('Datasets/Annotated Comments Full.csv')
comments = df['message']
index = df['index']
annotation_1 = df['annotation1'].tolist()
annotation_2 = df['annotation2'].tolist()
annotation_3 = df['annotation3'].tolist()

In [43]:
for (c, anno_1,anno_2,anno_3,i) in zip(comments,annotation_1,annotation_2, annotation_3,index):
    if type(anno_1) == float and type(anno_2) == float and type(anno_3) == float:
        text = clean_text(c)
        text = text.split(' ')

        cleaned_tokens = [token if token in vocab else '<UNK>' for token in text]
        padded_tokens = cleaned_tokens + ['<PAD>']*(max_tok - len(cleaned_tokens))

        index_pred = [vocab.index(token) for token in padded_tokens]
        predictions = model.predict([index_pred])
        class_ = get_class_pred(predictions)
        if class_ == 'Informal Maltese':
            col1_index = (df.index==i)
            df.loc[col1_index, 'annotation1'] ='TO-REMOVE' 
            
    elif anno_1 == 'TO-REMOVE':
        text = clean_text(c)
        text = text.split(' ')

        cleaned_tokens = [token if token in vocab else '<UNK>' for token in text]
        padded_tokens = cleaned_tokens + ['<PAD>']*(max_tok - len(cleaned_tokens))

        index_pred = [vocab.index(token) for token in padded_tokens]
        predictions = model.predict([index_pred])
        class_ = get_class_pred(predictions)
        if class_ == 'Informal Maltese':
            col1_index = (df.index==i)
            df.loc[col1_index, 'annotation1'] ='TO-REMOVE'
        else:
            col1_index = (df.index==i)
            df.loc[col1_index, 'annotation1'] =''
   

In [44]:
len(df)

84109

In [45]:
df.to_csv ('Output Files\Maltese Comments v2.csv', index = False, header=True)

# Save as SQL

In [51]:
df = pd.read_csv('Output Files\Maltese Comments.csv')

In [52]:
comments = []
for index, row in df.iterrows():
    if row['annotation1'] != 'TO-REMOVE':
        if row['annotation1'] != 'TO-DELETE' and row['annotation2'] != 'TO-DELETE' and row['annotation3'] != 'TO-DELETE':
            comments.append(row['message'])
            
len(comments)

44580

In [54]:
diff_words = []

for c in comments:
    if type(c) == str:
        sent = clean_text(c).split(' ')
        for word in sent:
            if not word in diff_words:
                diff_words.append(word)
                
len(diff_words)

44842

In [57]:
maltese_words = open_list('Output Files/maltese_words.txt')
print('Number of final comments ',len(maltese_words))

final_comments = open_list('Output Files/final_comments.txt')
print('Number of final comments ',len(final_comments))

Number of final comments  4552977
Number of final comments  82926


In [58]:
print('Placing maltese words in a dictionary...')
dict_of_maltese_words = {}
for word in maltese_words:
    temp_list = []
    if len(word)>1:
        if word[0] in dict_of_maltese_words.keys():
            temp_list = dict_of_maltese_words[word[0]]
            temp_list.append(word)
            dict_of_maltese_words[word[0]] = temp_list
        else:
            temp_list.append(word)
            dict_of_maltese_words[word[0]] = temp_list

Placing maltese words in a dictionary...


In [62]:
print('Checking comments for any misspelled word...')

incorrect_words = []

for word in diff_words:
    if not word.isdigit():
        if len(word) >0 :
            if not check_list(incorrect_words,word):
                if word[0] in dict_of_maltese_words.keys():
                    if not check_list(dict_of_maltese_words[word[0]],word):
                        incorrect_words.append(word)


Checking comments for any misspelled word...


In [63]:
len(incorrect_words)

35359

In [64]:
incorrect_words

['u',
 'ddaħkux',
 'lin-nies',
 'erba',
 'minjaf',
 'tal-mistħija',
 'jiena',
 'miżmum',
 'lill-anzjani',
 'irridu',
 'ikisser',
 'l-ħanut',
 'fil-kbira',
 'berbgħin',
 'kilo',
 'użgur',
 'jitqażżes',
 'insibha',
 'il-vaċċin',
 'd-dar',
 'nħobhomx',
 'il-qtates',
 'imma',
 'naħqarhomx',
 'il-mewt',
 'tal-ewwel',
 'xser',
 'tas-sinjur',
 'umorist',
 's-sena',
 'bil-biki',
 'lill-familti',
 'bil-bagalji',
 'l-malti',
 'il-barrani',
 'l-opportunisti',
 'kollħa',
 'mal-partit',
 'jdaħħlukx',
 'jeħtieġ',
 'anka',
 'imutu',
 'gawdih',
 'iż-żmien',
 'it-tfal',
 'irid',
 'ibenġilha',
 'l-poplu',
 'razzist',
 'lill-familja',
 'inhu',
 'daharha',
 't-telfa',
 'jippreparaw',
 'tal-elezzjoni',
 'jaħasra',
 'bil-mentalita',
 'in-nies',
 'lill-vittma',
 'nwaqqaw',
 'id-dubju',
 'fuqha',
 'ħdejh',
 'tat-toqba',
 'teatrini',
 'bill-kliem',
 'iċemplu',
 'minix',
 's-siġar',
 'pero',
 'ikunu',
 'jitneħħew',
 'billl-covid',
 'fgati',
 'bill-maskla',
 'nifshom',
 't-tfal',
 'xqaltlu',
 'l-komodina',
 'lil