## Importing required libraries

In [1]:
# Import pandas
import pandas as pd
import os
import re
import langid
import string
import numpy
import fitz 
import nltk
import json
import requests
from Levenshtein import distance


## Helper Functions

In [2]:
def has_numbers(inputString):
    return bool(re.search(r'\d', inputString))

In [3]:
def clean_list(list):
    new_list = []
    for word in list:
        new_word = clean_word(word)
        new_list.append(new_word)
    return new_list

In [4]:
def clean_word(word):
    word = re.sub(r'[^\w\s]', '', word)
    word = word.lower()
    
    return word

In [5]:
WORD = re.compile(r'\w+')
def tokenize(text):
    words = WORD.findall(text)
    return words

In [6]:
def check_list(list,word):
    if word in list:
        return True
    else:
        return False
    

In [7]:
# https://stackoverflow.com/a/49146722/330558
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

## Importing different csv files containing comments

In [8]:
filenames = next(os.walk('Comments/'), (None, None, []))[2]  # [] if no file
filenames

['cospicua_primary_comments.csv',
 'gwida_comments.csv',
 'ira_losco_comments.csv',
 'kelmakelma_comments.csv',
 'liqourish_comments.csv',
 'malta_dizastru_totali_comments.csv',
 'malta_traffic_comments.csv',
 'malti_madwarna_comments.csv',
 'newsbook_comments.csv',
 'one_chick_and_a_yorkie_comments.csv',
 'peristyle_comments.csv',
 'replay_comments.csv',
 'ricetti_maltin_comments.csv',
 'tvm_comments.csv',
 'xfactor_comments.csv']

In [9]:
dataframes = []
for f in filenames:
    df = pd.read_csv("Comments/"+f)
    df = df['message']
    dataframes.append(df)


In [10]:
len(dataframes)

15

## Appending each dataframe into one list

In [11]:
raw_comments = []
for df in dataframes:
    temp_comments = df.to_numpy().tolist()
    if raw_comments:
        raw_comments.extend(temp_comments)
    else:
        raw_comments = temp_comments

In [12]:
len(raw_comments)

119572

## Ensuring that each comment is not longer than 1 sentence

In [13]:
list_of_comments = []
for comment in raw_comments:
    comment = str(comment).strip()
    a_list = nltk.tokenize.sent_tokenize(comment)
    if list_of_comments:
        list_of_comments.extend(a_list)
    else:
        list_of_comments = a_list

In [14]:
len(list_of_comments)

149867

In [15]:
#list_of_comments = list_of_comments[0:20000]

## Cleaning the list of comments

In [16]:
words_to_remove = ['nan','Goodluck','Good luck','Good Luck','GoodLuck','Welldone', 'Well done','RIP','rip','proud'
                   ,'Proud','well','done','Rip','Bravu','proset','Bravi','Brava','God','Grazzi','Thank']
words_re = re.compile("|".join(words_to_remove))

cleaned_comments = []
for c in list_of_comments:
    
    c = str(c).strip()
    if words_re.search(c):        
        continue
    else:
        c = remove_emoji(c)
        cleaned_comments.append(c)

In [17]:
len(cleaned_comments)

121008

In [18]:
cleaned_comments

['bravu lyon',
 'THANKS FOR YOUR APPRECIATION.....',
 "RevCan Emmanuel Schembri grazzi lilek Father, ta' kemm tiehu hsiebna!",
 'Prosit ms falzon ezempju car ta kif tmexxi skola b determinazjoni u hegga.grazzi talli dejjem kont hemm biex tisma l kulhadd.jien hdimt mieghek zmien ilu u vera mara kif jejdu tal genn prosit',
 'Nixtieq nirringrazzja lil Mr Pulis u lil Ms Mallia  ( Year 5) tal hidma kontinwa li taw lit tfal taghna .',
 'Ser nimmisjawkom u nispera li nergu niltaqu lura jekk il bambin irid ma ndumux .',
 'Sahha lil kullhadd .',
 'grazzi ta kollox lil kullhadd ',
 'Ha nimisawkim hafna u anke tfal zgur li nigu nitawlulkhom .',
 'Fismi u fisem shabi nejdulkom grazzi tal hsieb , ghax ghal kemm ahna ckejknin imma ma gejnix minsijin ',
 'George Vella',
 'Carol Grech ',
 'Prosit !',
 '!',
 'Verygood',
 'bravu sabih',
 'My boy',
 'Rachel Brignoli',
 'Kif nista namel bix naplika pls tiji qat ma kinu imoru bil van',
 'Bernice Bellotti hemm post ohra ghal min ha japplika l-ewwel darba',


## Removing comments which are not longer than 2 characters

In [19]:
for comment in cleaned_comments:
    if len(comment)<2:
        cleaned_comments.remove(comment)

In [20]:
len(cleaned_comments)

114199

## Checking for language of comments

As the library is not very accurate and most comments are either english or maltese, comments in english are removed but any comment classified as any other language is kept

In [21]:
final_comments = []
for comment in cleaned_comments:
    lang = langid.classify(comment)
    if lang[0] != 'en':
        final_comments.append(comment)

In [22]:
len(final_comments)  
with open('maltese_comments.csv', 'w', encoding='utf-8') as f:
    for item in final_comments:
        f.write("%s, " % item)

## Open lists of Maltese words to compile our lexicon

In [23]:
data = []
for line in open('Resources/maltese_words.json', 'r',encoding='utf-8'):
    data.append(json.loads(line))
    
lexemes = []
for d in data:
    lemma = clean_word(d['lemma'])
    if len(lemma)>1:
        lexemes.append(lemma)
    if "alternatives" in d.keys():
        lexemes.append(d['alternatives'])

In [24]:
data = []
for line in open('Resources/other_maltese_words.json', 'r',encoding='utf-8'):
    data.append(json.loads(line))
    
for d in data:
    if "surface_form" in d:
        lexemes.append(d['surface_form'])
    if "plural_form" in d:
        lexemes.append(d['plural_form'])
    if "alternatives" in d:
        lexemes.append(d['alternatives'])

In [25]:
len(lexemes)

4548284

In [26]:
maltese_words = []
for word in lexemes:
    if type(word) != str:
        maltese_words.extend(word)
    elif ' ' in word:
        words = tokenize(word)
        maltese_words.extend(words)
    else:
        maltese_words.append(word)

maltese_words = list(filter(lambda a: a != '', maltese_words))

In [27]:
len(maltese_words)

4553264

In [28]:
special_characters = "_пc*"

final_words = []

for word in maltese_words:
    if not has_numbers(word) and not any(c in special_characters for c in word):
        final_words.append(word.lower())
 

In [29]:
dict_of_maltese_words = {}
for word in final_words:
    temp_list = []
    if word[0] in dict_of_maltese_words.keys():
        temp_list = dict_of_maltese_words[word[0]]
        temp_list.append(word)
        dict_of_maltese_words[word[0]] = temp_list
    else:
        temp_list.append(word)
        dict_of_maltese_words[word[0]] = temp_list

## Check which words are misspelled

In [30]:
incorrect_words = []

for sentence in final_comments:
    sent = tokenize(sentence)
    for word in sent:
        if not word.isdigit():
            word = clean_word(word)
            if not check_list(incorrect_words,word):
                if word[0] in dict_of_maltese_words.keys():
                    if not check_list(dict_of_maltese_words[word[0]],word):
                        incorrect_words.append(word)

In [31]:
incorrect_words

with open('incorrect_words.csv', 'w', encoding='utf-8') as f:
    for item in incorrect_words:
        f.write("%s, " % item)

In [32]:
len(incorrect_words)

54465

In [33]:
to_sub = {}

for word in incorrect_words:
    r = requests.get(' https://mlrs.research.um.edu.mt/resources/gabra-api/lexemes/search_suggest?s='+word)
    if(r.json()['results']):
        result = r.json()['results'][0]['lexeme']['lemma']
        to_sub[word] = result
        incorrect_words.remove(word)
  

In [34]:
to_sub

{'ezempju': 'eżempju',
 'hegga': 'ħeġġa',
 'kulhadd': 'kulħadd',
 'mieghek': 'miegħek',
 'genn': 'ġenn',
 'hidma': 'ħidma',
 'taghna': 'tagħna',
 'ha': 'ħa',
 'zgur': 'żgur',
 'ghax': 'għax',
 'ahna': 'aħna',
 'genituri': 'ġenituri',
 'pacenzja': 'paċenzja',
 'ghajnuna': 'għajnuna',
 'hadd': 'ħadd',
 'maggoranza': 'maġġoranza',
 'habba': 'ħabba',
 'messagg': 'messaġġ',
 'hin': 'ħin',
 'imbaghad': 'imbagħad',
 'herqa': 'ħerqa',
 'mizura': 'miżura',
 'gdid': 'ġdid',
 'hazin': 'ħażin',
 'hat': 'ħat',
 'pjacir': 'pjaċir',
 'sbieh': 'sbieħ',
 'xoghol': 'xogħol',
 'ghalxejn': 'għalxejn',
 'abbuz': 'abbuż',
 'strieh': 'straħ',
 'naha': 'naħa',
 'gmiel': 'ġmiel',
 'gurnata': 'ġurnata',
 'wicc': 'wiċċ',
 'taghmil': 'tagħmil',
 'ghala': 'għala',
 'hlief': 'ħlief',
 'gid': 'ġid',
 'haga': 'ħaġa',
 'dahk': 'daħk',
 'pajjiz': 'pajjiż',
 'fiducja': 'fiduċja',
 'bzonn': 'bżonn',
 'ghagla': 'għaġla',
 'ghalik': 'għalik',
 'medicina': 'mediċina',
 'wiehed': 'wieħed',
 'hara': 'ħara',
 'doza': 'doża',
 

In [None]:
len(incorrect_words)

In [9]:
def open_list(filename):
    with open(filename, "r", encoding='utf-8') as f:
        mylist = f.readlines()

    word_list = []
    for line in mylist:
        if word_list:
            word_list.extend(line.split(", "))
        else:
            word_list = line.split(", ")

    return word_list

In [11]:
maltese_words = open_list('CSV_files/maltese_words.txt')
print(len(maltese_words))

4552977


In [19]:
#https://blog.paperspace.com/implementing-levenshtein-distance-word-autocomplete-autocorrect/
def get_words(word,dataset):
    list_of_dist = {}
    
    for data in dataset: 
        dist = distance(data, word)
        if dist < 3:
            list_of_dist[data] = dist

    dict(sorted(list_of_dist.items(), key=lambda item: item[1]))
    #print(len(list_of_dist))
    
    
    return list_of_dist

In [20]:
list_of_dist = get_words('ghax',maltese_words)
list_of_dist

{'ħrax': 2,
 'għad': 2,
 'għat': 2,
 'għex': 2,
 'għam': 2,
 'grix': 2,
 'gżar': 2,
 'għaxq': 2,
 'għaxi': 2,
 'għan': 2,
 'għar': 2,
 'grad': 2,
 'mhux': 2,
 'ħdax': 2,
 'għax': 1,
 'għal': 2,
 'għatx': 2,
 'tnax': 2,
 'gwaj': 2,
 'grat': 2,
 'gahan': 2,
 'ghada': 2,
 'than': 2,
 'ghaba': 2,
 'tham': 2,
 'gwan': 2,
 'ghabex': 2,
 'nhar': 2,
 'hux': 2,
 'ghain': 2,
 'ghij': 2,
 'mgħax': 2,
 'ghar': 1,
 'tax': 2,
 'gћan': 2,
 'haxi': 2,
 'that': 2,
 'gżat': 2,
 'gżaw': 2,
 'gażax': 2,
 'gżatx': 2,
 'gżawx': 2,
 'wżax': 2,
 'għadx': 2,
 'għix': 2,
 'għamx': 2,
 'għarx': 2,
 'użax': 2,
 'nqax': 2,
 'gnaċ': 2}

## Substituting every occurence of the incorrect word with the correct one

In [None]:
common_misspelled = {
    "tiegħi" : "tijaj"
    "għax" : "ax"
    "jj" : "jekk jogħġbok"
    "andi" : "għandi"
    "tiegħi" : "tijaj"

}


string_a = re.sub(r'(cat|dog)', 'pet', "Mark owns a dog and Mary owns a cat.")

#https://stackabuse.com/replace-occurrences-of-a-substring-in-string-with-python/