# This notebook loads the combined N-gram models (bi-gram, tri-gram, and quad-gram) used for language identification.
* The LID models were trained on the data from:

* Vukuzenzele
* NCHLT

In [None]:
import pickle
from collections import Counter
import math
import re
import os
import string
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

* Supporting functions

In [None]:
def create_n_grams(lang, num, k):
    """Build a character n-gram profile with log relative frequencies.

    The input is joined with single spaces, stripped of ASCII punctuation,
    lower-cased, and runs of whitespace are collapsed. Remaining spaces are
    replaced by ``_`` so that word boundaries are visible inside the n-grams.

    Args:
        lang: Iterable of strings to profile. NOTE(review): a plain ``str`` is
            iterated character by character, so ``" ".join`` puts a separator
            between every character; callers in this file pass a string --
            confirm this matches how the stored models were built.
        num: Length of each n-gram, in characters.
        k: Accepted for interface compatibility but currently unused; every
            n-gram is returned, not only the top ``k``.

    Returns:
        A tuple ``(final_grams, log_probs)`` of parallel lists ordered by
        descending log probability (ties keep first-occurrence order). Each
        log probability is ``log(count / (red * total))`` rounded to three
        decimals, where ``total`` is the number of n-grams in the text and
        ``red`` is 2 for n-grams containing ``_`` and 1 otherwise. Both lists
        are empty when the cleaned text is shorter than ``num``.
    """
    text = " ".join(lang)
    text = re.sub('[' + string.punctuation + ']', '', text)  # punctuation removed
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()  # collapse spaces, newlines and tabs
    text = text.replace(' ', '_')  # so that spaces can be visualised easily

    # Total number of n-grams in the text. Iterating over exactly this many
    # start positions includes the final n-gram, which the normaliser counts.
    sum_freq = len(text) - num + 1
    counts = Counter(text[start:start + num] for start in range(sum_freq))

    scored = {}
    for gram, count in counts.items():
        red = 2 if '_' in gram else 1  # n-grams spanning a word boundary are halved
        scored[gram] = round(math.log(count / (red * sum_freq)), 3)

    ranked = sorted(scored.items(), key=lambda item: item[1], reverse=True)
    final_grams = [gram for gram, _ in ranked]
    log_probs = [log_prob for _, log_prob in ranked]
    return final_grams, log_probs

# Calculate scores
def matching_score_2(test_grams, grams_list, n):
    """Accumulate, per language, the distance between a test profile and the stored models.

    For every n-gram of the test profile the absolute difference between its
    log probability and the model's log probability is added to the language's
    distance. When the model does not contain the n-gram, the absolute value of
    the test log probability is added instead (the penalty term).

    Relies on the module-level names ``lang_list`` (languages in the result)
    and ``n_grams`` (models, indexed as ``n_grams[n][lang]`` and holding the
    n-grams at position 0 and their log probabilities at position 1).

    Args:
        test_grams: Pair ``(grams, log_probs)`` of parallel sequences, in the
            shape returned by ``create_n_grams``.
        grams_list: Mapping whose keys are the languages to score; only its
            keys are read, the model data comes from ``n_grams``.
        n: Key into ``n_grams`` selecting bigram, trigram or quadgram models.

    Returns:
        dict mapping each language in ``lang_list`` to its accumulated
        distance; lower means closer.
    """
    dist = {lang: 0 for lang in lang_list}  # distance corresponding to each language
    grams = test_grams[0]
    if not grams:
        return dist

    # First-occurrence lookups replace the repeated list.index()/membership
    # scans, which were quadratic in the profile size.
    test_probs = {}
    for gram, log_prob in zip(grams, test_grams[1]):
        test_probs.setdefault(gram, log_prob)

    for lang in grams_list.keys():
        model = n_grams[n][lang]
        model_probs = {}
        for gram, log_prob in zip(model[0], model[1]):
            model_probs.setdefault(gram, log_prob)

        # Accumulate in test-profile order so the floating-point sum is unchanged.
        total = dist[lang]
        for gram in grams:
            own = test_probs[gram]
            if gram in model_probs:
                total += abs(model_probs[gram] - own)
            else:  # gram is not present in that language's corpus: penalty term
                total += abs(own)
        dist[lang] = total
    return dist


def language_identify_2(file_address, st):
    """Identify the language of a text using the bigram, trigram and quadgram models together.

    Relies on the module-level names ``k``, ``bi_grams``, ``tri_grams``,
    ``quad_grams``, ``scores_unstructured`` and ``scores_structured``.

    Args:
        file_address: The text itself, or a path to a text file when ``st`` is
            ``'file'``.
        st: ``'file'`` to read the text from ``file_address``; any other value
            treats ``file_address`` as the text.

    Returns:
        The language with the smallest normalised distance, or ``"Other"``
        when that distance is below 0.075 or above 0.125.

    Side effects:
        Appends every language's normalised distance to ``scores_unstructured``
        and to that language's list in ``scores_structured``.
    """
    text = file_address
    if st == 'file':
        # Undecodable characters are skipped instead of raising.
        with open(file_address, 'r', errors = 'ignore') as handle:
            text = handle.read()

    # One profile per n-gram order, then one distance table per order.
    profiles = [create_n_grams(text, order, k) for order in (2, 3, 4)]
    bi_dist = matching_score_2(profiles[0], bi_grams, 2)
    tri_dist = matching_score_2(profiles[1], tri_grams, 3)
    quad_dist = matching_score_2(profiles[2], quad_grams, 4)

    final_dist = {
        lang: bi_dist[lang] + tri_dist[lang] + quad_dist[lang]
        for lang in bi_dist
    }

    # The divisor is 1 plus the sum of all distances.
    sum_dist = 1
    for total in final_dist.values():
        sum_dist += total
    for lang in final_dist:
        final_dist[lang] /= sum_dist

    dist_list = sorted(final_dist.items(), key=lambda pair: pair[1])

    # Record the scores for later plotting.
    scores_unstructured.extend(final_dist.values())
    for lang, score in final_dist.items():
        scores_structured[lang].append(score)

    # A single acceptance band is applied to every language.
    best_lang, best_score = dist_list[0]
    out_of_band = best_score < 0.075 or best_score > 0.125
    return "Other" if out_of_band else best_lang

def save_model(model, filename):
    """
    Pickle a model to disk.

    Args:
    model: The object to serialise.
    filename (str): Path of the file to write; an existing file is overwritten.
    """
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(model)

def load_model(filename):
    """
    Read a pickled model back from disk.

    Unpickling can execute arbitrary code, so only load files from a trusted
    source.

    Args:
    filename (str): Path of the pickle file to read.

    Returns:
    The unpickled object.
    """
    with open(filename, 'rb') as source:
        return pickle.Unpickler(source).load()

def filter_sentences(text_lines, min_length=5):
    """Clean each line and keep only those with at least ``min_length`` words.

    Cleaning strips surrounding whitespace, deletes every character that is
    not an ASCII letter, a digit or whitespace, and collapses runs of spaces
    into a single space.

    Args:
        text_lines: Iterable of strings, one sentence per item.
        min_length: Minimum number of space-separated words a cleaned
            sentence needs in order to be kept.

    Returns:
        List of the cleaned sentences that passed the length check, in input
        order.
    """
    kept = []
    for raw in text_lines:
        cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', str(raw.strip()))
        tokens = [tok for tok in cleaned.split(" ") if tok != '']
        normalised = " ".join(tokens)
        # Splitting an empty string yields one empty token, so an empty
        # sentence counts as length 1 here.
        if len(normalised.split(' ')) < min_length:
            continue
        kept.append(normalised)
    return kept

def plot_structured(datapoints, fig_name):
    """Plot the per-language mean score as bars with a box plot of the scores on top.

    The mean, standard deviation and variance of every series are printed
    before plotting. The figure is saved to ``fig_name`` and then shown.

    Args:
        datapoints: Mapping from a label (a language code in this file) to a
            sequence of numeric scores.
        fig_name: Path the figure is saved to.
    """
    statistics = {
        model: {
            'mean': np.mean(data),
            'std_dev': np.std(data),
            'variance': np.var(data),
        }
        for model, data in datapoints.items()
    }
    print(statistics)

    models = list(statistics.keys())
    means = [statistics[model]['mean'] for model in models]

    # Create the axes directly at the intended size; a separate plt.figure()
    # call would only leave an empty extra figure behind.
    fig, ax = plt.subplots(figsize=(8, 6))

    # Plot the mean as bars
    x = np.arange(len(models))
    bars = ax.bar(x, means, width=0.4, label='Mean', color='lightblue')

    # Overlay the distribution of each series as a box-and-whisker plot
    data_for_boxplot = [datapoints[model] for model in models]
    boxplot = ax.boxplot(data_for_boxplot, positions=x, widths=0.3, patch_artist=True)

    # Colour every box, however many series there are
    for patch in boxplot['boxes']:
        patch.set_facecolor('lightgreen')

    # Labels, title and custom x-axis tick labels
    ax.set_xlabel('Languages')
    ax.set_ylabel('Values')
    ax.set_title('Statistics of Models')
    ax.set_xticks(x)
    ax.set_xticklabels(models)
    ax.legend()

    # Write each bar's height just above it
    for rect in bars:
        height = rect.get_height()
        ax.annotate(f'{height:.2f}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

    fig.tight_layout()
    fig.savefig(fig_name)
    plt.show()

def plot_unstructured(datapoints, fig_name):
    """Scatter the scores against their position and overlay a 2-D density estimate.

    The figure is saved to ``fig_name``; it is not shown.

    Args:
        datapoints: Sequence of numeric scores; the x coordinate of each point
            is its index in the sequence.
        fig_name: Path the figure is saved to.
    """
    plt.figure(figsize=(8, 6))
    positions = range(len(datapoints))

    # Scatter plot; the label gives the legend below an entry to show
    plt.scatter(positions, datapoints, c='gray', alpha=0.5, s=5, label='Scores')
    # KDE plot; `fill` replaces the deprecated `shade` argument
    sns.kdeplot(x=positions, y=datapoints, cmap='viridis', fill=True, bw_adjust=0.5)

    # Labels and title
    plt.xlabel('N-gram Model')
    plt.ylabel('Scores')
    plt.title('Unstructured scores from the models')

    plt.legend()
    plt.grid(True)

    plt.savefig(fig_name)

def Remove_Layer(sentence):
    """Blank out sentences made up solely of one- or two-character tokens.

    Args:
        sentence: The text to check.

    Returns:
        ``''`` when the sentence is empty, whitespace only, or every
        whitespace-separated token is at most two characters long; otherwise
        the sentence unchanged.
    """
    # Optional whitespace-separated run of tokens, each 1-2 non-space characters
    pattern = r'^\s*(\S{1,2}(\s+\S{1,2})*)?\s*$'
    only_short_tokens = re.fullmatch(pattern, sentence) is not None
    return '' if only_short_tokens else sentence
            

                             

def _score_corpus_file(file_path, lang):
    """Return ``(correct, total)``: lines of ``file_path`` identified as ``lang``, and the line count."""
    with open(file_path, "r") as corpus_file:
        lines = corpus_file.readlines()
    correct = sum(
        1 for line in lines if language_identify_2(line, 'string') == lang
    )
    return correct, len(lines)


def evaluate(path_to_data):
    """Score the language identifier on the test and dev files under ``path_to_data``.

    Expected layout: ``path_to_data/<lang>/<subfolder>/<files>``. Inside each
    subfolder a file whose name contains ``train``, ``dev`` or ``test`` (checked
    in that order) is registered as the corresponding split. Every line of the
    test and dev files is classified and compared with ``<lang>``, the name of
    the first-level folder.

    Relies on the module-level tables ``evals`` and ``dev_evals``, which map a
    language to ``[correct, total]``. ``dev_evals`` is updated in place but not
    returned.

    Args:
        path_to_data: Root folder holding one folder per language.

    Returns:
        The module-level ``evals`` table (test-split results).

    Raises:
        KeyError: If a language folder name is not a key of ``evals`` /
            ``dev_evals``.
    """
    for item in os.listdir(path_to_data):
        item_path = os.path.join(path_to_data, item)
        if not os.path.isdir(item_path):
            continue

        # The folder name is the expected language for everything below it.
        lang = item
        for sub_item in os.listdir(item_path):
            sub_item_path = os.path.join(item_path, sub_item)
            if not os.path.isdir(sub_item_path):
                continue

            # Register the train/dev/test files found in this subfolder.
            files_manager = {}
            for file_name in os.listdir(sub_item_path):
                file_path = os.path.join(sub_item_path, file_name)
                if not os.path.isfile(file_path):
                    continue
                if 'train' in file_name:
                    files_manager['train_file'] = file_path
                elif 'dev' in file_name:
                    files_manager['dev_file'] = file_path
                elif 'test' in file_name:
                    files_manager['test_file'] = file_path
                else:
                    print("This file is unknow")

            # Test split; .get() lets a subfolder without this split be skipped.
            test_file = files_manager.get('test_file')
            if test_file:
                correct, total = _score_corpus_file(test_file, lang)
                evals[lang][0] += correct
                evals[lang][1] = total

            # Dev split
            dev_file = files_manager.get('dev_file')
            if dev_file:
                correct, total = _score_corpus_file(dev_file, lang)
                dev_evals[lang][0] += correct
                dev_evals[lang][1] = total
    return evals

In [None]:
# Run configuration for the cells below; fill in the empty paths before running.
eval_data_root_path = "" # root folder of the evaluation data (intended argument for evaluate())
train_data_root_path = "" # root folder of the training data; only needed when creating new models
models_root_path = '../../../../../../../ext_data/thapelo/All-Grams-Ablation-Models/9/'  # folder containing the pickled models
# Prefix of the per-language output files. It is concatenated directly with the
# file name, so a folder path needs a trailing path separator.
bi_sorted        = "" # path to where output should be stored
n = [2,3,4]  # n-gram orders of interest: bigrams, trigrams and quadgrams
# Passed to create_n_grams as its `k` argument.
# NOTE(review): create_n_grams does not truncate its output to the top k
# n-grams, so this value currently has no effect on the scores -- confirm intent.
k = 50 # intended number of top n-grams used for calculating the distance metric

In [None]:
if __name__ == "__main__":
    # Languages the models distinguish; also the keys of every score table below.
    lang_list = ['ssw', 'eng', 'xho', 'zul', 'tsn', 'af', 'sot', 'tso', 'nso', 'nbl', 'ven']

    # Load the models from the configured folder. Unpickling can execute
    # arbitrary code, so only point models_root_path at trusted files.
    n_grams = load_model(os.path.join(models_root_path, 'all_gram_model.pkl'))
    bi_grams = load_model(os.path.join(models_root_path, 'bigram_model.pkl'))
    tri_grams = load_model(os.path.join(models_root_path, 'trigram_model.pkl'))
    # NOTE(review): quad_grams is loaded from the trigram pickle, which looks
    # like a copy-paste slip. matching_score_2 reads only the keys of this
    # mapping, so the file name is left unchanged -- confirm the intended
    # quadgram model file before changing it.
    quad_grams = load_model(os.path.join(models_root_path, 'trigram_model.pkl'))

    # [correct, total] per language, filled in by evaluate().
    evals = {lang: [0, 0] for lang in lang_list}
    dev_evals = {lang: [0, 0] for lang in lang_list}
    # evaluation_scores = evaluate(eval_data_root_path)     # calculate evaluation scores

    # Score logs appended to by language_identify_2.
    scores_unstructured = []
    scores_structured = {lang: [] for lang in lang_list}

    # Walk the corpus and write every accepted sentence to the output file of
    # the language it was identified as.
    c_root_folder = ''   # add path to data
    for root, dirs, files in os.walk(c_root_folder):
        for file_name in files:
            text_path = os.path.join(root, file_name)
            with open(text_path, 'r') as f:
                sentences = f.readlines()

            filtered_sentences = filter_sentences(sentences)

            for sentence in filtered_sentences:
                # The classifier sees the sentence without digits, lower-cased.
                text = "".join(ch for ch in sentence if not ch.isdigit()).lower()
                if len(text.split()) <= 5:
                    continue
                # Skip sentences made up only of one- or two-character tokens.
                if Remove_Layer(sentence) == '':
                    continue

                identified_language = language_identify_2(text, 'string')
                if identified_language not in lang_list:
                    continue  # "Other": no language was accepted

                # English output keeps its existing 'en' file name.
                suffix = 'en' if identified_language == 'eng' else identified_language
                out_path = bi_sorted + 'n_gram_filtered_' + suffix + '.txt'
                with open(out_path, 'a') as out_file:
                    out_file.write(sentence + '\n')
