# Imports

In [61]:
"""Module pour analyser la distribution des données."""
import pandas as pd
import numpy as np
from constants import LABEL_COLUMN, TEXT_COLUMN, TRAINING_DATA_PATH, EMBEDDING_SIZE, ALPHABETS
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

import seaborn as sns
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split


# Représentation des données

In [74]:
# Without embeddings
df_base = pd.read_csv(TRAINING_DATA_PATH)

In [77]:
# With embeddings
df = pd.read_csv("data/train_data_with_embedding_per_column.csv")

In [78]:
# Replace label NaN with the string "nan" (the string "nan" is interpreted as a NaN value by pandas)
df["Label"].replace(to_replace=np.nan, value="nan", inplace=True)
print("NaN values:", int(df_base["Label"].isna().sum()))

NaN values: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Label"].replace(to_replace=np.nan, value="nan", inplace=True)


## Script de détection de l'alphabet

In [79]:
def detect_alphabet(text):
    detected = ""
    
    for char in text:
        char_code = ord(char)
        
        for alphabet, (start, end) in ALPHABETS.items():
            if start <= char_code <= end:
                detected = alphabet
    
    return detected if detected else "Inconnu"

df["Alphabet"] = df[TEXT_COLUMN].apply(detect_alphabet)


In [80]:
chosen_alphabet = "Latin"
latin = df[df["Alphabet"] == chosen_alphabet]
latin.head()

Unnamed: 0,ID,Usage,Text,Label,Embedding,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,...,embedding_503,embedding_504,embedding_505,embedding_506,embedding_507,embedding_508,embedding_509,embedding_510,embedding_511,Alphabet
0,136,Public,Finalment Atena le recibe en l'acropoli d'Ate...,arg,[ 0.00498647 -0.0467922 0.10376293 -0.003088...,0.004986,-0.046792,0.103763,-0.003088,-0.017613,...,0.033562,-0.002397,0.02715,-0.066053,0.065953,0.013858,0.037078,0.055359,-0.04198,Latin
1,62,Public,Jane Laffort fille de Joseph Laffort et d' Ang...,lat,[-1.12566622e-02 1.33811096e-02 -6.55803457e-...,-0.011257,0.013381,-0.06558,-0.016295,-0.04378,...,-0.054325,0.000371,0.02689,-0.040526,-0.012469,-0.022179,0.03359,-0.026046,-0.041936,Latin
3,40,Public,Mɛniɛ nkùɔ dìì mɔ̀nnì bɛnìtìbɛ̀ kɛ́deè kɛ̀ Nɔ...,tbz,[ 2.71539483e-02 4.05863933e-02 2.17084046e-...,0.027154,0.040586,0.021708,0.018382,-0.058518,...,0.034668,0.044174,-0.000523,-0.022192,0.024736,0.000753,-0.053208,-0.03732,-0.004929,Latin
4,30,Public,Ka go dirisa thekniki yeo ya phetogonepiso Le...,tsn,[ 1.13707362e-02 2.42560823e-02 3.01246773e-...,0.011371,0.024256,0.030125,-0.009485,-0.017779,...,0.009352,0.018744,-0.030698,-0.000581,0.006189,0.000829,-0.018835,0.008916,-0.000244,Latin
5,136,Public,Mashahidi walisema kwamba waliona Dan karibu f...,swc,[-0.0152249 0.0260192 -0.02207349 -0.007869...,-0.015225,0.026019,-0.022073,-0.007869,-0.073699,...,0.026933,-0.014413,0.033498,0.034517,0.010144,0.040289,0.011391,-0.015848,-0.019345,Latin


# Multinomial Naïve Bayes algorithm

Embeddings: CountVectorizer()
- Tokenization: done by splitting on whitespace and removing punctuation.
- Vocabulary: Each unique token is assigned a unique integer index.
- Encoding: Each document/sentance is represented as a vector of token counts. The length of the vector is equal to the size of the vocabulary.

Bayes algorithm
- Bayes' theorem: posterior probability $P(C|X) = \frac{P(X|C) \cdot P(C)}{P(X)}$ with $C$ the target class/label and $X$ a token.
- Training Phase: Compute the prior probability $P(C)$ for each class $C$ (i.e the frequency of each class in the training data). Then, compute the likelihood $P(X|C)$ for each feature (token) given each class (i.e the frequency of each word in documents of class $C$ normalized by the total number of words in class $C$.
- Prediction Phase: For a given document/sentance, compute the posterior probability for each class using Bayes' theorem. Assign the document to the class with the highest posterior probability.

In [81]:
x = np.array(latin["Text"])
y = np.array(latin["Label"])

cv = CountVectorizer()
X = cv.fit_transform(x)

print("Vocabulary size:", len(cv.vocabulary_))


Vocabulary size: 231240


## Split dataset

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("Train labels shape:", y_train.shape)
print("Test labels shape:", y_test.shape)


Train shape: (23081, 231240)
Test shape: (5771, 231240)
Train labels shape: (23081,)
Test labels shape: (5771,)


## Training

In [83]:
model = MultinomialNB()
model.fit(X_train, y_train)

## Test

In [84]:
print("Accuracy:", model.score(X_test, y_test))

for i in range(10):
    print("Predicted:", model.predict(X_test[i])[0], "; Actual:", y_test[i], "; Text:", x[i])

Accuracy: 0.7383469069485358
Predicted: sun ; Actual: sun ; Text: Finalment  Atena le recibe en l'acropoli d'Atenas y organiza un chudicio formal d'o caso debant d'o Areopago  un tribunal formau por dotze chueces aticos.
Predicted: ksh ; Actual: ksh ; Text: Jane Laffort fille de Joseph Laffort et d' Angelique Gogiash est nil le 2 Fevrier 1887 fiel baptiste le 13 Febier 1887. Parrain-Pierre Gogioth  Marraine-Sussime Neveu. sign J. F. Chambers. s.j.
Predicted: hyw ; Actual: hyw ; Text: Mɛniɛ nkùɔ dìì mɔ̀nnì bɛnìtìbɛ̀ kɛ́deè  kɛ̀ Nɔwee mmɔkɛ yɛbie nsikɔusìtɑ̃ɑ̃ti nɛ̀ sipísìnùmmù (350) ndi 
Predicted: hmo ; Actual: hmo ; Text: Ka go dirisa thekniki yeo ya phetogonepiso  Lentsoane o rata go bontsha
Predicted: smo ; Actual: smo ; Text: Mashahidi walisema kwamba waliona Dan karibu fasi yote ndani ya muji.
Predicted: pcd ; Actual: pcd ; Text: Kel propi karu tradu di lugar di asidenti aprosimadamenti 12:00 GMT na mésmu dia.
Predicted: ndo ; Actual: ndo ; Text: 60. (es) Texto d'o Real Decreto de

# Other model

In [None]:
embeddings = latin[[f"embedding_{i}" for i in range(EMBEDDING_SIZE)]]
labels = latin[LABEL_COLUMN]
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, 
                                                    test_size=0.2, 
                                                    random_state=42)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("Train labels shape:", y_train.shape)
print("Test labels shape:", y_test.shape)
