# Imports

In [1]:
import pandas as pd
from constants import LABEL_COLUMN, TEXT_COLUMN, TRAINING_DATA_PATH, EMBEDDING_SIZE, ALPHABET_COLUMN, ALPHABETS, SMALL_ALPHABETS, TINY_ALPHABETS
import matplotlib.pyplot as plt

import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score

In [2]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Data

In [3]:
df = pd.read_csv("data/train_data_with_embedding_per_column.csv")

# Replace label NaN with the string "nan" (the string "nan" is interpreted as a NaN value by pandas)
df["Label"].replace(to_replace=np.nan, value="nan", inplace=True)

# Remove label with only 1 occurence
df = df.groupby("Label").filter(lambda x: len(x) > 1)

In [4]:
print(df[["ID", "Label", "Text"]].info())

<class 'pandas.core.frame.DataFrame'>
Index: 38850 entries, 0 to 38853
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      38850 non-null  int64 
 1   Label   38850 non-null  object
 2   Text    38850 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB
None


In [5]:
def detect_alphabet(text):
    detected = ""
    
    for char in text:
        char_code = ord(char)
        
        for alphabet, (start, end) in ALPHABETS.items():
            if start <= char_code <= end:
                detected = alphabet
    
    return detected if detected else "Inconnu"

df[ALPHABET_COLUMN] = df[TEXT_COLUMN].apply(detect_alphabet)

In [31]:
# Load data
data = pd.read_csv("data/train_predictions.csv")

# Train data
X_train = data[data["Type"] == "Train"]
X_train = pd.merge(X_train, df, on=['Text', 'ID', 'Usage', 'Label'], how="inner")
X_train["Label"].replace(to_replace=np.nan, value="nan", inplace=True)
# Roberta predictions
predictions_roberta = pd.read_csv("data/train_predictions.csv")

# Test predictions
#X_test = pd.merge(pd.DataFrame(test_texts, columns=["Text"]), predictions_roberta, on=["Text"], how="inner")
X_test = data[data["Type"] == "Test"]
X_test = pd.merge(X_test, df, on=['Text', 'ID', 'Usage', 'Label'], how="inner")
X_test["Label"].replace(to_replace=np.nan, value="nan", inplace=True)
X_test["PredictedLabel"].replace(to_replace=np.nan, value="nan", inplace=True)
X_test.rename(columns={"PredictedLabel": "Prediction"}, inplace=True)

# Accuracy with Roberta only
print("Accuracy with Roberta only:", (X_test["Label"] == X_test["Prediction"]).mean())

Accuracy with Roberta only: 0.8367531294360563


# Training for alphabet with up to 5 different languages

In [13]:
small_classifiers = {}

for small_alphabet in SMALL_ALPHABETS:

    X_train_small = X_train[X_train[ALPHABET_COLUMN] == small_alphabet]
    X_test_small = X_test[X_test[ALPHABET_COLUMN] == small_alphabet]

    classifier = RandomForestClassifier()
    classifier.fit(X_train_small[[f"embedding_{i}" for i in range(EMBEDDING_SIZE)]], X_train_small[LABEL_COLUMN])
    
    small_classifiers[small_alphabet] = classifier


# Test

In [33]:
# Alphabet with only one language
preds_tiny_alphabets = {}
weights_tiny_alphabets = {}
tiny_alphabet_matching = { "Thai":"tha", "Hiragana": "jpn", "Gujarati":"guj", "Korean": "kor","Katakana": "jpn" }
for tiny_alphabet in TINY_ALPHABETS:
    preds_tiny_alphabet = X_test[X_test[ALPHABET_COLUMN] == tiny_alphabet]
    preds_tiny_alphabet["Prediction"] = tiny_alphabet_matching[tiny_alphabet]
    preds_tiny_alphabets[tiny_alphabet] = preds_tiny_alphabet
    weights_tiny_alphabets[tiny_alphabet] = len(preds_tiny_alphabet)
 
accuracy_tiny = 0
for key in preds_tiny_alphabets:
    accuracy_tiny += (preds_tiny_alphabets[key]["Label"] == preds_tiny_alphabets[key]["Prediction"]).sum()
print("Accuracy Tiny:", accuracy_tiny / sum(weights_tiny_alphabets.values()))


# Alphabet with up to 5 different languages
preds_small_alphabets = {}
weights_small_alphabets = {}
for small_alphabet in SMALL_ALPHABETS:
    classifier = small_classifiers[small_alphabet]
    preds_small_alphabet = X_test[X_test[ALPHABET_COLUMN] == small_alphabet]
    preds_small_alphabet["Prediction"] = classifier.predict(preds_small_alphabet[[f"embedding_{i}" for i in range(EMBEDDING_SIZE)]])
    preds_small_alphabets[small_alphabet] = preds_small_alphabet
    weights_small_alphabets[small_alphabet] = len(preds_small_alphabet)

accuracy_small = 0
for key in preds_small_alphabets:
    accuracy_small += (preds_small_alphabets[key]["Label"] == preds_small_alphabets[key]["Prediction"]).sum()
    
print("Accuracy Small:", accuracy_small / sum(weights_small_alphabets.values()))

# Alphabet with more than 5 different languages
large_alphabets = ALPHABETS.keys() - set(SMALL_ALPHABETS + TINY_ALPHABETS)
large_alphabets.add("Inconnu")

preds_large_alphabets = {}
weights_large_alphabets = {}
for alphabet in large_alphabets:
    preds_large_alphabet = X_test[X_test[ALPHABET_COLUMN] == alphabet]
    preds_large_alphabets[alphabet] = preds_large_alphabet
    weights_large_alphabets[alphabet] = len(preds_large_alphabet)

accuracy_large = 0
for key in preds_large_alphabets:
    accuracy_large += (preds_large_alphabets[key]["Label"] == preds_large_alphabets[key]["Prediction"]).sum()
print("Accuracy Large:", accuracy_large / sum(weights_large_alphabets.values()))

# Accuracy and f1-score
from sklearn.metrics import f1_score


preds = pd.concat([preds_tiny_alphabets[key] for key in preds_tiny_alphabets] + [preds_small_alphabets[key] for key in preds_small_alphabets] + [preds_large_alphabets[key] for key in preds_large_alphabets])
print("Total accuracy:", (preds["Label"] == preds["Prediction"]).mean())
print("f1-score:", f1_score(preds["Label"], preds["Prediction"], average="weighted"))

Accuracy Tiny: 0.9864864864864865
Accuracy Small: 0.8991596638655462
Accuracy Large: 0.8331127580730545
Total accuracy: 0.8355916892502259
f1-score: 0.8350256693723388
