# Imports

In [1]:
import pandas as pd
from constants import LABEL_COLUMN, TEXT_COLUMN, TRAINING_DATA_PATH, EMBEDDING_SIZE, ALPHABET_COLUMN, ALPHABETS, SMALL_ALPHABETS, TINY_ALPHABETS
import matplotlib.pyplot as plt

import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Data

In [2]:
df = pd.read_csv("data/train_data_with_embedding_per_column.csv")

# Replace label NaN with the string "nan" (the string "nan" is interpreted as a NaN value by pandas)
df["Label"].replace(to_replace=np.nan, value="nan", inplace=True)

# Remove label with only 1 occurence
df = df.groupby("Label").filter(lambda x: len(x) > 1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Label"].replace(to_replace=np.nan, value="nan", inplace=True)


In [3]:
print(df[["ID", "Label", "Text"]].info())

<class 'pandas.core.frame.DataFrame'>
Index: 38850 entries, 0 to 38853
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      38850 non-null  int64 
 1   Label   38850 non-null  object
 2   Text    38850 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB
None


In [4]:
def detect_alphabet(text):
    detected = ""
    
    for char in text:
        char_code = ord(char)
        
        for alphabet, (start, end) in ALPHABETS.items():
            if start <= char_code <= end:
                detected = alphabet
    
    return detected if detected else "Inconnu"

df[ALPHABET_COLUMN] = df[TEXT_COLUMN].apply(detect_alphabet)

In [5]:
df[ALPHABET_COLUMN].value_counts()

Alphabet
Latin         28848
Cyrillic       3259
Arabic         2413
Devanagari     1577
Inconnu        1247
Chinese         523
Hebrew          228
Georgian        204
Greek           200
Gujarati        100
Korean           96
Thai             90
Hiragana         50
Katakana         15
Name: count, dtype: int64

In [23]:
df[df[ALPHABET_COLUMN].isin(TINY_ALPHABETS)]["ID"].count()

np.int64(351)

In [6]:
# Separating the data into training and testing data
X_train, X_test = train_test_split(df, 
                                    test_size=0.2, 
                                    stratify=df[LABEL_COLUMN],
                                    random_state=42)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (31080, 518)
Test shape: (7770, 518)


# Training for alphabet with up to 5 different languages

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

small_classifiers = {}

for small_alphabet in SMALL_ALPHABETS:

    X_train_small = X_train[X_train[ALPHABET_COLUMN] == small_alphabet]
    X_test_small = X_test[X_test[ALPHABET_COLUMN] == small_alphabet]

    classifier = RandomForestClassifier()
    classifier.fit(X_train_small[[f"embedding_{i}" for i in range(EMBEDDING_SIZE)]], X_train_small[LABEL_COLUMN])
    
    small_classifiers[small_alphabet] = classifier


# Test

In [59]:
preds = pd.DataFrame(X_test[["ID", "Text"]], columns=["ID", "Text"])
preds["Label"] = None

labels = X_test[LABEL_COLUMN]

# Alphabet with only one language
tiny_alphabet_matching = { "Thai":"tha", "Hiragana": "jpn", "Gujarati":"guj", "Korean": "kor","Katakana": "jpn" }
for tiny_alphabet in TINY_ALPHABETS:
    mask_tiny = X_test[ALPHABET_COLUMN] == tiny_alphabet
    preds.loc[mask_tiny, "Label"] = tiny_alphabet_matching[tiny_alphabet]

mask_tiny = X_test[ALPHABET_COLUMN].isin(TINY_ALPHABETS)
print("Accuracy Tiny:", (preds.loc[mask_tiny, "Label"] == labels.loc[mask_tiny]).mean())
    
# Alphabet with up to 5 different languages
for small_alphabet in SMALL_ALPHABETS:
    classifier = small_classifiers[small_alphabet]
    mask_small = X_test[ALPHABET_COLUMN] == small_alphabet
    preds.loc[mask_small, "Label"] = classifier.predict(X_test.loc[mask_small, [f"embedding_{i}" for i in range(EMBEDDING_SIZE)]])

mask_small = X_test[ALPHABET_COLUMN].isin(SMALL_ALPHABETS)
print("Accuracy Small:", (preds.loc[mask_small, "Label"] == labels.loc[mask_small]).mean())

# Alphabet with more than 5 different languages
predictions = pd.read_csv("data/test_predictions.csv")
mask = ~X_test[ALPHABET_COLUMN].isin(TINY_ALPHABETS + SMALL_ALPHABETS)
print(preds.merge(predictions[['Text', 'Prediction']], on='Text', how='inner')["Prediction"])
preds.loc[mask, "Label"] = preds.merge(predictions[['Text', 'Prediction']], on='Text', how='inner')["Prediction"]
print((preds.loc[mask, "Label"] == labels.loc[mask]).mean())

# Accuracy
print("Accuracy:", (preds["Label"] == labels).mean())

# f1-score
from sklearn.metrics import f1_score

print("f1-score:", f1_score(preds["Label"], labels, average="weighted"))


"""
cm = confusion_matrix(preds["Label"], labels)

# Plot the confusion matrix
plt.figure(figsize=(10, 10))
sns.heatmap(cm, annot=True, fmt="d", cmap="viridis", xticklabels=classifier.classes_, yticklabels=classifier.classes_)
plt.xlabel("Prediction")
plt.ylabel("True value")
plt.title(f"Confusion matrix for the {small_alphabet} alphabet")
plt.show()
"""


Accuracy Tiny: 1.0
Accuracy Small: 0.8968253968253969
Series([], Name: Prediction, dtype: object)
0.0
Accuracy: 0.023423423423423424


ValueError: Classification metrics can't handle a mix of unknown and multiclass targets