In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Read the three tab-separated tweet files, one raw frame per language.
igbo_df, yoruba_df, hausa_df = [
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in ("data/igbo.tsv", "data/yoruba.tsv", "data/hausa.tsv")
]


In [None]:
def _as_labeled_text(frame, language):
    """Return a two-column frame: the tweet as ``text`` plus a constant ``language``.

    The text is read from ``tweet`` when that column exists and from ``text``
    otherwise, so re-running this cell on frames that were already converted
    does not raise ``KeyError``.

    Args:
        frame: DataFrame holding a ``tweet`` (or already-renamed ``text``) column.
        language: Label written to every row of the ``language`` column.

    Returns:
        A new DataFrame with exactly the columns ``text`` and ``language``.
    """
    source_col = "tweet" if "tweet" in frame.columns else "text"
    return (
        frame[[source_col]]
        .rename(columns={source_col: "text"})
        .assign(language=language)
    )


# Same reshaping for every language; only the label differs.
igbo_df = _as_labeled_text(igbo_df, "igbo")
yoruba_df = _as_labeled_text(yoruba_df, "yoruba")
hausa_df = _as_labeled_text(hausa_df, "hausa")


In [None]:
# Stack the per-language frames into one corpus with a fresh 0..n-1 index.
df = pd.concat(
    (igbo_df, yoruba_df, hausa_df),
    ignore_index=True,
)

# Quick look: five random rows, then the number of rows per language.
print(df.sample(5))
print(df["language"].value_counts())


In [None]:
import re
import unicodedata


def clean_text(text):
    """Normalize one raw tweet for character n-gram language identification.

    Steps: lowercase, compose to Unicode NFC, drop URLs, keep only letters
    (of any script) together with the combining marks attached to them, and
    collapse whitespace runs to single spaces.

    Letters are selected by Unicode category rather than by the fixed range
    ``À-ſ``: that range ends at U+017F, so it silently removed letters such as
    ``ẹ ọ ṣ ị ụ`` (U+1Exx) and ``ɓ ɗ ƙ`` (U+0199-U+0257), and it let the
    symbols ``×`` and ``÷`` through.

    Args:
        text: Raw tweet. Non-string values (e.g. the NaN pandas uses for a
            missing cell) are treated as empty text.

    Returns:
        The cleaned string; ``""`` when nothing remains.
    """
    if not isinstance(text, str):
        return ""

    # NFC so a base letter followed by a combining mark becomes the
    # precomposed letter where one exists, giving consistent n-grams.
    text = unicodedata.normalize("NFC", text.lower())
    text = re.sub(r"http\S+|www\S+", "", text)   # remove URLs

    kept = []
    after_letter = False
    for ch in text:
        major = unicodedata.category(ch)[0]
        if major == "L":
            kept.append(ch)
            after_letter = True
        elif major == "M" and after_letter:
            # Diacritic with no precomposed form; keep it with its letter.
            kept.append(ch)
        elif ch.isspace():
            kept.append(ch)
            after_letter = False
        else:
            # Digit, punctuation, symbol, or a mark not attached to a letter.
            after_letter = False

    return re.sub(r"\s+", " ", "".join(kept)).strip()   # remove extra spaces

# Store the normalized tweet next to the raw one.
df["clean_text"] = df["text"].map(clean_text)

# Eyeball a few random rows to confirm the cleaning looks sensible.
df.loc[:, ["text", "clean_text", "language"]].sample(5)


In [None]:
# Features and labels
X = df["clean_text"]
y = df["language"]

# TF-IDF with character n-grams
vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 5),
    min_df=2
)

X_tfidf = vectorizer.fit_transform(X)

print("TF-IDF shape:", X_tfidf.shape)


In [None]:
from sklearn.model_selection import train_test_split

# Split the cleaned texts themselves rather than a TF-IDF matrix fitted on
# every row: fitting the vectorizer before splitting lets the test tweets
# shape the vocabulary, the min_df pruning and the IDF weights (data leakage),
# which inflates the reported scores.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,  # keep the language proportions equal in both parts
)

# Refit on the training texts only, then apply that fitted vocabulary and IDF
# to the held-out texts. `vectorizer` now matches what the model is trained on.
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

print("Train size:", X_train.shape[0])
print("Test size:", X_test.shape[0])


In [None]:
# Build the classifier and fit it in one step; fit() returns the estimator.
model = LogisticRegression(max_iter=1000, n_jobs=-1).fit(X_train, y_train)

print("Model training complete.")


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the held-out split.
y_pred = model.predict(X_test)

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion counts, with rows and columns both ordered as model.classes_.
class_names = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_names)

# Draw the matrix on an explicit Axes instead of the implicit current figure.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Language Identification Confusion Matrix")
plt.show()


In [None]:
import joblib

# Persist the classifier and the vectorizer that produced its features;
# both are needed to score new text later.
joblib.dump(value=model, filename="lid_model.pkl")
joblib.dump(value=vectorizer, filename="tfidf_vectorizer.pkl")
