# Quantum-Enhanced Bigram Language Identifier

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q qiskit qiskit-aer scipy pandas numpy faker
print("All packages installed.")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.0/8.0 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m99.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m77.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hAll packages installed.


In [3]:
import pandas as pd
import numpy as np
import re
import math
import time
from collections import defaultdict
from typing import List, Tuple
import logging

# Required for the Novel Training Approach
from qiskit import QuantumCircuit
from qiskit.circuit import ParameterVector
from qiskit.quantum_info import Statevector
import scipy.optimize as optimize
from faker import Faker

# Required for the Evaluation Approach
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report , accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

Load Dataset and Build Language Models

Loading the bigram CSV and creating both word-bigram and character-bigram models. Character bigrams are crucial for discriminating languages like Twi that have unique orthographic patterns.

In [4]:
# 1. Load Data
df = pd.read_csv("/content/drive/MyDrive/bigrams.csv")

# Feature Extraction Using Word and Character Bigrams
This code builds word-level and character-level bigram language models for Twi, English, and French.
It uses smoothed log-probabilities to score how likely a sentence belongs to each language.
Character bigrams help handle short sentences and unseen words.
Each sentence is converted into a compact numerical feature vector.
These features are later used for fast and accurate language identification.

In [5]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class FeatureExtractor:
    def __init__(self, alpha: float = 0.1):
        self.alpha = alpha
        self.languages = ["twi", "eng", "fra"]
        self.word_models = {}
        self.char_models = {}

    def build_models(self, df: pd.DataFrame):
        logger.info("Building word & character bigram models...")

        for lang_id, lang in enumerate(self.languages, start=1):
            lang_df = df[df["lang_id"] == lang_id]

            word_counts = dict(zip(lang_df["ngram"], lang_df["count"]))
            total_words = sum(word_counts.values())
            vocab_words = len(word_counts)
            self.word_models[lang] = (word_counts, total_words, vocab_words)

            char_counts = defaultdict(int)
            for bg, c in word_counts.items():
                for w in bg.split():
                    w = "^" + w.lower() + "$"
                    for i in range(len(w) - 1):
                        char_counts[w[i:i+2]] += c

            self.char_models[lang] = (
                dict(char_counts),
                sum(char_counts.values()),
                len(char_counts)
            )

        logger.info("Models built successfully.")

    def _word_score(self, text: str, lang: str) -> float:
        model, total, vocab = self.word_models[lang]
        words = re.findall(r"\w+", text.lower())
        if len(words) < 2:
            return -20.0

        score = 0.0
        for i in range(len(words) - 1):
            bg = f"{words[i]} {words[i+1]}"
            score += math.log((model.get(bg, 0) + self.alpha) /
                              (total + self.alpha * vocab))
        return score / (len(words) - 1)

    def _char_score(self, text: str, lang: str) -> float:
        model, total, vocab = self.char_models[lang]
        words = re.findall(r"\w+", text.lower())
        if not words:
            return -20.0

        text = "^" + "^".join(words) + "$"
        score = 0.0
        for i in range(len(text) - 1):
            bg = text[i:i+2]
            score += math.log((model.get(bg, 0) + self.alpha) /
                              (total + self.alpha * vocab))
        return score / (len(text) - 1)

    def extract(self, text: str) -> np.ndarray:
        feats = []
        for lang in self.languages:
            feats.append(self._word_score(text, lang))
        for lang in self.languages:
            feats.append(self._char_score(text, lang))
        return np.array(feats)

# Quantum Weight Optimization Using Variational Circuits

This class uses a variational quantum circuit to learn optimal weights for classical language features.
Each circuit layer applies parameterized rotation gates followed by entangling gates.
Quantum measurement probabilities are mapped to feature weights and normalized.
These weights are trained using a classical optimizer to minimize cross-entropy loss.
The learned weights are later used for fast, classical language identification inference.

In [6]:

class QuantumWeightOptimizer:
    def __init__(self, num_qubits=4, layers=4):
        self.num_qubits = num_qubits
        self.layers = layers
        # Fix: Initialize ParameterVector with num_qubits * layers * 3 parameters
        # as each qubit gets RX, RY, RZ per layer.
        self.params = ParameterVector("θ", num_qubits * layers * 3)
        self.circuit = self._build_circuit()

    def _build_circuit(self):
        qc = QuantumCircuit(self.num_qubits)
        p = 0
        for _ in range(self.layers):
            for q in range(self.num_qubits):
                qc.rx(self.params[p], q); p += 1
                qc.ry(self.params[p], q); p += 1
                qc.rz(self.params[p], q); p += 1
            for q in range(self.num_qubits - 1):
                qc.cx(q, q + 1)
        return qc

    def get_weights(self, param_values):
        state = Statevector.from_label("0" * self.num_qubits)
        state = state.evolve(
            self.circuit.assign_parameters(dict(zip(self.params, param_values)))
        )
        probs = state.probabilities()
        weights = probs[:6]
        return weights / np.sum(weights)

    def train(self, X: np.ndarray, y: np.ndarray):
        logger.info("Training quantum weight optimizer...")

        def loss_fn(params):
            weights = self.get_weights(params)

            # Corrected logic for logits calculation
            weighted_feats = X * weights  # Element-wise multiplication for (num_samples, 6)
            # Reshape (num_samples, 6) -> (num_samples, 2, 3) and sum over the 2nd axis to get (num_samples, 3)
            logits = weighted_feats.reshape(X.shape[0], 2, 3).sum(axis=1)

            exp = np.exp(logits - logits.max(axis=1, keepdims=True))
            probs = exp / exp.sum(axis=1, keepdims=True)
            return -np.mean(np.log(probs[np.arange(len(y)), y]))

        init = np.random.uniform(-np.pi, np.pi, len(self.params))
        result = optimize.minimize(
            loss_fn, init, method="COBYLA", options={"maxiter": 120}
        )

        self.best_params = result.x
        self.weights = self.get_weights(self.best_params)
        logger.info(f"Quantum training complete. Weights: {self.weights}")

# Hybrid Quantum–Classical Language Prediction

This class performs fast language prediction using quantum-learned feature weights.
Each input sentence is converted into word and character bigram features.
The features are weighted using parameters learned by the quantum optimizer.
Scores are aggregated per language to produce final predictions.
This design enables real-time inference while preserving quantum optimization benefits.

In [7]:
class HybridLanguageIdentifier:
    def __init__(self, extractor: FeatureExtractor, weights: np.ndarray):
        self.extractor = extractor
        self.weights = weights
        self.languages = ["twi", "eng", "fra"]

    def predict(self, text: str) -> str:
        feats = self.extractor.extract(text)
        scores = feats * self.weights
        scores = scores.reshape(2, 3).sum(axis=0)
        return self.languages[np.argmax(scores)]

    def predict_batch(self, texts: List[str]) -> List[str]:
        return [self.predict(t) for t in texts]

In [8]:
def main():
    # 2. Build Models
    extractor = FeatureExtractor(alpha=0.1)
    extractor.build_models(df)

    # 3. Split Data: 80% Training, 20% Testing (on the actual dataset)
    train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["lang_id"], random_state=42)
    X_train_raw = np.vstack([extractor.extract(t) for t in train_df["ngram"]])
    y_train_raw = train_df["lang_id"].values - 1

    # 4. Train Quantum Optimizer
    quantum = QuantumWeightOptimizer(num_qubits=4, layers=4)
    quantum.train(X_train_raw, y_train_raw)

    # Initialize the Hybrid model
    model = HybridLanguageIdentifier(extractor, quantum.weights)

    # --- ACCURACY 1: TESTING ON THE DATASET (CSV) ---
    logger.info("Evaluating on CSV Test Set...")

    # Removed 'random_state=42' as predict_batch does not accept it
    dataset_preds = model.predict_batch(test_df["ngram"].tolist())

    # Convert lang_id (1,2,3) to names ("twi", "eng", "fra") for comparison
    id_to_lang = {1: "twi", 2: "eng", 3: "fra"}
    dataset_true = [id_to_lang[i] for i in test_df["lang_id"]]

    acc_dataset = accuracy_score(dataset_true, dataset_preds)

    # --- ACCURACY 2: TESTING ON GENERATED SENTENCES ---
    logger.info("Evaluating on Generated Sentences...")

    # Initialize generators for different locales
    fake_en = Faker('en_US')
    fake_fr = Faker('fr_FR')
    # Faker doesn't support Twi well, so we use a larger word bank for it
    twi_words = ["medaase", "paa", "kyere", "obi", "nsuo", "akwaaba", "ɛyɛ", "onipa"]

    test_bigrams = []
    gen_true_labels = []
    target_langs = ["twi", "eng", "fra"]
    for _ in range(100000):
        lang_choice = np.random.choice(target_langs)

        if lang_choice == "eng":
            # Generate a random 4-word phrase
            bigram = fake_en.sentence(nb_words=4).replace(".", "")
        elif lang_choice == "fra":
            bigram = fake_fr.sentence(nb_words=4).replace(".", "")
        else:
            bigram = " ".join(np.random.choice(twi_words, 4))

        test_bigrams.append(bigram)
        gen_true_labels.append(lang_choice)

    gen_preds = model.predict_batch(test_bigrams)
    acc_generated = accuracy_score(gen_true_labels, gen_preds)

    # --- FINAL OUTPUT ---
    print("\n" + "="*30)
    print(f"RESULTS")
    print("="*30)
    print(f"Accuracy (CSV Dataset Test): {acc_dataset:.4f}")
    print(f"Accuracy (Generated Sentences): {acc_generated:.4f}")
    print("="*30)

    print("\nDetailed Report (Generated Sentences):")
    print(classification_report(gen_true_labels, gen_preds, zero_division=0))

    # Validation Sample
    val_texts = ["Yɛreyɛ nhyehyɛeɛ sɛ yɛbɛkɔ mmepɔw so ɔsram a ɛdi hɔ yi.", "The weather is nice", "C'est la vie"]
    val_preds = model.predict_batch(val_texts)
    print(f"\nLive Samples: {list(zip(val_texts, val_preds))}")

if __name__ == "__main__":
    main()


RESULTS
Accuracy (CSV Dataset Test): 0.8298
Accuracy (Generated Sentences): 0.7158

Detailed Report (Generated Sentences):
              precision    recall  f1-score   support

         eng       0.56      0.93      0.70     33175
         fra       0.99      0.25      0.40     33378
         twi       0.88      0.97      0.92     33447

    accuracy                           0.72    100000
   macro avg       0.81      0.72      0.67    100000
weighted avg       0.81      0.72      0.67    100000


Live Samples: [('Yɛreyɛ nhyehyɛeɛ sɛ yɛbɛkɔ mmepɔw so ɔsram a ɛdi hɔ yi.', 'twi'), ('The weather is nice', 'eng'), ("C'est la vie", 'fra')]
