<a href="https://colab.research.google.com/github/ephrem823/ML/blob/main/machine%20translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
!pip install transformers sentencepiece scikit-learn pandas torch --quiet

In [28]:
import pandas as pd
import numpy as np
import re
import unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

In [29]:
# Location of the Amharic-English parallel corpus (a path under /content).
file_path = "/content/amharic_english_mt_10000.json"

# Parse the JSON corpus into a DataFrame.
df = pd.read_json(file_path, encoding="utf-8")

# Every later cell indexes these two columns, so fail here with a clear
# message instead of a bare KeyError further down the notebook.
assert {"src", "tgt"} <= set(df.columns), "corpus must provide 'src' and 'tgt' columns"

df.head()


Unnamed: 0,src,tgt
0,መጓጓዣ እየተማርኩ ነው,I am learning the travel
1,ስራ በዛሬ ቀን ተከናውኗል,The work happened today
2,ገበሬ ተጠቃሚ ነው,The farmer is useful
3,እባክህ ምግብ አሳየኝ,Please show me the food
4,ገበሬ በዛሬ ቀን ተከናውኗል,The farmer happened today


In [49]:
def clean(text):
    """Trim both ends of ``text`` and squeeze each whitespace run to one space."""
    return re.sub(r"\s+", " ", text.strip())

def normalize(text):
    """Return ``text`` converted to Unicode normalization form NFKC."""
    nfkc_text = unicodedata.normalize("NFKC", text)
    return nfkc_text

def _tidy(sentence):
    """Clean, NFKC-normalize, then re-collapse whitespace in one sentence."""
    return " ".join(normalize(clean(sentence)).split())


# Cleaned and normalized copies of both sides of the parallel corpus.
df["src_clean"] = df["src"].apply(_tidy)
df["tgt_clean"] = df["tgt"].apply(_tidy)

# The TF-IDF input columns are the cleaned text itself; no extra
# tokenization happens at this stage.
df["src_preprocessed"] = df["src_clean"]
df["tgt_preprocessed"] = df["tgt_clean"]

df.head(2)

Unnamed: 0,src,tgt,src_clean,tgt_clean,src_preprocessed,tgt_preprocessed
0,መጓጓዣ እየተማርኩ ነው,I am learning the travel,መጓጓዣ እየተማርኩ ነው,I am learning the travel,መጓጓዣ እየተማርኩ ነው,I am learning the travel
1,ስራ በዛሬ ቀን ተከናውኗል,The work happened today,ስራ በዛሬ ቀን ተከናውኗል,The work happened today,ስራ በዛሬ ቀን ተከናውኗል,The work happened today


In [31]:
# One vectorizer per language, each fitted on its own column, so the two
# sides of the corpus get separate vocabularies and IDF weights.
vectorizer_src, vectorizer_tgt = TfidfVectorizer(), TfidfVectorizer()

tfidf_src = vectorizer_src.fit_transform(df["src_preprocessed"])
tfidf_tgt = vectorizer_tgt.fit_transform(df["tgt_preprocessed"])

In [32]:
# Checkpoint used for translation.
model_name = "facebook/nllb-200-distilled-600M"

# Run on the GPU when PyTorch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the tokenizer and the seq2seq model, placing the model on `device`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [44]:
def translate_nllb(text, src_lang, tgt_lang):
    """Translate ``text`` using the module-level ``tokenizer`` and ``model``.

    Args:
        text: Text to translate.
        src_lang: Language code of the input, e.g. ``"amh_Ethi"``.
        tgt_lang: Language code of the desired output, e.g. ``"eng_Latn"``.

    Returns:
        The first decoded sequence produced by ``model.generate``.

    Raises:
        ValueError: If ``tgt_lang`` maps to the tokenizer's unknown-token id.
    """
    # Tell the tokenizer which language the input is written in.
    tokenizer.src_lang = src_lang

    # Encode the input and move the tensors to the model's device.
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)

    # Look the target-language code up as a vocabulary token. The previous
    # call, tokenizer.get_lang_id(), is an M2M100 tokenizer helper and is not
    # available on the NLLB tokenizer loaded in this notebook.
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    if forced_bos_token_id == tokenizer.unk_token_id:
        raise ValueError(f"Unknown target language code: {tgt_lang!r}")

    # Beam search, with the first generated token forced to the target language.
    generated_tokens = model.generate(
        **encoded,
        forced_bos_token_id=forced_bos_token_id,
        max_length=200,
        num_beams=5
    )

    # Only the first decoded sequence is returned.
    decoded = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return decoded[0]


In [53]:
def translate_nllb(text, src_lang, tgt_lang):
    """Translate ``text`` using the module-level ``tokenizer`` and ``model``.

    Args:
        text: Text to translate.
        src_lang: Language code of the input, e.g. ``"amh_Ethi"``.
        tgt_lang: Language code of the desired output, e.g. ``"eng_Latn"``.

    Returns:
        The first decoded sequence produced by ``model.generate``.

    Raises:
        ValueError: If ``tgt_lang`` maps to the tokenizer's unknown-token id.
    """
    tokenizer.src_lang = src_lang
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)

    # Resolve the target-language code as a vocabulary token. The
    # tokenizer.lang_code_to_id mapping used before is deprecated and is
    # missing from recent transformers releases.
    target_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    if target_lang_id == tokenizer.unk_token_id:
        raise ValueError(f"Unknown target language code: {tgt_lang!r}")

    # Beam search, with the first generated token forced to the target language.
    generated_tokens = model.generate(
        **encoded,
        forced_bos_token_id=target_lang_id,
        max_length=200,
        num_beams=5
    )
    # Only the first decoded sequence is returned.
    decoded = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return decoded[0]

user_question = input("Enter your question (Amharic or English): ")

# Any character in the range U+1200-U+137F marks the question as Amharic;
# everything else is treated as English. The choice selects which TF-IDF
# index to search and the translation direction.
if any('\u1200' <= c <= '\u137F' for c in user_question):
    lang = "src"
    tfidf = tfidf_src
    vectorizer = vectorizer_src
    src_lang_code = "amh_Ethi"
    tgt_lang_code = "eng_Latn"
else:
    lang = "tgt"
    tfidf = tfidf_tgt
    vectorizer = vectorizer_tgt
    src_lang_code = "eng_Latn"
    tgt_lang_code = "amh_Ethi"

# Run the query through the same clean/normalize steps as the corpus columns
# the vectorizers were fitted on, so both sides use one text representation.
query_text = " ".join(normalize(clean(user_question)).split())
user_vec = vectorizer.transform([query_text])

# Similarity of the query to every corpus row; keep the best-scoring row position.
scores = cosine_similarity(user_vec, tfidf)[0]
best_index = np.argmax(scores)


# Matches scoring below this value are reported as "No Answer".
similarity_threshold = 0.5
model_name_nllb = "facebook/nllb-200-distilled-600M"

# Reuse the tokenizer and model that are already loaded when they come from
# the same checkpoint; loading a second copy only doubles memory use.
if model_name_nllb == model_name:
    tokenizer_nllb, model_nllb = tokenizer, model
else:
    tokenizer_nllb = AutoTokenizer.from_pretrained(model_name_nllb)
    model_nllb = AutoModelForSeq2SeqLM.from_pretrained(model_name_nllb)
    model_nllb = model_nllb.to(device)

def translate_nllb_corrected(text, src_lang, tgt_lang, tokenizer_nllb, model_nllb):
    """Translate ``text`` with an explicitly supplied tokenizer and model.

    Args:
        text: Text to translate.
        src_lang: Language code of the input, e.g. ``"amh_Ethi"``.
        tgt_lang: Language code of the desired output, e.g. ``"eng_Latn"``.
        tokenizer_nllb: Tokenizer used to encode the input and decode the output.
        model_nllb: Model whose ``generate`` method produces the translation.

    Returns:
        The first decoded sequence produced by ``model_nllb.generate``.

    Raises:
        ValueError: If ``tgt_lang`` maps to the tokenizer's unknown-token id.
    """
    tokenizer_nllb.src_lang = src_lang
    # Tensors are moved to the module-level `device`.
    encoded = tokenizer_nllb(text, return_tensors="pt", padding=True, truncation=True).to(device)

    # Look the language code up as a single vocabulary token. Encoding it as
    # text and taking element [0] would silently return the first subword
    # piece for a misspelled or unsupported code.
    target_lang_id = tokenizer_nllb.convert_tokens_to_ids(tgt_lang)
    if target_lang_id == tokenizer_nllb.unk_token_id:
        raise ValueError(f"Unknown target language code: {tgt_lang!r}")

    # Beam search, with the first generated token forced to the target language.
    generated_tokens = model_nllb.generate(
        **encoded,
        forced_bos_token_id=target_lang_id,
        max_length=200,
        num_beams=5
    )
    # Only the first decoded sequence is returned.
    decoded = tokenizer_nllb.batch_decode(generated_tokens, skip_special_tokens=True)
    return decoded[0]

# Below the threshold the best match is considered unreliable and every
# answer field is filled with the "No Answer" placeholder.
if scores[best_index] < similarity_threshold:
    translated_answer = "No Answer"
    matched_src = "No Answer"
    matched_tgt = "No Answer"
else:
    # best_index is a row *position* (it comes from np.argmax over the
    # similarity scores), so select rows positionally with .iloc instead of
    # relying on the DataFrame's index labels matching positions.
    matched_src = df["src"].iloc[best_index]
    matched_tgt = df["tgt"].iloc[best_index]
    # Translate the matched corpus sentence in the detected language.
    translated_answer = translate_nllb_corrected(
        df[lang].iloc[best_index],
        src_lang=src_lang_code,
        tgt_lang=tgt_lang_code,
        tokenizer_nllb=tokenizer_nllb,
        model_nllb=model_nllb,
    )

# One-row summary of the query, the retrieved pair and the translation.
user_df = pd.DataFrame([{
    "User Question": user_question,
    "Detected Language": lang,
    "Matched Amharic": matched_src,
    "Matched English": matched_tgt,
    "Similarity Score": scores[best_index],
    "Translated Answer": translated_answer
}])

user_df.head()

Enter your question (Amharic or English): I am learning the travel


Unnamed: 0,User Question,Detected Language,Matched Amharic,Matched English,Similarity Score,Translated Answer
0,I am learning the travel,tgt,መጓጓዣ እየተማርኩ ነው,I am learning the travel,1.0,መንገዱን እየተማርኩ ነው
