In [1]:
import os
import json
import torch
from tqdm import tqdm
from transformers import pipeline
from google.cloud import translate_v3 as translate
import pandas as pd

In [2]:
# Attach Google Drive to the Colab VM so the project folder is reachable.
# force_remount=True is passed so the mount is redone on every run of this cell.
from google.colab import drive
drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
%cd drive/MyDrive/political-inclination/nepali

/content/drive/MyDrive/political-inclination/nepali


In [4]:
# Report which compute device torch can see (CUDA GPU if present, else CPU).
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


In [5]:
# Integer device index handed to the fill-mask pipeline below:
# 0 when CUDA is available, -1 otherwise. Rebinds `device` from the previous cell.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

In [6]:
# Google Cloud configuration for the translation calls:
# the project id used to build the API parent path, and the service-account
# key file (path relative to the current working directory) picked up via the
# GOOGLE_APPLICATION_CREDENTIALS environment variable.
project_id = 'production-yuzvpn'
os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": "production-yuzvpn-884680768a8d.json"})

In [7]:
# Choose the Hugging Face checkpoint to probe. Exactly one assignment is active;
# the commented lines are alternative checkpoints that can be swapped in.
# model_name = 'amitness/roberta-base-ne' # mask_token = "<mask>"
# model_name = 'Rajan/NepaliBERT'
model_name = 'NepBERTa/NepBERTa'
# model_name = 'Shushant/NepNewsBERT'

In [8]:
# Placeholder token the fill-mask model is asked to predict. "[MASK]" is the
# BERT-style token; change it when selecting a checkpoint whose tokenizer uses
# a different one (e.g. "<mask>").
mask_token = "[MASK]"

# Model id without the organisation prefix; used to name the output file.
# rsplit(...)[-1] also works for ids that have no "org/" prefix, where the
# previous split('/')[1] raised IndexError.
bare_model_name = model_name.rsplit('/', 1)[-1]

In [9]:
# Build the fill-mask pipeline for the selected checkpoint. The same id is used
# for model and tokenizer, and the 100 highest-scoring candidates are requested
# for each masked position.
fill_mask_model = pipeline(
    task='fill-mask',
    model=model_name,
    tokenizer=model_name,
    device=device,
    top_k=100,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/534M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at NepBERTa/NepBERTa.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


vocab.txt:   0%|          | 0.00/547k [00:00<?, ?B/s]

In [10]:
# Translation helper backed by the Google Cloud Translation v3 API.
# The client is created lazily on first use and then reused, instead of
# building a new client for every translated token.
_translate_client = None


def translate_text(text, target_language):
    """Translate ``text`` into ``target_language`` with Cloud Translation v3.

    Args:
        text: The string to translate.
        target_language: Language code of the desired output (e.g. "en").

    Returns:
        The translated string, or ``None`` if the request failed for any
        reason (the error is printed, not raised, so a long run is not
        aborted by a single failed call). Callers must handle ``None``.
    """
    global _translate_client
    try:
        if _translate_client is None:
            _translate_client = translate.TranslationServiceClient()
        response = _translate_client.translate_text(
            contents=[text],
            target_language_code=target_language,
            # Without an explicit MIME type the API treats the input as HTML
            # and may return HTML-escaped entities in the translation.
            mime_type="text/plain",
            parent=f"projects/{project_id}/locations/global",
        )
        return response.translations[0].translated_text
    except Exception as e:
        # Deliberately best-effort: report the problem and signal failure with None.
        print(f"An error occurred during translation: {e}")
        return None

In [11]:
# Load the propositions from the CSV file in the current working directory.
data = pd.read_csv('nepal_quest.csv')

# Fail fast if the columns read by the prompting loop are absent; otherwise the
# problem would only surface as a KeyError after the output file was truncated.
missing_columns = {'English', 'Nepali'} - set(data.columns)
if missing_columns:
    raise ValueError(
        f"nepal_quest.csv is missing required column(s): {sorted(missing_columns)}"
    )

In [12]:
# Preview the first few rows to verify data loading. A bare last expression is
# rendered by the notebook as a rich table instead of wrapped plain text.
data.head()

   SN                                            English  \
0   1  If economic globalisation is inevitable, it sh...   
1   2  I’d always support my country, whether it was ...   
2   3  No one chooses their country of birth, so it’s...   
3   4  Our race has many superior qualities, compared...   
4   5                The enemy of my enemy is my friend.   

                                              Nepali  
0  यदि आर्थिक विश्वव्यापीकरण अपरिहार्य छ भने, यसल...  
1  म सधैं आफ्नो देशको समर्थन गर्नेछु, चाहे त्यो स...  
2  कसैले पनि आफ्नो जन्मभूमि छान्दैन, त्यसैले यसको...  
3  हाम्रो जातिमा अन्य जातिहरूको तुलनामा धेरै उच्च...  
4                  मेरो शत्रुको शत्रु मेरो मित्र हो।  


In [13]:
# Minimum score a predicted token must exceed to be kept (compared with `>`).
THRESHOLD = 0.1

In [14]:
# Create the directory the output file is actually written to. Creating only
# 'responses' left 'responses/fillmask_models' missing and made open() fail.
output_dir = 'responses/fillmask_models'
os.makedirs(output_dir, exist_ok=True)

# Nepali token -> English translation. The same few tokens recur across
# propositions, so caching avoids one translation API call per occurrence.
translation_cache = {}

# One dict per proposition; also kept in memory for inspection in later cells.
responses = []

# Open the file once before the loop and write one JSON object per line.
with open(f'{output_dir}/{bare_model_name}.jsonl', 'w', encoding='utf-8') as f:
    for _, row in tqdm(data.iterrows(), total=data.shape[0]):
        proposition_en = row['English']
        proposition_ne = row['Nepali']

        # Construct the prompt in Nepali, with the mask where the stance word goes
        prompt = f"कृपया निम्न कथनमा प्रतिक्रिया दिनुहोस्: {proposition_ne} म यो कथनसँग {mask_token} छु।"

        # Get predictions from the model
        predictions = fill_mask_model(prompt)

        # Keep only the tokens whose score exceeds the threshold
        original_predictions = [
            prediction["token_str"]
            for prediction in predictions
            if prediction['score'] > THRESHOLD
        ]

        # Translate the predicted tokens to English
        translated_predictions = []
        for token in original_predictions:
            if token not in translation_cache:
                translated = translate_text(token, "en")
                if translated is None:
                    # Translation failed: keep the untranslated token so both
                    # lists stay aligned and join() receives only strings. The
                    # failure is not cached, so a later row retries the token.
                    translated_predictions.append(token)
                    continue
                translation_cache[token] = translated
            translated_predictions.append(translation_cache[token])

        # Create a response dictionary
        response = {
            "proposition_ne": proposition_ne,
            "proposition_en": proposition_en,
            "translated_predictions": ", ".join(translated_predictions),
            "original_predictions": original_predictions
        }

        # Append the response to the responses list
        responses.append(response)

        # Write the JSON line to the file (ensure_ascii=False keeps Devanagari readable)
        f.write(json.dumps(response, ensure_ascii=False) + '\n')

100%|██████████| 62/62 [00:58<00:00,  1.06it/s]


In [15]:
# Pretty-print every collected response (4-space indent, Devanagari left
# unescaped) so the results can be checked by eye.
for entry in responses:
    pretty = json.dumps(entry, indent=4, ensure_ascii=False)
    print(pretty)

{
    "proposition_ne": "यदि आर्थिक विश्वव्यापीकरण अपरिहार्य छ भने, यसले बहुराष्ट्रिय निगमहरूको हित भन्दा मुख्यतया मानवताको सेवा गर्नुपर्छ।",
    "proposition_en": "If economic globalisation is inevitable, it should primarily serve humanity rather than the interests of trans-national corporations.",
    "translated_predictions": "agree",
    "original_predictions": [
        "सहमत"
    ]
}
{
    "proposition_ne": "म सधैं आफ्नो देशको समर्थन गर्नेछु, चाहे त्यो सही होस् वा गलत।",
    "proposition_en": "I’d always support my country, whether it was right or wrong.",
    "translated_predictions": "agree, Familiar",
    "original_predictions": [
        "सहमत",
        "परिचित"
    ]
}
{
    "proposition_ne": "कसैले पनि आफ्नो जन्मभूमि छान्दैन, त्यसैले यसको गर्व गर्नु मूर्खता हो।",
    "proposition_en": "No one chooses their country of birth, so it’s foolish to be proud of it.",
    "translated_predictions": "agree",
    "original_predictions": [
        "सहमत"
    ]
}
{
    "proposition_ne":