In [1]:
# Install the necessary packages
!pip install -U datasets



In [2]:
from datasets import load_dataset, concatenate_datasets, Dataset
import gc
import json
import pandas as pd
from tqdm.auto import tqdm

In [3]:
# Per-language configuration:
#   "dataset"   - Hugging Face Hub dataset ids that are loaded and concatenated for the language
#   "lang_code" - language code handed to the translation tokenizer
# NOTE(review): the published M2M100 language list does not appear to include
# "te", "as" or "kok" - confirm before using them as translation source/target.
languages = {
    "Hindi": {"dataset": ["ravithejads/samvaad-hi-filtered", "HydraIndicLM/hindi_alpaca_dolly_67k"], "lang_code": "hi"},
    "Telugu": {"dataset": ["Telugu-LLM-Labs/telugu_alpaca_yahma_cleaned_filtered_romanized", "Telugu-LLM-Labs/telugu_teknium_GPTeacher_general_instruct_filtered_romanized"], "lang_code": "te"},
    # Previously pointed at the Sindhi dataset (copy-paste slip); Marathi has its own dataset.
    "Marathi": {"dataset": ["Telugu-LLM-Labs/marathi_alpaca_yahma_cleaned_filtered"], "lang_code": "mr"},
    "Urdu": {"dataset": ["Telugu-LLM-Labs/urdu_alpaca_yahma_cleaned_filtered"], "lang_code": "ur"},
    "Assamese": {"dataset": ["Telugu-LLM-Labs/assamese_alpaca_yahma_cleaned_filtered"], "lang_code": "as"},
    "Konkani": {"dataset": ["Telugu-LLM-Labs/konkani_alpaca_yahma_cleaned_filtered"], "lang_code": "kok"},
    "Nepali": {"dataset": ["Telugu-LLM-Labs/nepali_alpaca_yahma_cleaned_filtered"], "lang_code": "ne"},
    "Sindhi": {"dataset": ["Telugu-LLM-Labs/sindhi_alpaca_yahma_cleaned_filtered"], "lang_code": "sd"},
    "Tamil": {"dataset": ["abhinand/tamil-alpaca"], "lang_code": "ta"},
    "Kannada": {"dataset": ["Tensoic/airoboros-3.2_kn", "Tensoic/gpt-teacher_kn"], "lang_code": "kn"},
    "Malayalam": {"dataset": ["VishnuPJ/Alpaca_Instruct_Malayalam"], "lang_code": "ml"},
    "Gujarati": {"dataset": ["Tensoic/Alpaca-Gujarati"], "lang_code": "gu"},
    "Punjabi": {"dataset": ["HydraIndicLM/punjabi_alpaca_52K"], "lang_code": "pa"},
    "Bengali": {"dataset": ["HydraIndicLM/bengali_alpaca_dolly_67k"], "lang_code": "bn"},
    "Odia": {"dataset": ["OdiaGenAI/Odia_Alpaca_instructions_52k", "OdiaGenAI/gpt-teacher-roleplay-odia-3k"], "lang_code": "or"},
    "English": {"dataset": ["yahma/alpaca-cleaned"], "lang_code": "en"},
}

In [4]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Single source of truth for the checkpoint id, so the model and the tokenizer
# are always loaded from the same checkpoint.
M2M100_CHECKPOINT = "facebook/m2m100_418M"

model = M2M100ForConditionalGeneration.from_pretrained(M2M100_CHECKPOINT)
tokenizer = M2M100Tokenizer.from_pretrained(M2M100_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
def translate(text, src_lang, target_lang):
    """Translate ``text`` from ``src_lang`` into ``target_lang``.

    Uses the notebook-level ``model`` and ``tokenizer``.

    Args:
        text: A string, or a list of strings to translate as one batch.
        src_lang: Language code of the input; assigned to ``tokenizer.src_lang``.
        target_lang: Language code to generate; resolved with
            ``tokenizer.get_lang_id``.

    Returns:
        The list produced by ``tokenizer.batch_decode`` for the generated
        sequences (special tokens skipped). A single input string therefore
        still comes back wrapped in a list.
    """
    tokenizer.src_lang = src_lang

    # Resolve the target language before tokenizing so an unknown code fails
    # before any model work is done.
    target_lang_id = tokenizer.get_lang_id(target_lang)

    # padding=True allows a list of texts of different lengths to be stacked
    # into one tensor; for a single string there is nothing to pad.
    encoded_text = tokenizer(text, return_tensors="pt", padding=True)

    # Keep the inputs on the same device as the model so generation also
    # works after the model has been moved off the CPU.
    encoded_text = encoded_text.to(model.device)

    generated_tokens = model.generate(**encoded_text, forced_bos_token_id=target_lang_id)
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

In [6]:
def download_dataset(language):
    """Load every dataset configured for ``language`` into one DataFrame.

    Args:
        language: Key into the notebook-level ``languages`` dict.

    Returns:
        A pandas DataFrame containing the ``train`` split of each configured
        dataset, stacked in configuration order with a fresh 0..n-1 index.

    Raises:
        KeyError: If ``language`` is not a key of ``languages``.
    """
    frames = [
        load_dataset(dataset_name, split="train").to_pandas()
        for dataset_name in languages[language]['dataset']
    ]
    # ignore_index=True renumbers the rows; without it the stacked frames keep
    # their own labels, so the result has duplicate index values whenever a
    # language has more than one dataset.
    return pd.concat(frames, ignore_index=True)

In [7]:
def split_translate(language, df, column):
    """Translate ``df[column]`` into the other configured languages, one slice of rows per language.

    The rows are split, in order, into contiguous groups of equal size, one per
    usable target language; the last group additionally takes the leftover
    rows. Each group is translated from ``language`` into its target language
    with ``translate``.

    Args:
        language: Key into ``languages`` naming the source language of ``df``.
        df: DataFrame holding the text. It is modified in place.
        column: Name of the column in ``df`` to translate.

    Returns:
        The same ``df`` object with a new ``translated_<column>`` column. Each
        cell holds the value ``translate`` returned for that row.
        NOTE(review): ``translate`` returns the list from ``batch_decode``, so
        cells are lists rather than plain strings - confirm that downstream
        code expects this.

    Raises:
        ValueError: If none of the other configured languages can be generated
            by the tokenizer.
    """
    import warnings

    # Source language code for the input rows.
    src_lang = languages[language]['lang_code']

    # Every configured language except the source is a candidate target.
    candidate_langs = [cfg['lang_code'] for name, cfg in languages.items() if name != language]

    # Keep only the targets the tokenizer knows. An unknown code would
    # otherwise raise KeyError in the middle of the run, after earlier groups
    # have already been translated.
    target_langs = []
    for lang_code in candidate_langs:
        try:
            tokenizer.get_lang_id(lang_code)
        except KeyError:
            warnings.warn(f"Skipping target language '{lang_code}': not supported by the tokenizer")
            continue
        target_langs.append(lang_code)

    if not target_langs:
        raise ValueError(f"No supported target languages available for source language '{language}'")

    # Rows per target language; the remainder goes to the last language.
    group_size = len(df) // len(target_langs)
    last_group = len(target_langs) - 1

    translated_texts = []
    for i, target_lang in enumerate(target_langs):
        start_index = i * group_size
        end_index = len(df) if i == last_group else start_index + group_size

        # Positional slice, so the frame's index labels do not matter here.
        text_group = df[column].iloc[start_index:end_index]

        translated_texts.extend(
            translate(text, src_lang, target_lang)
            for text in tqdm(text_group, desc=f'Translating to {target_lang}')
        )

    # The groups cover every row exactly once, so the list lines up with df.
    df[f'translated_{column}'] = translated_texts

    return df


In [8]:
# Fetch the English source corpus; the bare name on the last line renders the
# frame as the cell output.
df = download_dataset(language="English")
df

Unnamed: 0,output,input,instruction
0,1. Eat a balanced and nutritious diet: Make su...,,Give three tips for staying healthy.
1,"The three primary colors are red, blue, and ye...",,What are the three primary colors?
2,An atom is the basic building block of all mat...,,Describe the structure of an atom.
3,There are several ways to reduce air pollution...,,How can we reduce air pollution?
4,I had to make a difficult decision when I was ...,,Pretend you are a project manager of a constru...
...,...,...,...
51755,Yes,Text: John went out for a walk with his dog Ro...,You will be given a piece of text about an eve...
51756,True,Text: Michael Jordan is an American former pro...,You will be given a paragraph of text with var...
51757,True,Text: A tree fell over in the wind and caused ...,You will be given a piece of text about an eve...
51758,Backwards,"Steps: ['She takes out her books', 'The teache...",I will give you a list of steps. You need to ...


In [10]:
# Translate the English instructions, one slice of rows per target language.
trans_df = split_translate(
    language="English",
    df=df,
    column="instruction",
)