In [2]:

!pip install deep_translator

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep_translator
Successfully installed deep_translator-1.11.4


In [None]:
import os
import pandas as pd
import configparser
from deep_translator import GoogleTranslator
import time
import logging

# Logging: INFO level, every message prefixed with a timestamp
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)

# Working directories (relative to the current working directory)
source_dir_path, translated_dir_path, progress_dir_path = "source", "translated", "progress"

# Translation settings
translated_dialect = "hi"  # target language code (Hindi)
chunk_size = 30  # dataset rows handled per chunk
max_translation_chunk_length = 4_000  # longest text passed to a single translate call
api_limit = 5_000  # character limit per request (not referenced elsewhere in this cell)

# Ensure necessary directories exist
def ensure_directories_exist():
    """Create the source, translated and progress directories when they are missing."""
    for directory in (source_dir_path, translated_dir_path, progress_dir_path):
        os.makedirs(directory, exist_ok=True)

# Store dataset as CSV and start translation
def store_sft_dataset(name_of_dataset, data_frame, split_type):
    """Persist the dataset as CSV (only if not already on disk), then translate it.

    Args:
        name_of_dataset: Dataset name; any '/' is removed before it is used in file names.
        data_frame: DataFrame holding the dataset rows.
        split_type: Suffix appended to the dataset name in the CSV file name.
    """
    csv_stem = f"{name_of_dataset.replace('/', '')}{split_type}"
    file_name = os.path.join(source_dir_path, f"{csv_stem}.csv")

    source_already_saved = os.path.isfile(file_name)
    if not source_already_saved:
        logging.info(f"Creating file: {file_name}")
        data_frame.to_csv(file_name, encoding="utf-8", index=False, header=True)
        logging.info(f"File written: {file_name}")

    # Translation always runs, whether or not the source CSV was just written.
    translate_in_dialects(name_of_dataset, data_frame, split_type, translated_dialect)

# Translate text with retries for reliability
def translate_text_with_retries(text, dest_language, retries=3, delay=10):
    """Translate ``text`` into ``dest_language``, retrying on failure.

    Text longer than ``max_translation_chunk_length`` is delegated to
    ``translate_large_text``; shorter text goes straight to ``GoogleTranslator``.

    Args:
        text: The string to translate.
        dest_language: Target language code.
        retries: Maximum number of attempts.
        delay: Seconds to wait between attempts.

    Returns:
        The translated text.

    Raises:
        TypeError: If ``text`` is not a string.
        Exception: If every attempt fails; the last underlying error is chained.
    """
    if not isinstance(text, str):
        # A non-string payload fails identically on every attempt, so retrying
        # (and sleeping between attempts) would only waste time.
        raise TypeError(f"Expected text to be a str, got {type(text).__name__}")

    last_error = None
    for attempt in range(1, retries + 1):
        try:
            if len(text) > max_translation_chunk_length:
                return translate_large_text(text, dest_language)
            return GoogleTranslator(source='auto', target=dest_language).translate(text)
        except Exception as e:
            last_error = e
            logging.error(f"Translation attempt {attempt} failed: {e}")
            # No point in waiting once there is no further attempt to make.
            if attempt < retries:
                time.sleep(delay)
    raise Exception(f"Failed to translate text after {retries} attempts") from last_error

# Handle large texts by chunking them for translation
def translate_large_text(text, dest_language):
    """Translate a long text piece by piece and concatenate the results.

    The text is cut with ``split_text`` into pieces of at most
    ``max_translation_chunk_length`` characters. A piece whose translation
    raises is logged and contributes an empty string, so the result may be
    missing parts of the input.
    """
    def _translate_piece(piece):
        try:
            return GoogleTranslator(source='auto', target=dest_language).translate(piece)
        except Exception as e:
            logging.error(f"Error translating chunk: {e}")
            return ''  # best effort: a failed piece is left out of the result

    pieces = split_text(text, max_translation_chunk_length)
    translated_pieces = [_translate_piece(piece) for piece in pieces]
    return ''.join(translated_pieces)

# Split text into manageable chunks
def split_text(text, max_length):
    """Split ``text`` into consecutive chunks of at most ``max_length`` characters.

    A chunk ends just after the last whitespace character inside its window
    when there is one, so words are not cut in half; a window without any
    usable whitespace is cut at ``max_length``. Whitespace is kept in the
    chunks, so ``''.join(split_text(text, n)) == text`` always holds.

    Args:
        text: The string to split.
        max_length: Maximum chunk length; must be positive.

    Returns:
        List of chunks (empty list for empty text).

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    if max_length <= 0:
        # A negative step would otherwise yield no chunks at all and silently drop the text.
        raise ValueError("max_length must be a positive integer")

    chunks = []
    text_length = len(text)
    start = 0
    while start < text_length:
        end = min(start + max_length, text_length)
        if end < text_length:
            # Walk back from the hard limit to the nearest whitespace; the
            # search stops before `start` so every chunk is non-empty.
            for pos in range(end - 1, start, -1):
                if text[pos].isspace():
                    end = pos + 1
                    break
        chunks.append(text[start:end])
        start = end
    return chunks

# Save progress to a file
def save_progress(progress_file, last_translated_row):
    """Write ``last_translated_row`` to ``progress_file`` as text.

    The value is written to a sibling temporary file first and then moved over
    the target, so an interruption mid-write cannot leave an empty or partial
    progress file behind.
    """
    tmp_file = f"{progress_file}.tmp"
    with open(tmp_file, 'w') as f:
        f.write(str(last_translated_row))
    os.replace(tmp_file, progress_file)

# Load progress from a file
def load_progress(progress_file):
    """Return the integer stored in ``progress_file``, or 0 if the file does not exist.

    Raises:
        ValueError: If the file content is not a valid integer.
    """
    if not os.path.isfile(progress_file):
        return 0
    with open(progress_file, 'r') as f:
        stored_value = f.read()
    return int(stored_value.strip())

# Main translation function, handling datasets
def _translate_message_pairs(messages, dialect_name, row_label):
    """Translate the consecutive (user, assistant) message pairs of one conversation.

    Messages are read two at a time; a pair is translated only when its first
    message has role 'user' and its second has role 'assistant'. A trailing
    unpaired message is ignored. A pair whose translation raises is logged and
    skipped.

    Returns:
        (translated_parts, turn_count): the non-empty translated strings in
        order, and the number of pairs that were translated.
    """
    translated_parts = []
    turn_count = 0

    for i in range(0, len(messages) - 1, 2):
        first, second = messages[i], messages[i + 1]
        user_message = first.get('content', '')
        assistant_message = second.get('content', '')

        if first.get('role') != 'user' or second.get('role') != 'assistant':
            continue

        try:
            translated_user_message = translate_text_with_retries(user_message, dialect_name)
            translated_assistant_message = translate_text_with_retries(assistant_message, dialect_name)
        except Exception as e:
            logging.error(f"Error translating pair at index {row_label}, pair {i // 2 + 1}: {e}")
            continue

        if translated_user_message:
            translated_parts.append(str(translated_user_message))
        if translated_assistant_message:
            translated_parts.append(str(translated_assistant_message))
        turn_count += 1

    return translated_parts, turn_count


def translate_in_dialects(name_of_dataset, data_frame, split_type, dialect_name="hi"):
    """Translate the 'messages' column of ``data_frame`` chunk by chunk into a CSV.

    Each row's messages are translated pairwise and joined with spaces into a
    'conversation' column, alongside a 'number_of_turns' column. Rows are
    processed ``chunk_size`` at a time; after a chunk has been written, the
    position of the next untranslated row is stored in the progress file so a
    later run continues from there.

    Args:
        name_of_dataset: Dataset name; any '/' is removed for file names.
        data_frame: DataFrame with a 'messages' column of message dicts
            carrying 'role' and 'content' keys.
        split_type: Suffix used in the output and progress file names.
        dialect_name: Target language code.
    """
    total_rows = len(data_frame)
    logging.info(f"Total rows to translate: {total_rows}")

    translated_file_name = os.path.join(translated_dir_path, f"{name_of_dataset.replace('/', '')}{split_type}_{dialect_name}_translated.csv")
    progress_file = os.path.join(progress_dir_path, f"{name_of_dataset.replace('/', '')}{split_type}_progress.txt")

    start_row = load_progress(progress_file)

    # A resumed run must append to what is already on disk; only a fresh run
    # (or a resume whose output file has gone missing) starts a new file.
    start_new_file = not (start_row > 0 and os.path.isfile(translated_file_name))

    for chunk_start in range(start_row, total_rows, chunk_size):
        chunk_end = min(chunk_start + chunk_size, total_rows)
        chunk = data_frame.iloc[chunk_start:chunk_end]

        translated_append_list = []
        logging.info(f"Processing rows {chunk_start + 1} to {chunk_end}...")

        for index, row in chunk.iterrows():
            messages = row.get('messages', [])

            # Also rejects NaN / non-sequence cells, which would fail on len().
            if not isinstance(messages, (list, tuple)) or not messages:
                logging.warning(f"Skipping row {index + 1} due to missing 'messages' field.")
                continue

            conversation_pairs, turn_count = _translate_message_pairs(messages, dialect_name, index)

            if conversation_pairs:
                translated_append_list.append({
                    'conversation': ' '.join(conversation_pairs),
                    'number_of_turns': turn_count
                })

        if translated_append_list:
            df = pd.DataFrame(translated_append_list, columns=['conversation', 'number_of_turns'])
            df.to_csv(
                translated_file_name,
                encoding="utf-8",
                index=False,
                header=start_new_file,
                mode='w' if start_new_file else 'a',
            )
            start_new_file = False

            logging.info(f"Finished writing chunk {chunk_start + 1} to {chunk_end} to file {translated_file_name}")
        else:
            logging.info(f"No data to write for chunk {chunk_start + 1} to {chunk_end}.")

        # Record progress only once the chunk is on disk, and store the
        # position of the next row so a resume neither skips nor repeats rows.
        save_progress(progress_file, chunk_end)

if __name__ == "__main__":
    ensure_directories_exist()

    # NOTE(review): machine-specific absolute paths; adjust before running elsewhere.
    config_path = r"C:\Users\Nicole\Downloads\makers lab\SFT\Unconfirmed 786270.crdownload"
    dataset_json_path = r"C:\Users\Nicole\Downloads\makers lab\SFT\source\sharegpt_entries1.json"

    config = configparser.ConfigParser()
    # ConfigParser.read() skips files it cannot open and returns the list of
    # files it actually parsed, so an empty result means nothing was loaded.
    if not config.read(config_path, encoding="utf-8"):
        raise FileNotFoundError(f"Config file not found or unreadable: {config_path}")
    if not config.has_section('sgpt'):
        raise KeyError(f"Section 'sgpt' is missing from config file: {config_path}")

    for key in config['sgpt']:
        if key.lower().strip() == "name":
            name_of_dataset = config['sgpt'][key].replace('"', '')
            splits = {"dataset": dataset_json_path}
            df_1 = pd.read_json(splits["dataset"])

            store_sft_dataset(name_of_dataset, df_1, "dataset")

2024-08-21 20:14:26,103 - Creating file: source\sgptdataset.csv
2024-08-21 20:14:39,219 - File written: source\sgptdataset.csv
2024-08-21 20:14:39,221 - Total rows to translate: 41617
2024-08-21 20:14:39,223 - Processing rows 1 to 30...
2024-08-21 20:19:24,204 - Finished writing chunk 1 to 30 to file translated\sgptdataset_hi_translated.csv
2024-08-21 20:19:24,221 - Processing rows 31 to 60...
2024-08-21 20:23:19,226 - Finished writing chunk 31 to 60 to file translated\sgptdataset_hi_translated.csv
2024-08-21 20:23:19,228 - Processing rows 61 to 90...
2024-08-21 20:27:51,615 - Finished writing chunk 61 to 90 to file translated\sgptdataset_hi_translated.csv
2024-08-21 20:27:51,617 - Processing rows 91 to 120...
2024-08-21 20:31:59,666 - Finished writing chunk 91 to 120 to file translated\sgptdataset_hi_translated.csv
2024-08-21 20:31:59,668 - Processing rows 121 to 150...
2024-08-21 20:36:03,346 - Finished writing chunk 121 to 150 to file translated\sgptdataset_hi_translated.csv
2024-08

In [None]:
import os
import pandas as pd
import configparser
from deep_translator import GoogleTranslator
import time
import torch
import random

# Working directories (relative to the current working directory)
source_dir_path, translated_dir_path, progress_dir_path = "source", "translated", "progress"
config = configparser.ConfigParser(default_section="DATASETS")

# Translation settings
translated_dialect = "hi"  # target language code
chunk_size = 100  # dataset rows handled per chunk

# Use the GPU when torch reports one is available
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def ensure_directories_exist():
    """Create the source, translated and progress directories when they are missing."""
    for directory in (source_dir_path, translated_dir_path, progress_dir_path):
        os.makedirs(directory, exist_ok=True)

def store_sft_dataset(name_of_dataset, data_frame, split_type):
    """Persist the dataset as CSV (only if not already on disk), then translate it.

    Args:
        name_of_dataset: Dataset name; any '/' becomes '_' in file names.
        data_frame: DataFrame holding the dataset rows.
        split_type: Suffix appended to the dataset name in the CSV file name.
    """
    # The suffix comes from split_type (it used to be a fixed "train"), so
    # different splits no longer collide on one source file.
    file_name = str(os.path.join(source_dir_path, name_of_dataset.replace("/", "_"))) + split_type + ".csv"
    if not os.path.isfile(file_name):
        print("Opening file.....", file_name)
        # utf-8-sig writes a BOM at the start of the file
        data_frame.to_csv(file_name, index=False, header=True, encoding='utf-8-sig')
        print("Finished writing file....", file_name)

    translate_in_dialects(name_of_dataset, data_frame, split_type, translated_dialect)

def translate_text_with_retries(text, dest_language, retries=3, delay=5):
    """Translate ``text`` into ``dest_language`` with ``GoogleTranslator``, retrying on failure.

    Args:
        text: The string to translate.
        dest_language: Target language code.
        retries: Maximum number of attempts.
        delay: Seconds to wait between attempts.

    Returns:
        The translated text.

    Raises:
        Exception: If every attempt fails; the last underlying error is chained.
    """
    translator = GoogleTranslator(source='auto', target=dest_language)
    last_error = None
    for attempt in range(1, retries + 1):
        try:
            return translator.translate(text)
        except Exception as e:
            last_error = e
            print(f"Translation attempt {attempt} failed: {e}")
            # Waiting only makes sense when another attempt follows.
            if attempt < retries:
                time.sleep(delay)
    raise Exception(f"Failed to translate text after {retries} attempts") from last_error

def translate_in_dialects(name_of_dataset, data_frame, split_type, dialect_name="hi"):
    """Translate the 'Conversation' and 'Label' columns chunk by chunk into a CSV.

    Rows are processed ``chunk_size`` at a time. Each chunk is sent to the
    translator once (one batch per column) and the result is appended to the
    translated CSV. If that CSV already exists, its row count is used as the
    starting row so an interrupted run can continue.

    Args:
        name_of_dataset: Dataset name; any '/' becomes '_' in file names.
        data_frame: DataFrame with 'Conversation' and 'Label' columns.
        split_type: Suffix used in the output file name.
        dialect_name: Target language code.

    NOTE(review): resuming counts rows already written, so skipped rows or a
    failed chunk shift the resume position relative to the source data.
    """
    def _is_missing(value):
        # NaN/None cells and blank strings cannot be translated.
        return pd.isna(value) or (isinstance(value, str) and not value.strip())

    total_rows = len(data_frame)
    print(f"Total rows to translate: {total_rows}")

    translated_file_name = str(os.path.join(translated_dir_path,
                                 name_of_dataset.replace("/", "_")
                                 )) + split_type + "_" + dialect_name + "_translated.csv"

    # Check how many rows have already been processed
    start_row = 0
    if os.path.isfile(translated_file_name):
        try:
            existing_translations = pd.read_csv(translated_file_name)
            start_row = len(existing_translations)
            if start_row > total_rows:
                print(f"Warning: Existing translations file has more rows ({start_row}) than the dataset ({total_rows}).")
                start_row = 0  # Reset start_row to 0 to reprocess the entire dataset
            else:
                print(f"Resuming from row {start_row + 1}...")
        except Exception as e:
            print(f"Error reading existing translations file: {e}")
            start_row = 0  # Reset if there's an error reading the file

    # Only a run that starts from row 0 may (re)create the file; a resumed run
    # appends, otherwise the rows translated earlier would be overwritten.
    start_new_file = start_row == 0

    translator = GoogleTranslator(source='auto', target=dialect_name)

    for chunk_start in range(start_row, total_rows, chunk_size):
        chunk_end = min(chunk_start + chunk_size, total_rows)
        chunk = data_frame.iloc[chunk_start:chunk_end]

        print(f"Processing rows {chunk_start + 1} to {chunk_end}...")

        # Collect the translatable rows first so the chunk is sent to the
        # translator once, not once per row.
        conversations = []
        labels = []
        for index, val in chunk.iterrows():
            Conversation = val.get("Conversation", "")
            Label = val.get("Label", "")

            if _is_missing(Conversation) or _is_missing(Label):
                print(f"Skipping row {index + 1} due to missing data.")
                continue

            conversations.append(str(Conversation))
            labels.append(str(Label))

        if not conversations:
            continue

        try:
            # Random pause between chunks to stay under the service's rate limit.
            time.sleep(random.uniform(3, 5))

            translated_conversations = translator.translate_batch(conversations)
            translated_labels = translator.translate_batch(labels)
        except Exception as e:
            print(f"Error translating rows {chunk_start + 1} to {chunk_end}: {e}")
            continue

        translated_append_list = [{'Conversation': c, 'Label': l} for c, l in zip(translated_conversations, translated_labels)]

        df = pd.DataFrame(translated_append_list)
        df.to_csv(
            translated_file_name,
            index=False,
            header=start_new_file,
            mode='w' if start_new_file else 'a',
            encoding='utf-8-sig',
        )
        start_new_file = False

        print(f"Finished writing chunk {chunk_start + 1} to {chunk_end} to file {translated_file_name}")


if __name__ == "__main__":
    ensure_directories_exist()
    # Update the path to your actual dataset file
    dataset_path = r"/content/data/fraud_call_dataset.csv"

    # Check if the file exists
    if not os.path.isfile(dataset_path):
        raise FileNotFoundError(f"Dataset file not found: {dataset_path}")

    try:
        # Read the CSV file into a pandas DataFrame with encoding fallback
        try:
            df_1 = pd.read_csv(dataset_path, encoding='utf-8')  # Try reading with UTF-8
        except UnicodeDecodeError:
            print("UTF-8 decoding failed. Retrying with ISO-8859-1 encoding...")
            df_1 = pd.read_csv(dataset_path, encoding='ISO-8859-1')  # Fallback to ISO-8859-1
    except Exception as e:
        print(f"Error reading dataset file: {e}")
    else:
        # Translation failures are reported separately so they are not
        # mistaken for a problem with reading the dataset.
        name_of_dataset = "fraud_call_dataset"
        try:
            store_sft_dataset(name_of_dataset, df_1, "dataset")
        except Exception as e:
            print(f"Error translating dataset: {e}")


Total rows to translate: 20075
Resuming from row 71...
Processing rows 71 to 170...


KeyboardInterrupt: 

In [14]:
import os
import pandas as pd
import time
import random
from deep_translator import DeeplTranslator

# Working directories (relative to the current working directory)
SOURCE_DIR, TRANSLATED_DIR, PROGRESS_DIR = "source", "translated", "progress"

# Tunables
TRANSLATED_LANG = "hi"  # target language code (Hindi)
CHUNK_SIZE = 5000  # dataset rows handled per chunk
MAX_RETRIES = 3  # attempts per batch
INITIAL_DELAY = 5  # first backoff delay, in seconds
MAX_DELAY = 30  # upper bound for the backoff delay, in seconds

# Create the working directories up front
for _directory in (SOURCE_DIR, TRANSLATED_DIR, PROGRESS_DIR):
    os.makedirs(_directory, exist_ok=True)

# The DeepL key is taken from the environment; the cell stops here without it
DEEPL_API_KEY = os.getenv("DEEPL_API_KEY")
if not DEEPL_API_KEY:
    raise ValueError("DeepL API key not found. Please set the DEEPL_API_KEY environment variable.")

def translate_text(text, target_lang="hi", source_lang="en"):
    """
    Translates text using the DeepL API via deep_translator's DeeplTranslator.

    Args:
        text: The string to translate.
        target_lang: Target language code.
        source_lang: Source language code.
            NOTE(review): defaults to English on the assumption that the
            dataset is English - confirm.

    Returns:
        The translated text.

    NOTE(review): confirm that DeepL supports the requested target language,
    and whether the API key is for the free or the paid endpoint (see
    DeeplTranslator's `use_free_api` option).
    """
    # The imported class is DeeplTranslator; the language pair is configured
    # on the translator rather than passed to translate().
    translator = DeeplTranslator(api_key=DEEPL_API_KEY, source=source_lang, target=target_lang)
    return translator.translate(text)

def translate_batch_with_retries(text_list, target_lang="hi", max_retries=MAX_RETRIES):
    """
    Translates a batch of text with retries and exponential backoff.

    Texts are translated one by one with ``translate_text``. When a text
    fails, the next attempt continues from that text instead of starting the
    batch over, so already translated texts are not sent again.

    Args:
        text_list: List of strings to translate.
        target_lang: Target language code.
        max_retries: Maximum number of attempts.

    Returns:
        The list of translated texts, or ``text_list`` unchanged when the
        batch could not be completed within ``max_retries`` attempts.
    """
    translated_texts = []
    delay = INITIAL_DELAY
    for attempt in range(1, max_retries + 1):
        try:
            # Continue after the last text that was translated successfully.
            for text in text_list[len(translated_texts):]:
                translated_texts.append(translate_text(text, target_lang))
            return translated_texts
        except Exception as e:
            print(f"Batch translation failed (attempt {attempt}): {e}")
            # Back off only when another attempt follows.
            if attempt < max_retries:
                time.sleep(delay)
                delay = min(delay * 2, MAX_DELAY)
    print("Failed to translate batch after multiple attempts.")
    return text_list

def translate_dataset(name_of_dataset, data_frame, split_type):
    """
    Translates a dataset in large chunks while handling API limits.

    The 'Conversation' and 'Label' columns are translated ``CHUNK_SIZE`` rows
    at a time and appended to the translated CSV. If that CSV already exists,
    its row count is used as the starting row so an interrupted run can
    continue.

    Args:
        name_of_dataset: Dataset name used in the output file name.
        data_frame: DataFrame with 'Conversation' and 'Label' columns.
        split_type: Suffix used in the output file name.

    NOTE(review): resuming counts rows already written, so rows dropped for
    missing values shift the resume position relative to the source data.
    """
    total_rows = len(data_frame)
    print(f"Total rows to translate: {total_rows}")

    translated_file_path = os.path.join(TRANSLATED_DIR, f"{name_of_dataset}_{split_type}_hi_translated.csv")

    # Determine start row to resume progress
    start_row = 0
    if os.path.isfile(translated_file_path):
        try:
            existing_data = pd.read_csv(translated_file_path)
            start_row = len(existing_data)
            if start_row > total_rows:
                print(f"Warning: Translated file has more rows ({start_row}) than dataset ({total_rows}). Resetting progress.")
                start_row = 0
            else:
                print(f"Resuming from row {start_row + 1}...")
        except Exception as e:
            print(f"Error reading existing translations: {e}")
            start_row = 0

    # Only a run that starts from row 0 may (re)create the file; a resumed run
    # appends, otherwise the rows translated earlier would be overwritten.
    start_new_file = start_row == 0

    for chunk_start in range(start_row, total_rows, CHUNK_SIZE):
        chunk_end = min(chunk_start + CHUNK_SIZE, total_rows)
        chunk = data_frame.iloc[chunk_start:chunk_end]

        # Drop a row when either field is missing, so each conversation stays
        # paired with its own label and both lists have the same length.
        complete_rows = chunk.dropna(subset=["Conversation", "Label"])
        conversations = complete_rows["Conversation"].tolist()
        labels = complete_rows["Label"].tolist()

        if not conversations:
            print(f"Skipping empty batch from row {chunk_start + 1} to {chunk_end}.")
            continue

        try:
            print(f"Translating rows {chunk_start + 1} to {chunk_end}...")
            translated_conversations = translate_batch_with_retries(conversations, "hi")
            translated_labels = translate_batch_with_retries(labels, "hi")

            translated_data = pd.DataFrame({"Conversation": translated_conversations, "Label": translated_labels})
            translated_data.to_csv(
                translated_file_path,
                index=False,
                header=start_new_file,
                mode='w' if start_new_file else 'a',
                encoding='utf-8-sig',
            )
            start_new_file = False

            print(f"Saved rows {chunk_start + 1} to {chunk_end}.")
        except Exception as e:
            print(f"Error translating batch: {e}")

        # Randomized pause between chunks
        time.sleep(random.uniform(5, 15))

if __name__ == "__main__":
    dataset_path = r"/content/data/fraud_call_dataset.csv"  # Adjust path

    if not os.path.isfile(dataset_path):
        raise FileNotFoundError(f"Dataset not found: {dataset_path}")

    try:
        # Read dataset with encoding fallback
        try:
            df = pd.read_csv(dataset_path, encoding='utf-8')
        except UnicodeDecodeError:
            print("UTF-8 failed, using ISO-8859-1...")
            df = pd.read_csv(dataset_path, encoding='ISO-8859-1')
    except Exception as e:
        print(f"Error loading dataset: {e}")
    else:
        # Translation failures are reported separately so they are not
        # mistaken for a problem with loading the dataset.
        dataset_name = "fraud_call_dataset"
        try:
            translate_dataset(dataset_name, df, "dataset")
        except Exception as e:
            print(f"Error translating dataset: {e}")


ValueError: DeepL API key not found. Please set the DEEPL_API_KEY environment variable.

In [25]:
import os
import pandas as pd
import time
import random
from deep_translator import GoogleTranslator

# Working directories (relative to the current working directory)
SOURCE_DIR, TRANSLATED_DIR, PROGRESS_DIR = "source", "translated", "progress"

# Tunables
TRANSLATED_LANG = "hi"  # target language code; change to the desired language
CHUNK_SIZE = 500  # dataset rows handled per chunk
MAX_RETRIES = 3  # attempts per batch
INITIAL_DELAY = 5  # first backoff delay, in seconds
MAX_DELAY = 30  # upper bound for the backoff delay, in seconds

# Create the working directories up front
for _directory in (SOURCE_DIR, TRANSLATED_DIR, PROGRESS_DIR):
    os.makedirs(_directory, exist_ok=True)

def translate_batch_with_retries(text_list, target_lang, max_retries=MAX_RETRIES):
    """
    Translates a batch of text with retries and exponential backoff.

    Args:
        text_list: List of strings to translate.
        target_lang: Target language code.
        max_retries: Maximum number of attempts.

    Returns:
        The list of translated texts, or ``text_list`` unchanged when every
        attempt failed (or when there was nothing to translate).
    """
    if not text_list:
        # Nothing to translate; skip the service call and the retry delays.
        return text_list

    translator = GoogleTranslator(source='auto', target=target_lang)
    delay = INITIAL_DELAY

    for attempt in range(1, max_retries + 1):
        try:
            return translator.translate_batch(text_list)
        except Exception as e:
            print(f"Batch translation failed (attempt {attempt}): {e}")
            # Back off only when another attempt follows.
            if attempt < max_retries:
                time.sleep(delay)
                delay = min(delay * 2, MAX_DELAY)  # Exponential backoff with max limit

    print("Failed to translate batch after multiple attempts.")
    return text_list  # Return original text if all attempts fail

def translate_dataset(name_of_dataset, data_frame, split_type, dialect_name="hi"):
    """
    Translates a dataset in large chunks while handling API limits.

    The 'Conversation' and 'Label' columns are translated ``CHUNK_SIZE`` rows
    at a time and appended to the translated CSV. If that CSV already exists,
    its row count is used as the starting row so an interrupted run can
    continue.

    Args:
        name_of_dataset: Dataset name used in the output file name.
        data_frame: DataFrame with 'Conversation' and 'Label' columns.
        split_type: Suffix used in the output file name.
        dialect_name: Target language code.

    NOTE(review): resuming counts rows already written, so rows dropped for
    missing values shift the resume position relative to the source data.
    """
    total_rows = len(data_frame)
    print(f"Total rows to translate: {total_rows}")

    translated_file_path = os.path.join(TRANSLATED_DIR, f"{name_of_dataset}_{split_type}_{dialect_name}_translated.csv")

    # Determine start row to resume progress
    start_row = 0
    if os.path.isfile(translated_file_path):
        try:
            existing_data = pd.read_csv(translated_file_path)
            start_row = len(existing_data)
            if start_row > total_rows:
                print(f"Warning: Translated file has more rows ({start_row}) than dataset ({total_rows}). Resetting progress.")
                start_row = 0
            else:
                print(f"Resuming from row {start_row + 1}...")
        except Exception as e:
            print(f"Error reading existing translations: {e}")
            start_row = 0  # Reset progress if there's an error

    # Only a run that starts from row 0 may (re)create the file; a resumed run
    # appends, otherwise the rows translated earlier would be overwritten.
    start_new_file = start_row == 0

    # Process in chunks of CHUNK_SIZE rows
    for chunk_start in range(start_row, total_rows, CHUNK_SIZE):
        chunk_end = min(chunk_start + CHUNK_SIZE, total_rows)
        chunk = data_frame.iloc[chunk_start:chunk_end]

        # Drop a row when either field is missing, so each conversation stays
        # paired with its own label and both lists have the same length.
        complete_rows = chunk.dropna(subset=["Conversation", "Label"])
        conversations = complete_rows["Conversation"].tolist()
        labels = complete_rows["Label"].tolist()

        if not conversations:
            print(f"Skipping empty batch from row {chunk_start + 1} to {chunk_end}.")
            continue

        try:
            print(f"Translating rows {chunk_start + 1} to {chunk_end}...")
            translated_conversations = translate_batch_with_retries(conversations, dialect_name)
            translated_labels = translate_batch_with_retries(labels, dialect_name)

            translated_data = pd.DataFrame({"Conversation": translated_conversations, "Label": translated_labels})

            translated_data.to_csv(
                translated_file_path,
                index=False,
                header=start_new_file,
                mode='w' if start_new_file else 'a',
                encoding='utf-8-sig',
            )
            start_new_file = False

            print(f"Saved rows {chunk_start + 1} to {chunk_end}.")
        except Exception as e:
            print(f"Error translating batch: {e}")

        # Randomized delay (5-15 sec) to prevent rate limiting
        time.sleep(random.uniform(5, 15))

if __name__ == "__main__":
    dataset_path = r"/content/data/fraud_call_dataset.csv"  # Adjust path

    if not os.path.isfile(dataset_path):
        raise FileNotFoundError(f"Dataset not found: {dataset_path}")

    try:
        # Read dataset with encoding fallback
        try:
            df = pd.read_csv(dataset_path, encoding='utf-8')
        except UnicodeDecodeError:
            print("UTF-8 failed, using ISO-8859-1...")
            df = pd.read_csv(dataset_path, encoding='ISO-8859-1')
    except Exception as e:
        print(f"Error loading dataset: {e}")
    else:
        # Translation failures are reported separately so they are not
        # mistaken for a problem with loading the dataset.
        dataset_name = "fraud_call_dataset"
        try:
            translate_dataset(dataset_name, df, "dataset", TRANSLATED_LANG)
        except Exception as e:
            print(f"Error translating dataset: {e}")

Total rows to translate: 20075
Resuming from row 4001...
Translating rows 4001 to 4500...
Batch translation failed (attempt 1): Server Error: You made too many requests to the server.According to google, you are allowed to make 5 requests per secondand up to 200k requests per day. You can wait and try again later oryou can try the translate_batch function
Batch translation failed (attempt 2): Server Error: You made too many requests to the server.According to google, you are allowed to make 5 requests per secondand up to 200k requests per day. You can wait and try again later oryou can try the translate_batch function


KeyboardInterrupt: 