## Language Translation Dataset Caching Notebook

Run this notebook to reproduce the cached non-English splits of the [CalibraGPT/Fact-Completion](https://huggingface.co/datasets/CalibraGPT/Fact-Completion)
dataset. The original sources are cited in the project's [README](https://github.com/daniel-furman/Capstone).

<a target="_blank" href="https://colab.research.google.com/github/daniel-furman/Polyglot-or-Not/blob/main/notebooks/dataset_caching_notebooks/dataset-preprocessing-language-translation.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>


## Colab + Drive setup

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so that paths under it (such as the
# output folder used when saving the translated parquets) are reachable.
drive.mount("/content/drive")

In [None]:
# Destination folder, under the mounted Drive, that the translated parquet
# files are copied into by the "Run" cell below.
DRIVE_FOLDER_OUT = "/content/drive/MyDrive/Colab Files/translated_data/"

## Dependencies

In [None]:
# Clone the project repository into the current directory, then install its
# pinned requirements. The install path assumes the clone landed in /content.
!git clone https://github.com/daniel-furman/Capstone.git
!pip install -r /content/Capstone/requirements.txt

In [None]:
import re
import datetime
import time
import pandas as pd
import os
import tqdm
from argparse import ArgumentParser
from argparse import Namespace

from datasets import load_dataset
from deep_translator import GoogleTranslator

In [None]:
# Show the languages GoogleTranslator supports, returned as a dict
# (as_dict=True); use it to pick the codes for the language list below.
GoogleTranslator().get_supported_languages(as_dict=True)

## Run

In [None]:
# Change into the cloned repo's dataset-caching script folder first, so that
# the helper module below can be imported by its bare name.
os.chdir("/content/Capstone/src/dataset_caching_scripts")
from language_translation_helper import main

In [None]:
# Translate the dataset into each requested language, one language at a time,
# then copy the resulting parquet file to Google Drive and read it back.
import shutil

# languages to translate sequentially
# args_lang_list = ['fr', 'es', 'de', 'uk', 'bg', 'ca', 'cs', 'da', 'hr', 'hu', 'it', 'nl', 'pl', 'pt', 'ro', 'ru', 'sl', 'sr', 'sv']

args_lang_list = ["fr"]

# Create the Drive destination up front: a missing folder would otherwise only
# surface after main() has finished, when the copy step fails.
os.makedirs(DRIVE_FOLDER_OUT, exist_ok=True)

for arg_lang in args_lang_list:
    # Timestamp this run; it becomes part of the output filename.
    now = datetime.datetime.now()
    dt_string = now.strftime("%d-%m-%Y-%H-%M-%S")

    args = Namespace(
        # ISO code of the target language; the supported options are listed by
        # GoogleTranslator().get_supported_languages() in the cell above
        language=arg_lang,
        datetime=dt_string,
        # start and end indices; can range from 0 to 26254
        start_index=0,
        end_index=6563,
    )

    print("\n", arg_lang, ": \n")
    main(args)

    # Build the filename once so the local and Drive paths cannot drift apart.
    # NOTE(review): presumably main() writes its output to /content under this
    # exact name -- confirm against language_translation_helper.
    drive_filename = (
        f"{args.language}-fact-completion-{args.datetime}"
        f"-startindex-{args.start_index}-endindex-{args.end_index}.parquet"
    )
    local_path = os.path.join("/content", drive_filename)
    drive_path = os.path.join(DRIVE_FOLDER_OUT, drive_filename)

    # shutil.copy raises if the source is missing or the copy fails, whereas
    # the previous `!cp` shell call would not stop the cell on failure.
    shutil.copy(local_path, drive_path)

    # Read the copy back from Drive to confirm it was saved correctly.
    print(pd.read_parquet(drive_path))