In [1]:
import os
import lzma
import datasets
import itertools

import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.model_selection import train_test_split

## Sample from the CommonCrawl data (minus English)

In [2]:
input_dir = '/shared/4/datasets/CommonCrawl/'
output_dir = '/shared/3/projects/hiatus/multilingual/'
code_to_language = {
    'af': 'Afrikaans',
    'am': 'Amharic',
    'ar': 'Arabic',
    'as': 'Assamese',
    'az': 'Azerbaijani',
    'be': 'Belarusian',
    'bg': 'Bulgarian',
    'bn': 'Bengali',
    'br': 'Breton',
    'bs': 'Bosnian',
    'ca': 'Catalan',
    'cs': 'Czech',
    'cy': 'Welsh',
    'da': 'Danish',
    'de': 'German',
    'el': 'Greek',
    'eo': 'Esperanto',
    'es': 'Spanish',
    'et': 'Estonian',
    'eu': 'Basque',
    'fa': 'Persian',
    'fi': 'Finnish',
    'fr': 'French',
    'fy': 'Western Frisian',
    'ga': 'Irish',
    'gd': 'Scottish Gaelic',
    'gl': 'Galician',
    'gu': 'Gujarati',
    'ha': 'Hausa',
    'he': 'Hebrew',
    'hi': 'Hindi',
    'hr': 'Croatian',
    'ht': 'Haitian',
    'hu': 'Hungarian',
    'hy': 'Armenian',
    'id': 'Indonesian',
    'ig': 'Igbo',
    'is': 'Icelandic',
    'it': 'Italian',
    'ja': 'Japanese',
    'jv': 'Javanese',
    'ka': 'Georgian',
    'kk': 'Kazakh',
    'km': 'Khmer',
    'kn': 'Kannada',
    'ko': 'Korean',
    'ku': 'Kurdish',
    'ky': 'Kyrgyz',
    'la': 'Latin',
    'lg': 'Ganda',
    'li': 'Limburgish',
    'ln': 'Lingala',
    'lo': 'Lao',
    'lt': 'Lithuanian',
    'lv': 'Latvian',
    'mg': 'Malagasy',
    'mk': 'Macedonian',
    'ml': 'Malayalam',
    'mn': 'Mongolian',
    'mr': 'Marathi',
    'ms': 'Malay',
    'my': 'Burmese',
    'ne': 'Nepali',
    'nl': 'Dutch',
    'no': 'Norwegian',
    'ns': 'Northern Sotho',
    'om': 'Oromo',
    'or': 'Odia',
    'pa': 'Punjabi',
    'pl': 'Polish',
    'ps': 'Pashto',
    'pt': 'Portuguese',
    'qu': 'Quechua',
    'rm': 'Romansh',
    'ro': 'Romanian',
    'ru': 'Russian',
    'sa': 'Sanskrit',
    'sc': 'Sardinian',
    'sd': 'Sindhi',
    'si': 'Sinhala',
    'sk': 'Slovak',
    'sl': 'Slovene',
    'so': 'Somali',
    'sq': 'Albanian',
    'sr': 'Serbian',
    'ss': 'Swati',
    'su': 'Sundanese',
    'sv': 'Swedish',
    'sw': 'Swahili',
    'ta': 'Tamil',
    'te': 'Telugu',
    'th': 'Thai',
    'tl': 'Tagalog',
    'tn': 'Tswana',
    'tr': 'Turkish',
    'ug': 'Uighur',
    'uk': 'Ukrainian',
    'ur': 'Urdu',
    'vi': 'Vietnamese',
    'wo': 'Wolof',
    'xh': 'Xhosa',
    'yi': 'Yiddish',
    'yo': 'Yoruba',
    'zu': 'Zulu',
}

len(code_to_language)

104

In [3]:
sample_per_language = 250000
language_rows = []

def get_samples():
    for code, language in tqdm(code_to_language.items()):
        fp = os.path.join(input_dir, f"{code}.txt.xz")
        with lzma.open(fp, "rt") as file:
            sample_count = 0
            for line in file:
                if len(line.strip()) > 100:
                    yield {
                        'code': code,
                        'language': language,
                        'text': line.strip()
                    }
                    sample_count += 1
                    if sample_count >= sample_per_language:
                        break

language_rows = get_samples()           
languages_df = pd.DataFrame(language_rows)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 104/104 [03:03<00:00,  1.76s/it]


In [4]:
languages_df = languages_df.sample(frac=1).reset_index(drop=True)
languages_df.head()

Unnamed: 0,code,language,text
0,ga,Irish,"b) cosaintí nach bhfuil de ghnáth ""tar éis tré..."
1,sw,Swahili,"Waandishi kutoka vyombo mbalimbali vya habari,..."
2,ta,Tamil,வன்னியில் வளம் மிக்க பிரதேசங்களைப் உயர் பாதுகா...
3,ja,Japanese,12月12日(火)雪が降っています。「私、雪が好きなの」とつぶやいているお年寄りがおられまし...
4,ms,Malay,ABDUL Khalid Ibrahim (tengah) bersama Exco Sel...


In [5]:
len(languages_df)

22766983

In [6]:
# Save the result
out_file = os.path.join(output_dir, 'language_text_samples.parquet.gzip')
languages_df.to_parquet(out_file, compression='gzip')

## Convert to dataset files

In [7]:
train, test = train_test_split(languages_df, test_size=0.15)
dev, test = train_test_split(test, test_size=0.66)
print(len(train), len(dev), len(test))

19351935 1161116 2253932


In [8]:
train_dataset = datasets.Dataset.from_pandas(train)
dev_dataset = datasets.Dataset.from_pandas(dev)
test_dataset = datasets.Dataset.from_pandas(test)

In [9]:
train_out = os.path.join(output_dir, 'train.jsonl')
dev_out = os.path.join(output_dir, 'dev.jsonl')
test_out = os.path.join(output_dir, 'test.jsonl')
print(train_out)

/shared/3/projects/hiatus/multilingual/train.jsonl


In [10]:
train_dataset.to_json(train_out, orient='records', lines=True)
dev_dataset.to_json(dev_out, orient='records', lines=True)
test_dataset.to_json(test_out, orient='records', lines=True)

Creating json from Arrow format:   0%|          | 0/1936 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/117 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/226 [00:00<?, ?ba/s]

1869878854

In [11]:
len(train_dataset)

19351935