In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/xtremepanx/test.nl
/kaggle/input/xtremepanx/test.vi
/kaggle/input/xtremepanx/test.fr
/kaggle/input/xtremepanx/dev.zh
/kaggle/input/xtremepanx/test.zh
/kaggle/input/xtremepanx/dev.nl
/kaggle/input/xtremepanx/train.zh
/kaggle/input/xtremepanx/train.vi
/kaggle/input/xtremepanx/dev.en
/kaggle/input/xtremepanx/dev.vi
/kaggle/input/xtremepanx/dev.fr
/kaggle/input/xtremepanx/train.fr
/kaggle/input/xtremepanx/test.en
/kaggle/input/xtremepanx/train.nl
/kaggle/input/xtremepanx/train.en


In [2]:
# Work from the dataset folder so later cells can open the split files by bare name (e.g. 'train.vi').
%cd /kaggle/input/xtremepanx

/kaggle/input/xtremepanx


In [3]:
from datasets import Dataset, ClassLabel, Value, Features, Sequence

def convert_to_hf_dataset(input_file, class_names):
    """Parse a token-per-line NER file into a Hugging Face ``Dataset``.

    Every non-blank line must have the form ``<lang>:<token><TAB><tag>``.
    Blank lines separate sentences; runs of blank lines (or a blank line at
    the start of the file) do not produce empty sentences.

    Args:
        input_file: Path of the UTF-8 encoded file to read.
        class_names: Ordered tag names used to build the ``ClassLabel``
            feature of the ``ner_tags`` column.

    Returns:
        A ``Dataset`` with one row per sentence and two columns:
        ``tokens`` (sequence of strings) and ``ner_tags`` (sequence of
        ``ClassLabel``). An input without any token lines yields an empty
        dataset with the same features.

    Raises:
        ValueError: If a non-blank line lacks the ``:`` separator or does
            not contain exactly one tab between token and tag.
    """
    token_rows = []
    tag_rows = []
    current_tokens = []
    current_tags = []

    # Stream the file line by line instead of materialising it with readlines().
    with open(input_file, 'r', encoding='utf-8') as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()

            if not line:
                # Sentence boundary: only flush when something was collected,
                # so repeated blank lines never create empty rows.
                if current_tokens:
                    token_rows.append(current_tokens)
                    tag_rows.append(current_tags)
                    current_tokens = []
                    current_tags = []
                continue

            try:
                # Split on the first ':' only; the token itself may contain ':'.
                _language, token_tag = line.split(':', 1)
                token, tag = token_tag.split('\t')
            except ValueError as exc:
                raise ValueError(
                    f"{input_file}:{line_number}: expected '<lang>:<token>\\t<tag>', "
                    f"got {raw_line!r}"
                ) from exc

            current_tokens.append(token)
            current_tags.append(tag)

    # The file may end without a trailing blank line.
    if current_tokens:
        token_rows.append(current_tokens)
        tag_rows.append(current_tags)

    # String tags are encoded to integer ids through this ClassLabel feature.
    features = Features({
        'tokens': Sequence(Value('string')),
        'ner_tags': Sequence(ClassLabel(names=class_names)),
    })

    # Columns are named explicitly so an input with no sentences still works.
    return Dataset.from_dict(
        {'tokens': token_rows, 'ner_tags': tag_rows},
        features=features,
    )

# NER label inventory passed to convert_to_hf_dataset, which builds the
# ClassLabel feature for the 'ner_tags' column from it.
class_names = [
    'O',
    'B-PER', 'I-PER',
    'B-ORG', 'I-ORG',
    'B-LOC', 'I-LOC',
]

# Optional single-file conversion and save (left disabled):
#dataset_3 = convert_to_hf_dataset('test.vi', class_names)
#dataset_3.save_to_disk('/kaggle/working/output_dataset_2')


In [4]:
from datasets import load_dataset, DatasetDict

# Tag names handed to convert_to_hf_dataset for every split.
class_names = [
    'O',
    'B-PER', 'I-PER',
    'B-ORG', 'I-ORG',
    'B-LOC', 'I-LOC',
]

# Convert the three Vietnamese files in train / dev / test order.
train_dataset, dev_dataset, test_dataset = (
    convert_to_hf_dataset(path, class_names)
    for path in ('train.vi', 'dev.vi', 'test.vi')
)

# Group the splits; the dev file is exposed under the 'validation' key.
dataset_dict = DatasetDict(dict(
    train=train_dataset,
    validation=dev_dataset,
    test=test_dataset,
))


In [5]:
# Show the Vietnamese splits with their feature names and row counts.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 10000
    })
})

In [6]:
# NOTE(review): repeats the %cd issued in an earlier cell; presumably kept so the
# multi-language section below can be run on its own — confirm before removing.
%cd /kaggle/input/xtremepanx

/kaggle/input/xtremepanx


In [7]:
from datasets import DatasetDict
from collections import defaultdict

# Language codes to convert; each needs train.<lang>, dev.<lang> and test.<lang>
# in the current working directory.
languages = ['en', 'vi', 'fr', 'nl', 'zh']  # Add more languages as needed

# Split name used inside each DatasetDict -> file-name prefix on disk.
# The validation split is read from the 'dev.<lang>' file.
split_file_prefixes = {'train': 'train', 'validation': 'dev', 'test': 'test'}

# One DatasetDict per language code.
# NOTE(review): with defaultdict(DatasetDict), looking up a language that was
# never converted returns an empty DatasetDict instead of raising KeyError;
# a plain dict may be preferable if nothing relies on that.
all_datasets = defaultdict(DatasetDict)

for lang in languages:
    # Build the three splits in train / validation / test order without
    # rebinding the train_dataset / dev_dataset / test_dataset names that the
    # single-language cell above created.
    all_datasets[lang] = DatasetDict({
        split: convert_to_hf_dataset(f'{prefix}.{lang}', class_names)
        for split, prefix in split_file_prefixes.items()
    })



In [8]:
# Show every language's DatasetDict with its splits and row counts.
all_datasets

defaultdict(datasets.dataset_dict.DatasetDict,
            {'en': DatasetDict({
                 train: Dataset({
                     features: ['tokens', 'ner_tags'],
                     num_rows: 20000
                 })
                 validation: Dataset({
                     features: ['tokens', 'ner_tags'],
                     num_rows: 10000
                 })
                 test: Dataset({
                     features: ['tokens', 'ner_tags'],
                     num_rows: 10000
                 })
             }),
             'vi': DatasetDict({
                 train: Dataset({
                     features: ['tokens', 'ner_tags'],
                     num_rows: 20000
                 })
                 validation: Dataset({
                     features: ['tokens', 'ner_tags'],
                     num_rows: 10000
                 })
                 test: Dataset({
                     features: ['tokens', 'ner_tags'],
                     num_rows: 10000
      

In [9]:
# Write each language's DatasetDict to its own '<lang>_datasets' folder
# below /kaggle/working/save.
for lang in all_datasets:
    all_datasets[lang].save_to_disk(f'/kaggle/working/save/{lang}_datasets')

In [10]:
# Bundle the saved datasets into a single archive. Because an absolute source path is
# passed, entries are stored under kaggle/working/save/... inside the zip (see the
# "adding:" lines in the output).
!zip -r /kaggle/working/save.zip /kaggle/working/save

  adding: kaggle/working/save/ (stored 0%)
  adding: kaggle/working/save/vi_datasets/ (stored 0%)
  adding: kaggle/working/save/vi_datasets/dataset_dict.json (deflated 5%)
  adding: kaggle/working/save/vi_datasets/validation/ (stored 0%)
  adding: kaggle/working/save/vi_datasets/validation/dataset.arrow (deflated 75%)
  adding: kaggle/working/save/vi_datasets/validation/dataset_info.json (deflated 65%)
  adding: kaggle/working/save/vi_datasets/validation/state.json (deflated 40%)
  adding: kaggle/working/save/vi_datasets/train/ (stored 0%)
  adding: kaggle/working/save/vi_datasets/train/dataset.arrow (deflated 75%)
  adding: kaggle/working/save/vi_datasets/train/dataset_info.json (deflated 65%)
  adding: kaggle/working/save/vi_datasets/train/state.json (deflated 40%)
  adding: kaggle/working/save/vi_datasets/test/ (stored 0%)
  adding: kaggle/working/save/vi_datasets/test/dataset.arrow (deflated 75%)
  adding: kaggle/working/save/vi_datasets/test/dataset_info.json (deflated 65%)
  addi

In [1]:
# Switch to the folder written by save_to_disk so the relative '<lang>_datasets'
# paths used when reloading resolve.
%cd /kaggle/working/save

/kaggle/working/save


In [2]:
from datasets import DatasetDict
from collections import defaultdict


In [3]:
# Language codes whose '<lang>_datasets' folders are reloaded below.
languages = [
    'en',
    'vi',
    'fr',
    'nl',
    'zh',
]

In [17]:
# Reload every language's DatasetDict from its '<lang>_datasets' folder
# (paths are relative to the current working directory).
loaded_datasets2 = defaultdict(
    DatasetDict,
    ((lang, DatasetDict.load_from_disk(f'{lang}_datasets')) for lang in languages),
)


In [18]:
# Show the reloaded DatasetDicts with their splits and row counts.
loaded_datasets2

defaultdict(datasets.dataset_dict.DatasetDict,
            {'en': DatasetDict({
                 train: Dataset({
                     features: ['tokens', 'ner_tags'],
                     num_rows: 20000
                 })
                 validation: Dataset({
                     features: ['tokens', 'ner_tags'],
                     num_rows: 10000
                 })
                 test: Dataset({
                     features: ['tokens', 'ner_tags'],
                     num_rows: 10000
                 })
             }),
             'vi': DatasetDict({
                 train: Dataset({
                     features: ['tokens', 'ner_tags'],
                     num_rows: 20000
                 })
                 validation: Dataset({
                     features: ['tokens', 'ner_tags'],
                     num_rows: 10000
                 })
                 test: Dataset({
                     features: ['tokens', 'ner_tags'],
                     num_rows: 10000
      

# Examine the tag distribution

In [20]:
for lang in languages:
    lang_splits = loaded_datasets2[lang]
    # Feature object of the 'ner_tags' items; its int2str turns a tag id into a tag name.
    tags = lang_splits["train"].features["ner_tags"].feature

    def create_tag_names(example):
        """Return a ``ner_tags_str`` field with the tag name of every tag id in ``example``."""
        tag_ids = example["ner_tags"]
        return {"ner_tags_str": list(map(tags.int2str, tag_ids))}

    # Replace each split with a copy that carries the extra 'ner_tags_str' column.
    for split in list(lang_splits):
        lang_splits[split] = lang_splits[split].map(create_tag_names)


  0%|          | 0/20000 [00:00<?, ?ex/s]

  0%|          | 0/10000 [00:00<?, ?ex/s]

  0%|          | 0/10000 [00:00<?, ?ex/s]

  0%|          | 0/20000 [00:00<?, ?ex/s]

  0%|          | 0/10000 [00:00<?, ?ex/s]

  0%|          | 0/10000 [00:00<?, ?ex/s]

  0%|          | 0/20000 [00:00<?, ?ex/s]

  0%|          | 0/10000 [00:00<?, ?ex/s]

  0%|          | 0/10000 [00:00<?, ?ex/s]

  0%|          | 0/20000 [00:00<?, ?ex/s]

  0%|          | 0/10000 [00:00<?, ?ex/s]

  0%|          | 0/10000 [00:00<?, ?ex/s]

  0%|          | 0/20000 [00:00<?, ?ex/s]

  0%|          | 0/10000 [00:00<?, ?ex/s]

  0%|          | 0/10000 [00:00<?, ?ex/s]

In [33]:
# Counter is not imported by any other cell; without this line the cell
# raises NameError on a fresh kernel.
from collections import Counter

# (language, split) -> Counter mapping entity type to number of B- tags seen.
split2freqs = defaultdict(Counter)

# Iterate over each language and split
for lang in languages:
    for split, dataset in loaded_datasets2[lang].items():
        for row in dataset["ner_tags_str"]:
            for tag in row:
                # Only 'B-<TYPE>' tags are counted; 'O' and 'I-<TYPE>' tags are skipped.
                if tag.startswith("B-"):
                    tag_type = tag.split("-", 1)[1]
                    split2freqs[(lang, split)][tag_type] += 1

# Columns are (language, split) pairs, rows are entity types. A type missing
# from a split becomes 0, and astype(int) keeps the counts integral even when
# fillna had to fill a gap.
result_df = (
    pd.DataFrame.from_dict(split2freqs, orient="columns")
    .fillna(0)
    .astype(int)
)

# Display the result
print(result_df)


       en                     vi                     fr                   \
    train validation  test train validation  test train validation  test   
ORG  9422       4677  4745  7364       3621  3704  7764       3910  3885   
PER  9164       4635  4556  7470       3738  3884  8965       4406  4499   
LOC  9345       4834  4657  7588       3832  3717  9718       4840  4985   

       nl                     zh                   
    train validation  test train validation  test  
ORG  7778       3943  3908  7684       3950  3779  
PER  9308       4754  4684  7879       3845  3899  
LOC  9964       4835  5133  8572       4222  4371  
