In [64]:
from datasets import load_dataset, get_dataset_config_names, get_dataset_config_info, interleave_datasets
from pprint import pprint

### **Question-1**
Link to the dataset: https://huggingface.co/datasets/ai4bharat/naamapadam

In [13]:
# Count the configurations published for the dataset.
config_names = get_dataset_config_names('ai4bharat/naamapadam')
len(config_names)

11

In [15]:
# Show the metadata recorded for the train split of the 'hi' configuration.
hi_info = get_dataset_config_info('ai4bharat/naamapadam', 'hi')
hi_train_split = hi_info.splits['train']
pprint(hi_train_split)

SplitInfo(name='train',
          num_bytes=529397194,
          num_examples=985787,
          shard_lengths=[931000, 54787],
          dataset_name='naamapadam')


In [16]:
# Print the complete info object of the 'ta' configuration.
ta_info = get_dataset_config_info('ai4bharat/naamapadam', 'ta')
pprint(ta_info)

DatasetInfo(description='\n',
            citation='\n',
            homepage='https://indicnlp.ai4bharat.org/',
            license='Creative Commons Attribution-NonCommercial 4.0 '
                    'International Public License',
            features={'ner_tags': Sequence(feature=ClassLabel(names=['O',
                                                                     'B-PER',
                                                                     'I-PER',
                                                                     'B-ORG',
                                                                     'I-ORG',
                                                                     'B-LOC',
                                                                     'I-LOC'],
                                                              id=None),
                                           length=-1,
                                           id=None),
                      'tokens': Sequence(fe

In [18]:
# Print only the feature schema of the 'hi' configuration.
hi_features = get_dataset_config_info('ai4bharat/naamapadam', 'hi').features
pprint(hi_features)

{'ner_tags': Sequence(feature=ClassLabel(names=['O',
                                                'B-PER',
                                                'I-PER',
                                                'B-ORG',
                                                'I-ORG',
                                                'B-LOC',
                                                'I-LOC'],
                                         id=None),
                      length=-1,
                      id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}


### **Question-2**

In [54]:
# Load every split of the 'ta' configuration.
ds = load_dataset(path='ai4bharat/naamapadam', name='ta')

In [20]:
# Cache file locations reported for each split of the loaded dataset.
cache_files_by_split = ds.cache_files
cache_files_by_split

{'train': [{'filename': 'C:\\Users\\gagan\\.cache\\huggingface\\datasets\\ai4bharat___naamapadam\\ta\\1.0.0\\9d4f21ac57d11ed4f9ea64854fdc9f5618e61acc\\naamapadam-train.arrow'}],
 'test': [{'filename': 'C:\\Users\\gagan\\.cache\\huggingface\\datasets\\ai4bharat___naamapadam\\ta\\1.0.0\\9d4f21ac57d11ed4f9ea64854fdc9f5618e61acc\\naamapadam-test.arrow'}],
 'validation': [{'filename': 'C:\\Users\\gagan\\.cache\\huggingface\\datasets\\ai4bharat___naamapadam\\ta\\1.0.0\\9d4f21ac57d11ed4f9ea64854fdc9f5618e61acc\\naamapadam-validation.arrow'}]}

### **Question-3**

In [35]:
# Convert the size reported for the 'ta' configuration from bytes to MB
# (1 MB is taken as 1024 * 1024 bytes here).
BYTES_PER_MB = 1024 * 1024
ta_config_info = get_dataset_config_info('ai4bharat/naamapadam', 'ta')
bytes_size_ds = ta_config_info.size_in_bytes
size_in_mb = bytes_size_ds / BYTES_PER_MB

print(f"Size in MB: {size_in_mb:.2f}")

Size in MB: 226.22


### **Question-4**

In [40]:
# Number of examples recorded for the train split of the 'ta' configuration.
ta_train_split = get_dataset_config_info(path='ai4bharat/naamapadam', config_name='ta').splits['train']
ta_train_split.num_examples

497882

### **Question-5**

In [41]:
# Show the splits, their features and their row counts.
pprint(ds)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 497882
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 758
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2795
    })
})


In [43]:
def count_tokens(x):
    """Record how many tokens an example holds under the 'num_tokens' key.

    Args:
        x: A mapping with a 'tokens' entry that supports ``len``.

    Returns:
        The same mapping object, updated in place with 'num_tokens'.
    """
    token_total = len(x['tokens'])
    x['num_tokens'] = token_total
    return x

# Add a per-example 'num_tokens' column to every split.
ds_mapped = ds.map(count_tokens)
# Inspect the mapped result. The original cell printed `ds`, which is not the
# object the result of `map` was assigned to.
pprint(ds_mapped)

Map:   0%|          | 0/497882 [00:00<?, ? examples/s]

Map:   0%|          | 0/758 [00:00<?, ? examples/s]

Map:   0%|          | 0/2795 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'num_tokens'],
        num_rows: 497882
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'num_tokens'],
        num_rows: 758
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'num_tokens'],
        num_rows: 2795
    })
})


In [None]:
# Per-split 'num_tokens' columns produced by the mapping step.
train_tok = ds_mapped['train']['num_tokens']
test_tok = ds_mapped['test']['num_tokens']
validation_tok = ds_mapped['validation']['num_tokens']

# Total tokens per split; the built-in sum replaces three hand-written
# accumulation loops.
train_tokens = sum(train_tok)
test_tokens = sum(test_tok)
validation_tokens = sum(validation_tok)

print(f"Total tokens in train: {train_tokens}")
print(f"Total tokens in test: {test_tokens}")
print(f"Total tokens in validation: {validation_tokens}")

# Grand total across all three splits, reported in whole millions.
total_tokens = train_tokens + test_tokens + validation_tokens
print(f"Total tokens (in millions): {round(total_tokens/1000000, 0)} million")

Total tokens in train: 5959032
Total tokens in test: 9528
Total tokens in validation: 33316
Total tokens (in millions): 6.0 million


### **Question-7**

In [55]:
# Load the train, test and validation splits of the 'ta' configuration as a
# single dataset.
ds_merge = load_dataset(
    path='ai4bharat/naamapadam',
    name='ta',
    split='train+test+validation',
)

In [56]:
# Create the 'text' column
def create_text(example):
    """Join the example's tokens with single spaces into a 'text' entry.

    Args:
        example: A mapping whose 'tokens' entry is an iterable of strings.

    Returns:
        The same mapping object, updated in place with 'text'.
    """
    joined_tokens = ' '.join(example['tokens'])
    example['text'] = joined_tokens
    return example

# Build the 'text' column, then drop the 'ner_tags' and 'tokens' columns.
# NOTE(review): this rebinds `ds`, which earlier held the object returned by
# load_dataset without a split; cells that expect that object must not be
# re-run after this one.
columns_to_drop = ['ner_tags', 'tokens']
ds = ds_merge.map(create_text).remove_columns(columns_to_drop)

Map:   0%|          | 0/501435 [00:00<?, ? examples/s]

In [59]:
# Confirm which columns remain and how many rows the merged dataset has.
pprint(ds)

Dataset({
    features: ['text'],
    num_rows: 501435
})


### **Question-9**

In [60]:
# Minimum number of space-separated words an example must contain.
num_words = 6


def _has_min_words(example):
    """Return True when 'text' splits on single spaces into >= num_words parts."""
    word_parts = example['text'].split(' ')
    return len(word_parts) >= num_words


ds_filtered = ds.filter(_has_min_words)
print(ds_filtered)

Filter:   0%|          | 0/501435 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 370495
})


### **Question-10**

In [61]:
# Load the train, test and validation splits of the 'inltkh.ta' configuration
# as a single dataset.
indic_glue_tamil = load_dataset(
    path='ai4bharat/indic_glue',
    name='inltkh.ta',
    split='train+test+validation',
)

README.md:   0%|          | 0.00/49.5k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train-00000-of-00001.parquet:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/124k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/126k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5346 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/669 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/669 [00:00<?, ? examples/s]

In [62]:
# Peek at the first example of the merged dataset.
indic_glue_tamil[0]

{'text': 'கே.வி.ஆனந்தே ட்விட்டரில் இதை அறிவித்துள்ளார். இந்தப் படத்துக்கு கேவ்மிக் ஆரி ஒளிப்பதிவு செய்ய, ஹாரிஸ் ஜெயராஜ் இசையமைக்கிறார். பட்டுக்கோட்டை பிரபாகர் வசனம் எழுத, கலை இயக்குநராக கிரண் பணியாற்றுகிறார். இந்தப் படத்தை லைகா புரொடக்\u200cஷன்ஸ் நிறுவனம் தயாரிக்கிறது.',
 'label': 6}

In [63]:
# Minimum number of space-separated words an example must contain.
num_words = 6


def _glue_has_min_words(example):
    """Return True when 'text' splits on single spaces into >= num_words parts."""
    # NOTE(review): split(' ') also counts the empty strings produced by
    # consecutive spaces - confirm that this is the intended word count.
    word_parts = example['text'].split(' ')
    return len(word_parts) >= num_words


indic_glue_tamil_filtered = indic_glue_tamil.filter(_glue_has_min_words)
print(indic_glue_tamil_filtered)

Filter:   0%|          | 0/6684 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 6428
})


In [65]:
# Interleave the two filtered datasets with sampling probabilities 0.8 and 0.2
# and a fixed seed.
sources_to_mix = [ds_filtered, indic_glue_tamil_filtered]
inter_ds = interleave_datasets(
    sources_to_mix,
    probabilities=[0.8, 0.2],
    seed=42,
)
print(inter_ds)

Dataset({
    features: ['text', 'label'],
    num_rows: 32354
})
