In [1]:
import tensorflow as tf
from transformers import TFAutoModel ,AutoTokenizer, \
    TFBertForSequenceClassification, DataCollatorWithPadding
# import tensorflow_hub as hub
# import tensorflow_text as text
# from PIL import Image
import os
from datasets import load_dataset,Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

After reading, I think it is better to prepare the data in pandas first, in the following steps:
- Clean the data in pandas.
- Split the data and save each split to disk (as "csv", etc.).
- Then load the train, test and validation files as one whole dataset using load_dataset.
> Refer to the link below 

## **Dataset Library**

In [None]:
# https://huggingface.co/learn/nlp-course/chapter5/2?fw=tf#loading-a-local-dataset

In [2]:
# Load the cleaned reviews, discard the two helper columns, then drop every row
# that still has a missing value.
data = (
    pd.read_csv("data/cleaned_reviews.csv")
    .drop(columns=['cleaned_review_length', 'review_score'])
    .dropna(axis=0, how='any')
)

In [3]:
len(data)

17337

In [4]:
def percentage_splitter(test_percent:float,data:pd.DataFrame,label:str,train_percent=None,random_state:int=42):
    """Split ``data`` into two shuffled partitions, stratified on one column.

    Thin wrapper around ``train_test_split``.

    Args:
        test_percent: Forwarded as ``test_size``; the share of rows that goes to
            the second returned partition.
        data: Frame to split.
        label: Name of the column of ``data`` used for stratification.
        train_percent: Forwarded as ``train_size``; ``None`` leaves it to
            ``train_test_split``.
        random_state: Seed for the shuffle. Defaults to 42, the value that was
            previously hard-coded, so existing calls give the same split.

    Returns:
        The two-element result of ``train_test_split``: first partition, then
        the partition sized by ``test_percent``.

    Raises:
        KeyError: If ``label`` is not a column of ``data``.
    """
    return train_test_split(
        data,
        test_size=test_percent,
        train_size=train_percent,
        random_state=random_state,
        shuffle=True,
        stratify=data[label],
    )


In [5]:
def percent_check(x, y):
    """Return the length of ``x`` as a percentage of the length of ``y``.

    Both arguments only need to support ``len()``.

    Raises:
        ZeroDivisionError: If ``y`` is empty.
    """
    return (len(x) / len(y)) * 100

**Data splitting**

In [6]:
# Train-test split: the first returned partition becomes `train_data`; the
# partition sized by test_percent (30% of `data`) is kept as `kept_data` and is
# split again in the next cell.
train_data,kept_data  = percentage_splitter(test_percent=.30,
                                            data=data,
                                            label="sentiments")

In [7]:
# Test-val split of the hold-out (`kept_data`).
# The first returned partition (the remaining 85% of `kept_data`) becomes
# `val_data`; the partition sized by test_percent (15%) becomes `test_data`.
# NOTE(review): this makes the validation set far larger than the test set —
# confirm that this ordering is intended.
val_data,test_data = percentage_splitter(
                                        test_percent=.15,
                                        data=kept_data,
                                        label="sentiments")

In [8]:
# Report each partition's size as a percentage of the frame it was taken from.
print(
    "\n".join(
        f"{description}: {round(percent_check(part, whole))} %"
        for description, part, whole in (
            ("Train split with original dataset", train_data, data),
            ("Kept data split with original dataset", kept_data, data),
            ("Test data split with kept dataset", test_data, kept_data),
            ("Validation split with kept dataset", val_data, kept_data),
        )
    )
)

Train split with original dataset: 70 %
Kept data split with original dataset: 30 %
Test data split with kept dataset: 15 %
Validation split with kept dataset: 85 %


In [9]:
# Checking the split: the three partitions must be pairwise disjoint and
# together contain every row of `data`.
# `train_data + test_data + val_data` would align the frames on their index and
# add cell values, so its length is the size of the index union and can never
# reveal a row that landed in two partitions. Stacking the rows and inspecting
# the index does.
d = pd.concat([train_data, test_data, val_data])
assert not d.index.duplicated().any(), "a row appears in more than one split"
len(data) - len(d)

0

In [10]:
%%time
# Write each split to its own CSV file (without the row index); these are the
# paths that load_dataset reads further down.
val_data.to_csv("data/val.csv",index=False)
test_data.to_csv("data/test.csv",index=False)
train_data.to_csv("data/train.csv",index=False)
# # CPU times: total: 46.9 ms
# # Wall time: 250 ms

CPU times: total: 46.9 ms
Wall time: 250 ms


In [11]:
%%time
# Same export again, this time with chunksize=100, to compare the timing with
# the previous cell. It overwrites the same three files.
val_data.to_csv("data/val.csv",index=False,chunksize=100)
test_data.to_csv("data/test.csv",index=False,chunksize=100)
train_data.to_csv("data/train.csv",index=False,chunksize=100)
# # CPU times: total: 78.1 ms
# # Wall time: 143 ms

CPU times: total: 78.1 ms
Wall time: 143 ms


### **Loading the dataset using HuggingFace datasets**

In [12]:
%%time
# Split name -> CSV file written above; load_dataset combines them into a
# single DatasetDict with one entry per split.
data_dicts = dict(
    train="data/train.csv",
    validation="data/val.csv",
    test="data/test.csv",
)

raw_dataset = load_dataset("csv", data_files=data_dicts)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

CPU times: total: 109 ms
Wall time: 1.69 s


In [13]:
# Reload from here
# Working handle for the experiments below. This binds a second name to the
# same object as `raw_dataset`; no copy is made.
raw_dataset_manipulations = raw_dataset

In [363]:
# raw_dataset_manipulations

In [364]:
# print(raw_dataset_manipulations['train']['sentiments'][:10])
# print(raw_dataset_manipulations['validation']['sentiments'][:10])
# print(raw_dataset_manipulations['test']['sentiments'][:10])

**Selecting a data range**

In [365]:
# shuffled = raw_dataset_manipulations['train'].shuffle(seed=42).select(range(100))
# print(shuffled)
# print(shuffled["cleaned_review"][:2])

**Renaming columns**

In [366]:
# shuffled_col_renamed = shuffled.rename_columns(
#     {'sentiments':"labels", 
#     'cleaned_review':"reviews"}
# )
# # shuffled_col_renamed # It works on shuffled selected data range

In [367]:
# print(shuffled_col_renamed["reviews"][:2])

In [14]:
# Rename the CSV columns to the names used in the rest of the notebook.
# Deriving the result from `raw_dataset` (which still has the original column
# names) instead of from `raw_dataset_manipulations` itself makes this cell safe
# to re-run: the old form failed on a second run because the columns had
# already been renamed.
raw_dataset_manipulations = raw_dataset.rename_columns(
    {'sentiments':"labels",
    'cleaned_review':"reviews"}
)

In [15]:
raw_dataset_manipulations

DatasetDict({
    train: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 12135
    })
    validation: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 4421
    })
    test: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 781
    })
})

In [370]:
# raw_dataset_manipulations['train']['reviews'][:2]

In [16]:
# Append one row whose fields are all None to the train split, so that the
# missing-value handling below has something to act on.
add_none = dict.fromkeys(("labels", "reviews"))
raw_dataset_manipulations["train"] = raw_dataset_manipulations["train"].add_item(add_none)
raw_dataset_manipulations

DatasetDict({
    train: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 12136
    })
    validation: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 4421
    })
    test: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 781
    })
})

In [17]:
# Confirm that the last train row (the one appended above) holds None in BOTH
# columns. `(a and b) is None` only tests whichever operand `and` returns, so
# each field is compared with None explicitly.
last_train_row = raw_dataset_manipulations['train'][-1]
if last_train_row['labels'] is None and last_train_row['reviews'] is None:
    print("yes")

yes


In [18]:
### Replicated the Error as shown in the tutorial:
### https://huggingface.co/learn/nlp-course/chapter5/3?fw=tf
### Method-1
# The None review added for the experiment has no `.lower()`. The exception is
# the point of this cell, so it is caught and displayed rather than left to
# abort a "Restart & Run All".
try:
    raw_dataset_manipulations['train']['reviews'] = [
        review.lower() for review in raw_dataset_manipulations['train']['reviews']
    ]
except AttributeError as error:
    print(f"{type(error).__name__}: {error}")

AttributeError: 'NoneType' object has no attribute 'lower'

In [19]:
# Method-2
def lowercase_condition(example):
    """Return a ``reviews`` update holding the lower-cased review of one example."""
    return {"reviews": example["reviews"].lower()}

# Expected to fail while the None review is still in the train split; the error
# is caught and displayed so the rest of the notebook can still run.
try:
    raw_dataset_manipulations.map(lowercase_condition)
except AttributeError as error:
    print(f"{type(error).__name__}: {error}")

Map:   0%|          | 0/12136 [00:00<?, ? examples/s]

AttributeError: 'NoneType' object has no attribute 'lower'

In [20]:
# Method-3: more dynamic
def lowercase_condition(row, column):
    """Lower-case ``row[column]`` in place and return the row."""
    row[column] = row[column].lower()
    return row

# Apply the lowercase_condition function using map.
# Still expected to fail while the None review is present, so the error is
# caught and displayed; `raw_dataset_manipulations` is left unchanged in that case.
try:
    raw_dataset_manipulations = raw_dataset_manipulations.map(lambda x: lowercase_condition(x,"reviews"))
except AttributeError as error:
    print(f"{type(error).__name__}: {error}")
raw_dataset_manipulations

Map:   0%|          | 0/12136 [00:00<?, ? examples/s]

AttributeError: 'NoneType' object has no attribute 'lower'

In [21]:
# Drop every train row whose review text is missing (None).
raw_dataset_manipulations["train"] = raw_dataset_manipulations["train"].filter(
    lambda row: row["reviews"] is not None
)

Filter:   0%|          | 0/12136 [00:00<?, ? examples/s]

In [22]:
raw_dataset_manipulations

DatasetDict({
    train: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 12135
    })
    validation: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 4421
    })
    test: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 781
    })
})

In [23]:
# Method-2 again (this redefinition replaces the two-argument Method-3 version).
def lowercase_condition(example):
    """Build the update for one example: its ``reviews`` text in lower case."""
    lowered = example["reviews"].lower()
    return {"reviews": lowered}
# Lower-case the reviews of every split, now that the None row has been filtered out.
raw_dataset_manipulations = raw_dataset_manipulations.map(lowercase_condition)

Map:   0%|          | 0/12135 [00:00<?, ? examples/s]

Map:   0%|          | 0/4421 [00:00<?, ? examples/s]

Map:   0%|          | 0/781 [00:00<?, ? examples/s]

In [24]:
raw_dataset_manipulations['train']['reviews'][:3]

['absolutely love the sound',
 'i can hear everything even when my game is on loud do not buy for noise cancelling rated stars for false advertising',
 'pretty solid keyboard quiet and nice feel to it seems to be constructed well ']

**Creating columns**

In [25]:
def compute_review_length(example):
    """Count the whitespace-separated words in one example's review."""
    words = example["reviews"].split()
    return {"review_length": len(words)}
# Add the word count as a new column in every split, then show the structure,
# the row/column counts and the first three train rows.
raw_dataset_manipulations = raw_dataset_manipulations.map(compute_review_length)
print(
    raw_dataset_manipulations,
    f"Rows_no. :{raw_dataset_manipulations.num_rows}",
    f"Columns_no. :{raw_dataset_manipulations.num_columns}",
    raw_dataset_manipulations['train'][:3],
    sep="\n",
)

Map:   0%|          | 0/12135 [00:00<?, ? examples/s]

Map:   0%|          | 0/4421 [00:00<?, ? examples/s]

Map:   0%|          | 0/781 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'reviews', 'review_length'],
        num_rows: 12135
    })
    validation: Dataset({
        features: ['labels', 'reviews', 'review_length'],
        num_rows: 4421
    })
    test: Dataset({
        features: ['labels', 'reviews', 'review_length'],
        num_rows: 781
    })
})
Rows_no. :{'train': 12135, 'validation': 4421, 'test': 781}
Columns_no. :{'train': 3, 'validation': 3, 'test': 3}
{'labels': ['positive', 'neutral', 'positive'], 'reviews': ['absolutely love the sound', 'i can hear everything even when my game is on loud do not buy for noise cancelling rated stars for false advertising', 'pretty solid keyboard quiet and nice feel to it seems to be constructed well '], 'review_length': [4, 22, 14]}


> 🙋 An alternative way to add new columns to a dataset is with the **Dataset.add_column() function**. This allows you to provide the column as a Python list or NumPy array and can be handy in situations where Dataset.map() is not well suited for your analysis.

In [26]:
# Keeping only the rows with review_length > 30: `filter` keeps the rows for
# which the function returns True, so reviews of 30 words or fewer are dropped.
raw_dataset_manipulations_30 = raw_dataset_manipulations.filter(lambda x: x["review_length"] > 30)
print(f"Rows_no. :{raw_dataset_manipulations_30.num_rows}")
print(f"Columns_no. :{raw_dataset_manipulations_30.num_columns}")

Filter:   0%|          | 0/12135 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4421 [00:00<?, ? examples/s]

Filter:   0%|          | 0/781 [00:00<?, ? examples/s]

Rows_no. :{'train': 4078, 'validation': 1542, 'test': 253}
Columns_no. :{'train': 3, 'validation': 3, 'test': 3}


**Remove columns**

In [27]:
# Drop the helper column again and show the resulting structure, the
# row/column counts and the first three train rows.
raw_dataset_manipulations = raw_dataset_manipulations.remove_columns(['review_length'])
print(
    raw_dataset_manipulations,
    f"Rows_no. :{raw_dataset_manipulations.num_rows}",
    f"Columns_no. :{raw_dataset_manipulations.num_columns}",
    raw_dataset_manipulations['train'][:3],
    sep="\n",
)

DatasetDict({
    train: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 12135
    })
    validation: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 4421
    })
    test: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 781
    })
})
Rows_no. :{'train': 12135, 'validation': 4421, 'test': 781}
Columns_no. :{'train': 2, 'validation': 2, 'test': 2}
{'labels': ['positive', 'neutral', 'positive'], 'reviews': ['absolutely love the sound', 'i can hear everything even when my game is on loud do not buy for noise cancelling rated stars for false advertising', 'pretty solid keyboard quiet and nice feel to it seems to be constructed well ']}


**map() superpowers**

In [28]:
# Two more names for the same DatasetDict: one is used for the per-example map
# below, the other for the batched map.
maper_powers = raw_dataset_manipulations
maper_powers_batched = raw_dataset_manipulations

In [29]:
%%time
# Per-example map (no `batched=True`): the function receives ONE row at a time,
# so `x['reviews']` is a single string. Split that string directly — looping
# over it would walk its characters and produce a list of 0/1 values instead of
# a word count.
maper_powers = maper_powers.map(
    lambda x: {"reviews_length": len(x['reviews'].split())}
)
print(maper_powers.column_names)

# # CPU times: total: 1.16 s
# # Wall time: 2.55 s

Map:   0%|          | 0/12135 [00:00<?, ? examples/s]

Map:   0%|          | 0/4421 [00:00<?, ? examples/s]

Map:   0%|          | 0/781 [00:00<?, ? examples/s]

{'train': ['labels', 'reviews', 'reviews_length'], 'validation': ['labels', 'reviews', 'reviews_length'], 'test': ['labels', 'reviews', 'reviews_length']}
CPU times: total: 1.16 s
Wall time: 2.55 s


In [30]:
%%time
# Batched map: each call receives a batch in which `reviews` is a list of
# strings, and returns one word count per review in that batch.
map_batched = maper_powers_batched.map(
    lambda batch: {
        "reviews_length": list(map(lambda text: len(text.split()), batch["reviews"]))
    },
    batched=True,
)
print(map_batched.column_names)

# # CPU times: total: 46.9 ms
# # Wall time: 250 ms

Map:   0%|          | 0/12135 [00:00<?, ? examples/s]

Map:   0%|          | 0/4421 [00:00<?, ? examples/s]

Map:   0%|          | 0/781 [00:00<?, ? examples/s]

{'train': ['labels', 'reviews', 'reviews_length'], 'validation': ['labels', 'reviews', 'reviews_length'], 'test': ['labels', 'reviews', 'reviews_length']}
CPU times: total: 46.9 ms
Wall time: 250 ms


In [33]:
# Remove the length column from the batched result and show the structure, the
# row/column counts and the first three train rows.
map_batched = map_batched.remove_columns(['reviews_length'])
print(
    map_batched,
    f"Rows_no. :{map_batched.num_rows}",
    f"Columns_no. :{map_batched.num_columns}",
    map_batched['train'][:3],
    sep="\n",
)

DatasetDict({
    train: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 12135
    })
    validation: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 4421
    })
    test: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 781
    })
})
Rows_no. :{'train': 12135, 'validation': 4421, 'test': 781}
Columns_no. :{'train': 2, 'validation': 2, 'test': 2}
{'labels': ['positive', 'neutral', 'positive'], 'reviews': ['absolutely love the sound', 'i can hear everything even when my game is on loud do not buy for noise cancelling rated stars for false advertising', 'pretty solid keyboard quiet and nice feel to it seems to be constructed well ']}


In [36]:
# Load the model and the tokenizer from checkpoint directories given as paths
# relative to the notebook's working directory.
# model_checkpoint = "bert_base_cased"
# tokeniser_checkpoint = "bert_base_cased"
model_checkpoint = "../using_transformers/models/bert_base_cased"
tokeniser_checkpoint = "../using_transformers/tokenizers/bert_base_cased"
model = TFAutoModel.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(tokeniser_checkpoint)

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at ../using_transformers/models/bert_base_cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [38]:
def tokenize_function(example):
    """Run the notebook's ``tokenizer`` on the ``reviews`` field with truncation enabled."""
    review_text = example["reviews"]
    return tokenizer(review_text, truncation=True)

In [39]:
# Batch_size default is 1000
%time tokenized_datasets = map_batched.map(tokenize_function,batch_size=True)
# # CPU times: total: 2.88 s
# # Wall time: 6.27 s

Map:   0%|          | 0/12135 [00:00<?, ? examples/s]

Map:   0%|          | 0/4421 [00:00<?, ? examples/s]

Map:   0%|          | 0/781 [00:00<?, ? examples/s]

CPU times: total: 2.88 s
Wall time: 6.27 s


In [44]:
# Batched tokenization with an explicit batch size of 100 examples per call.
%time tokenized_datasets = map_batched.map(tokenize_function,batched=True,batch_size=100)
# # CPU times: total: 797 ms
# # Wall time: 2.16 s

Map:   0%|          | 0/12135 [00:00<?, ? examples/s]

Map:   0%|          | 0/4421 [00:00<?, ? examples/s]

Map:   0%|          | 0/781 [00:00<?, ? examples/s]

CPU times: total: 797 ms
Wall time: 2.16 s


In [49]:
print(tokenized_datasets.column_names)
print(tokenized_datasets.num_columns)

{'train': ['labels', 'reviews', 'input_ids', 'token_type_ids', 'attention_mask'], 'validation': ['labels', 'reviews', 'input_ids', 'token_type_ids', 'attention_mask'], 'test': ['labels', 'reviews', 'input_ids', 'token_type_ids', 'attention_mask']}
{'train': 5, 'validation': 5, 'test': 5}


Dataset.map() also has some parallelization capabilities of its own. Since they are not backed by Rust, they won’t let a slow tokenizer catch up with a fast one, but they can still be helpful (especially if you’re using a tokenizer that doesn’t have a fast version). To enable multiprocessing, use the num_proc argument and specify the number of processes to use in your call to Dataset.map():

>💡 In machine learning, an example is usually defined as the set of features that we feed to the model. In some contexts, these features will be the set of columns in a Dataset, but in others (like here and for question answering), multiple features can be extracted from a single example and belong to a single column.

Let’s have a look at how it works! Here we will tokenize our examples and truncate them to a maximum length of 128, but we will ask the tokenizer to return all the chunks of the texts instead of just the first one. This can be done with return_overflowing_tokens=True:

In [68]:
def tokenize_and_split(examples):
    """Tokenize ``reviews`` with ``max_length=4`` and also return the overflowing pieces."""
    tokenizer_options = dict(
        truncation=True,
        max_length=4,
        return_overflowing_tokens=True,
    )
    return tokenizer(examples["reviews"], **tokenizer_options)

In [69]:
map_batched['train'][0]

{'labels': 'positive', 'reviews': 'absolutely love the sound'}

In [70]:
result = tokenize_and_split(map_batched['train'][0])
print(result)

{'input_ids': [[101, 7284, 1567, 102], [101, 1103, 1839, 102]], 'token_type_ids': [[0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1]], 'overflow_to_sample_mapping': [0, 0]}


In [76]:
[inp for inp in result['input_ids']]

[[101, 7284, 1567, 102], [101, 1103, 1839, 102]]

In [75]:
# [tokenizer.convert_ids_to_tokens(inp) 
#     for inp in result['input_ids']]
# [['[CLS]', 'absolutely', 'love', '[SEP]'], ['[CLS]', 'the', 'sound', '[SEP]']]

In [77]:
[len(inp) for inp in result['input_ids']]

[4, 4]

In [78]:
[tokenizer.decode(inp)
    for inp in result['input_ids']]

['[CLS] absolutely love [SEP]', '[CLS] the sound [SEP]']

In the context of the Hugging Face Transformers library and tokenization, **return_overflowing_tokens** is a parameter that you can use when tokenizing text data. It controls whether the tokenization process returns both the tokens that fit within the model's maximum sequence length and the tokens that exceed this length.

When you tokenize a long text sequence with truncation enabled and it doesn't fit within the maximum sequence length, the sequence can be split into multiple smaller sequences (chunks) that each fit within the length constraint. The **return_overflowing_tokens** parameter determines whether the tokenizer should also return the additional chunks beyond the first one.

Here's how the parameter works:

If **return_overflowing_tokens** is set to False (the default), the tokenizer will only return the tokens that fit within the maximum sequence length. The overflowing tokens will be truncated, and the tokenizer will not provide them in the output.

If **return_overflowing_tokens** is set to True, the tokenizer will return both the tokens that fit within the maximum sequence length and the overflowing tokens. This can be useful if you want to perform additional processing on the overflowing tokens or analyze how the text was split into smaller segments.

In [79]:
# Applying to the whole dataset
def tokenize_and_split(examples):
    """Tokenize ``reviews`` with ``max_length=128`` and ``return_overflowing_tokens=True``.

    Redefines the earlier ``max_length=4`` demonstration version of this function.
    """
    return tokenizer(
        examples["reviews"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )

In [80]:
# Expected to fail: the tokenizer can return more rows than the batch it was
# given, while the original columns keep their old length (see the ArrowInvalid
# error shown below).
tokenised_data = map_batched.map(tokenize_and_split,batched=True)

Map:   0%|          | 0/12135 [00:00<?, ? examples/s]

ArrowInvalid: Column 2 named input_ids expected length 1000 but got length 1029

Oh no! That didn’t work! Why not? Looking at the error message will give us a clue: there is a mismatch in the lengths of one of the columns, one being of length 1,029 and the other of length 1,000. If you’ve looked at the Dataset.map() documentation, you may recall that it’s the number of samples passed to the function that we are mapping; here those 1,000 examples gave 1,029 new features, resulting in a shape error.

The problem is that we’re trying to mix two different datasets of different sizes: the map_batched columns will have a certain number of examples (the 1,000 in our error), but the tokenised_data we are building will have more (the 1,029 in the error message; it is more than 1,000 because we are tokenizing long reviews into more than one example by using return_overflowing_tokens=True). That doesn’t work for a Dataset, so we need to either remove the columns from the old dataset or make them the same size as they are in the new dataset. We can do the former with the remove_columns argument:

In [82]:
map_batched.column_names

{'train': ['labels', 'reviews'],
 'validation': ['labels', 'reviews'],
 'test': ['labels', 'reviews']}

In [85]:
# Drop the original columns while mapping, so that only the tokenizer output
# (whose row count can differ from the input) ends up in the result.
tokenised_data = map_batched.map(
    tokenize_and_split,
    remove_columns=map_batched["train"].column_names,
    batched=True,
)

Map:   0%|          | 0/12135 [00:00<?, ? examples/s]

Map:   0%|          | 0/4421 [00:00<?, ? examples/s]

Map:   0%|          | 0/781 [00:00<?, ? examples/s]

In [86]:
tokenised_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 12520
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 4588
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 793
    })
})

In [102]:
print(tokenised_data['train']['overflow_to_sample_mapping'][:4])
print(tokenised_data['train']['input_ids'][:3])

[0, 1, 2, 3]
[[101, 7284, 1567, 1103, 1839, 102], [101, 178, 1169, 2100, 1917, 1256, 1165, 1139, 1342, 1110, 1113, 4632, 1202, 1136, 4417, 1111, 4647, 19722, 1979, 6317, 2940, 1111, 6014, 6437, 102], [101, 2785, 4600, 9303, 3589, 1105, 3505, 1631, 1106, 1122, 3093, 1106, 1129, 3033, 1218, 102]]


In [96]:
map_batched['train']['reviews'][:3]

['absolutely love the sound',
 'i can hear everything even when my game is on loud do not buy for noise cancelling rated stars for false advertising',
 'pretty solid keyboard quiet and nice feel to it seems to be constructed well ']

In [95]:
# Decode the first three token-id sequences back to text. The loop variable is
# named `token_ids` so that the builtin `id` is not shadowed.
[tokenizer.decode(token_ids) for token_ids in tokenised_data['train']['input_ids'][:3]]

['[CLS] absolutely love the sound [SEP]',
 "[CLS] i can hear everything even when my game is on loud don't buy for noise cancelling rated stars for false advertising [SEP]",
 '[CLS] pretty solid keyboard quiet and nice feel to it seems to be constructed well [SEP]']

Now this works without error. We can check that our new dataset has many more elements than the original dataset by comparing the lengths:

In [87]:
len(tokenised_data["train"]), len(map_batched["train"])

(12520, 12135)

We mentioned that we can also deal with the mismatched length problem by making the old columns the same size as the new ones. To do this, we will need the overflow_to_sample_mapping field the tokenizer returns when we set return_overflowing_tokens=True. It gives us a mapping from a new feature index to the index of the sample it originated from. Using this, we can associate each key present in our original dataset with a list of values of the right size by repeating the values of each example as many times as it generates new features:

In [106]:
def tokenize_and_split(examples):
    """Tokenize ``reviews`` (``max_length=128``, overflow returned) and keep the original columns.

    Every column of ``examples`` is repeated once per row the tokenizer produced
    from the corresponding example, using ``overflow_to_sample_mapping``, which is
    removed from the returned encoding.
    """
    encoded = tokenizer(
        examples["reviews"],
        max_length=128,
        truncation=True,
        return_overflowing_tokens=True,
    )
    # For each produced row, the index of the input example it came from.
    source_indices = encoded.pop("overflow_to_sample_mapping")
    for column, column_values in examples.items():
        encoded[column] = [column_values[index] for index in source_indices]
    return encoded

In [107]:
# No remove_columns here: tokenize_and_split now returns the original columns
# itself, already repeated to match the new row count.
tokenised_data = map_batched.map(tokenize_and_split,
                batched=True
                )

Map:   0%|          | 0/12135 [00:00<?, ? examples/s]

Map:   0%|          | 0/4421 [00:00<?, ? examples/s]

Map:   0%|          | 0/781 [00:00<?, ? examples/s]

In [108]:
tokenised_data

DatasetDict({
    train: Dataset({
        features: ['labels', 'reviews', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 12520
    })
    validation: Dataset({
        features: ['labels', 'reviews', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4588
    })
    test: Dataset({
        features: ['labels', 'reviews', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 793
    })
})

In [109]:
len(tokenised_data["train"]), len(map_batched["train"])

(12520, 12135)

In [110]:
print(tokenised_data['train']['input_ids'][:3])
print(tokenised_data['train']['reviews'][:3])

[[101, 7284, 1567, 1103, 1839, 102], [101, 178, 1169, 2100, 1917, 1256, 1165, 1139, 1342, 1110, 1113, 4632, 1202, 1136, 4417, 1111, 4647, 19722, 1979, 6317, 2940, 1111, 6014, 6437, 102], [101, 2785, 4600, 9303, 3589, 1105, 3505, 1631, 1106, 1122, 3093, 1106, 1129, 3033, 1218, 102]]
['absolutely love the sound', 'i can hear everything even when my game is on loud do not buy for noise cancelling rated stars for false advertising', 'pretty solid keyboard quiet and nice feel to it seems to be constructed well ']


We get the same number of training features as before, but here we’ve kept all the old fields. If you need them for some post-processing after applying your model, you might want to use this approach.

You’ve now seen how 🤗 Datasets can be used to preprocess a dataset in various ways. Although the processing functions of 🤗 Datasets will cover most of your model training needs, there may be times when you’ll need to switch to Pandas to access more powerful features, like DataFrame.groupby() or high-level APIs for visualization. Fortunately, 🤗 Datasets is designed to be interoperable with libraries such as Pandas, NumPy, PyTorch, TensorFlow, and JAX. Let’s take a look at how this works

**From Datasets to DataFrames and back**

In [112]:
# NOTE: this binds a second name to the same DatasetDict as `map_batched`; it
# is not a copy.
map_batched_frames = map_batched 
print(map_batched_frames)

DatasetDict({
    train: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 12135
    })
    validation: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 4421
    })
    test: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 781
    })
})


In [114]:
# `set_format` would change the object in place, and `map_batched_frames` is the
# same object as `map_batched`, so the tokenization cells above would see
# DataFrames on a re-run. `with_format` returns a new, pandas-formatted
# DatasetDict and leaves `map_batched` untouched.
map_batched_frames = map_batched_frames.with_format("pandas")
map_batched_frames

DatasetDict({
    train: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 12135
    })
    validation: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 4421
    })
    test: Dataset({
        features: ['labels', 'reviews'],
        num_rows: 781
    })
})

>🚨 Under the hood, Dataset.set_format() changes the return format for the dataset’s __getitem__() dunder method. This means that when we want to create a new object like train_df from a Dataset in the "pandas" format, we need to slice the whole dataset to obtain a pandas.DataFrame. You can verify for yourself that the type of map_batched_frames["train"] is Dataset, irrespective of the output format.

In [116]:
map_batched_frames['train'][:3]

Unnamed: 0,labels,reviews
0,positive,absolutely love the sound
1,neutral,i can hear everything even when my game is on ...
2,positive,pretty solid keyboard quiet and nice feel to i...


In [118]:
# With the pandas output format active, slicing the whole train split returns
# its rows as a pandas DataFrame.
train_df = map_batched_frames['train'][:]
train_df

Unnamed: 0,labels,reviews
0,positive,absolutely love the sound
1,neutral,i can hear everything even when my game is on ...
2,positive,pretty solid keyboard quiet and nice feel to i...
3,neutral,i have one and my grandson liked it so got him...
4,positive,i got this to better hear players in ps titles...
...,...,...
12130,positive,i love my speaker this was actually my second ...
12131,positive,love my sound box
12132,positive,i chose this ergonomic mouse because liked the...
12133,neutral,i been using the speaker for while and haven e...


In [121]:
# Count how many training reviews carry each label.
# The index and the count column are named explicitly, so the result does not
# depend on the names `value_counts()` assigns. Those changed in pandas 2.0,
# where the previous `rename(columns={"index": ..., "labels": ...})` would
# rename the wrong column.
freq = (
    train_df['labels'].value_counts()
    .rename_axis("labels")
    .reset_index(name="labels_counts")
)
freq.head()

Unnamed: 0,labels,labels_counts
0,positive,6651
1,neutral,4410
2,negative,1074


In [122]:
# Convert the frequency table back into a Dataset.
# (`Dataset` is already imported in the first cell; this re-import is redundant
# but harmless.)
from datasets import Dataset
freq_dataset = Dataset.from_pandas(freq)
freq_dataset

Dataset({
    features: ['labels', 'labels_counts'],
    num_rows: 3
})