# Data Exploration

## Setup & Imports

In [1]:
import os
from dotenv import load_dotenv
from huggingface_hub import login
from twitter_emoji_reaction_lora.data import load_emoji_dataset, tokenize_and_format

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Populate the process environment via python-dotenv before the token is read.
load_dotenv()

# Authenticate with the Hugging Face Hub. The token is looked up in the
# environment so that the secret itself never appears in the notebook.
login(token=os.environ.get("HUGGINGFACE_TOKEN"))

## Download and Inspect the Dataset

In [3]:
# Fetch the emoji dataset through the project helper; `ds` is the object
# inspected in the following cells and later passed to tokenisation.
ds = load_emoji_dataset()

In [4]:
# Show the split layout (train / test / validation), feature names and row counts.
# NOTE(review): the test split (50000 rows) is larger than train (45000) —
# presumably inherited from the upstream dataset; confirm this is intended.
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})

In [5]:
# Look at the training split on its own to confirm its features and size.
ds["train"]

Dataset({
    features: ['text', 'label'],
    num_rows: 45000
})

## Tokenise the Text and Reformat the Label Column

In [6]:
# Run the project helper over every split. Judging by the schema displayed
# afterwards, it adds `input_ids` and `attention_mask` and renames `label`
# to `labels`.
# `tok` is presumably the tokenizer that was applied — TODO confirm against
# `tokenize_and_format`.
ds_tok, tok = tokenize_and_format(ds)

Map: 100%|██████████| 45000/45000 [00:03<00:00, 14750.75 examples/s]
Map: 100%|██████████| 50000/50000 [00:05<00:00, 9843.64 examples/s] 
Map: 100%|██████████| 5000/5000 [00:00<00:00, 7981.55 examples/s] 


In [7]:
# Verify the post-tokenisation schema: row counts should be unchanged and each
# split should now expose `labels`, `input_ids` and `attention_mask`.
ds_tok

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 45000
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
})