In [None]:
!pip install transformers datasets

### Import datasets - Hugging Face library

In [None]:
# NOTE(review): `datasets` is already installed by the first cell of this notebook,
# so this cell looks redundant — confirm before removing.
!pip install datasets

In [None]:
from datasets import list_datasets

In [None]:
# Collect everything list_datasets() returns; the next cell reports it as the
# datasets currently available on the Hub.
all_datasets = list_datasets()

In [None]:
# Report how many entries were returned and preview the first ten of them.
print(f"There are {len(all_datasets)} datasets currently available on the Hub")
print(f"The first 10 are: {all_datasets[:10]}")

### Let's load the emotion dataset

In [None]:
from datasets import load_dataset

In [None]:
# Load the dataset identified by 'emotion'; the result is indexed by split name
# (e.g. "train") in the cells that follow.
emotions = load_dataset('emotion')

In [None]:
# Inspect the object returned by load_dataset.
emotions

In [None]:
# Keep a handle on the training split for the inspection cells below.
train_ds = emotions["train"]

In [None]:
# Inspect the training split.
train_ds

In [None]:
# Number of examples in the training split.
len(train_ds)

In [None]:
# Look at the first training example.
train_ds[0]

In [None]:
# Names of the columns in the training split.
train_ds.column_names

In [None]:
# Column types. The 'label' feature is the one label2str later uses (via int2str)
# to turn integer labels into names.
train_ds.features

### From datasets to DataFrames

In [None]:
import pandas as pd

In [None]:
# Switch the dataset's output format to pandas so that the slice taken in the next
# cell can be used as a DataFrame.
emotions.set_format(type="pandas")

In [None]:
# Take every row of the training split ([:]) as the working DataFrame.
df = emotions["train"][:]

In [None]:
def label2str(row):
    """Return the string name of an integer class label.

    Despite its name, `row` is a single integer label value: the function is
    applied element-wise to the "label" column. The lookup goes through the
    label feature of the training split of the global `emotions` dataset.
    """
    label_feature = emotions["train"].features["label"]
    return label_feature.int2str(row)

In [None]:
# Add a human-readable class name next to each integer label.
df["label name"] = df["label"].apply(label2str)

In [None]:
# Preview the first rows, now including the added "label name" column.
df.head()

### Looking at the Class Distribution

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Horizontal bar chart of how many examples each emotion label has, smallest first.
# `.plot.barh()` is the pandas plotting accessor. The previous `.plot().barh()`
# drew a line plot and then called matplotlib's Axes.barh without its required
# arguments, which raises a TypeError.
df['label name'].value_counts(ascending=True).plot.barh()
plt.title("Frequency of classes")
plt.show()

### How long are the Tweets?

In [None]:
# Word count per tweet: split each text on whitespace, then take the length of the
# resulting token list. The `.str.len()` accessor replaces `apply(len)` and yields
# NaN for a missing text instead of raising on it.
df["Words Per Tweet"] = df["text"].str.split().str.len()

In [None]:
# Inspect the per-tweet word counts computed in the previous cell.
df["Words Per Tweet"]

In [None]:
# Box plot of words per tweet, one box per emotion label. Outlier points are
# hidden (showfliers=False) and the grid is turned off.
df.boxplot("Words Per Tweet", by="label name", grid=False, showfliers=False, color="black")
plt.show()

### Character Tokenization

In [1]:
# Example sentence used throughout the character-tokenization section.
text = "This is a sample text for Transformer neural network"

In [2]:
# Character-level tokenization: every character, spaces included, is one token.
tokenized_text = [char for char in text]
print(tokenized_text)

['T', 'h', 'i', 's', ' ', 'i', 's', ' ', 'a', ' ', 's', 'a', 'm', 'p', 'l', 'e', ' ', 't', 'e', 'x', 't', ' ', 'f', 'o', 'r', ' ', 'T', 'r', 'a', 'n', 's', 'f', 'o', 'r', 'm', 'e', 'r', ' ', 'n', 'e', 'u', 'r', 'a', 'l', ' ', 'n', 'e', 't', 'w', 'o', 'r', 'k']


#### Make a token2idx dictionary

In [3]:
#{key: value for (key, value) in iterable} <-> Dict comprehension
# NOTE(review): enumerate() yields (position, item), so here `char` is bound to the
# position and `idx` to the character. The result maps position -> character and
# keeps repeated characters (see the output of the next cell). A later cell
# rebuilds token2idx as character -> index over the distinct characters.
token2idx = {char: idx for (char, idx) in enumerate(sorted(tokenized_text))}

In [4]:
# Inspect the mapping built in the previous cell. The recorded output shows
# position -> character pairs, with repeated characters.
token2idx

{0: ' ',
 1: ' ',
 2: ' ',
 3: ' ',
 4: ' ',
 5: ' ',
 6: ' ',
 7: ' ',
 8: 'T',
 9: 'T',
 10: 'a',
 11: 'a',
 12: 'a',
 13: 'a',
 14: 'e',
 15: 'e',
 16: 'e',
 17: 'e',
 18: 'e',
 19: 'f',
 20: 'f',
 21: 'h',
 22: 'i',
 23: 'i',
 24: 'k',
 25: 'l',
 26: 'l',
 27: 'm',
 28: 'm',
 29: 'n',
 30: 'n',
 31: 'n',
 32: 'o',
 33: 'o',
 34: 'o',
 35: 'p',
 36: 'r',
 37: 'r',
 38: 'r',
 39: 'r',
 40: 'r',
 41: 'r',
 42: 's',
 43: 's',
 44: 's',
 45: 's',
 46: 't',
 47: 't',
 48: 't',
 49: 'u',
 50: 'w',
 51: 'x'}

In [5]:
# Same inspection as the previous cell; token2idx has not been reassigned in between.
token2idx

{0: ' ',
 1: ' ',
 2: ' ',
 3: ' ',
 4: ' ',
 5: ' ',
 6: ' ',
 7: ' ',
 8: 'T',
 9: 'T',
 10: 'a',
 11: 'a',
 12: 'a',
 13: 'a',
 14: 'e',
 15: 'e',
 16: 'e',
 17: 'e',
 18: 'e',
 19: 'f',
 20: 'f',
 21: 'h',
 22: 'i',
 23: 'i',
 24: 'k',
 25: 'l',
 26: 'l',
 27: 'm',
 28: 'm',
 29: 'n',
 30: 'n',
 31: 'n',
 32: 'o',
 33: 'o',
 34: 'o',
 35: 'p',
 36: 'r',
 37: 'r',
 38: 'r',
 39: 'r',
 40: 'r',
 41: 'r',
 42: 's',
 43: 's',
 44: 's',
 45: 's',
 46: 't',
 47: 't',
 48: 't',
 49: 'u',
 50: 'w',
 51: 'x'}

In [6]:
# Vocabulary lookup: each distinct character is paired with its position in the
# sorted list of distinct characters.
token2idx = dict(
    (char, position) for position, char in enumerate(sorted(set(tokenized_text)))
)

In [7]:
# Inspect the rebuilt mapping: distinct character -> index in sorted order.
token2idx

{' ': 0,
 'T': 1,
 'a': 2,
 'e': 3,
 'f': 4,
 'h': 5,
 'i': 6,
 'k': 7,
 'l': 8,
 'm': 9,
 'n': 10,
 'o': 11,
 'p': 12,
 'r': 13,
 's': 14,
 't': 15,
 'u': 16,
 'w': 17,
 'x': 18}

#### Convert tokenized_text into a list of indices

In [8]:
# Replace every token with its integer id from the vocabulary lookup; a token that
# is missing from token2idx raises KeyError.
input_ids = list(map(token2idx.__getitem__, tokenized_text))

In [9]:
# Show the id sequence; it has one entry per token of tokenized_text.
print(input_ids)

[1, 5, 6, 14, 0, 6, 14, 0, 2, 0, 14, 2, 9, 12, 8, 3, 0, 15, 3, 18, 15, 0, 4, 11, 13, 0, 1, 13, 2, 10, 14, 4, 11, 13, 9, 3, 13, 0, 10, 3, 16, 13, 2, 8, 0, 10, 3, 15, 17, 11, 13, 7]


#### ONE HOT Encodings

In [10]:
import torch
import torch.nn.functional as F

In [11]:
# Wrap the id list in a tensor, which is the input F.one_hot is given in the next cell.
input_ids_tensor = torch.tensor(input_ids)

In [12]:
# One-hot encode every id. num_classes is the vocabulary size, so each row has one
# column per distinct character.
one_hot_encoding = F.one_hot(input_ids_tensor, num_classes=len(token2idx))

In [13]:
# One row per character of the text, one column per vocabulary entry.
one_hot_encoding.shape

torch.Size([52, 19])

In [19]:
# Show the first token next to its integer id and its one-hot row.
print(f"Token: {tokenized_text[0]}")
print(f"Tensor index: {input_ids_tensor[0]}")
print(f"One-hot: {one_hot_encoding[0]}")

Token: T
Tensor index: 1
One-hot: tensor([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


### Word based tokenization

In [1]:
# Example sentence for the word-level section; this rebinds the `text` name used
# in the character-level section.
text = "This is a sample Transformer neural network input text"

In [2]:
# Word-level tokenization: split the sentence on whitespace.
tokenized_text = text.split()

In [3]:
# Show the resulting word tokens.
print(tokenized_text)

['This', 'is', 'a', 'sample', 'Transformer', 'neural', 'network', 'input', 'text']


Goals list:

1. token2idx dictionary
2. list of token indices
3. one-hot encoding of the list above

In [11]:
# Assign each distinct word an integer id following sorted order.
# enumerate() yields (index, item), so the loop variables are named in that order.
# The original had the two names swapped, which produced the same dict but read as
# if it mapped index -> token.
token2idx = {token: idx for idx, token in enumerate(sorted(set(tokenized_text)))}

In [12]:
# Show the word -> id mapping.
print(token2idx)

{'This': 0, 'Transformer': 1, 'a': 2, 'input': 3, 'is': 4, 'network': 5, 'neural': 6, 'sample': 7, 'text': 8}


In [13]:
# Translate the sentence into its sequence of word ids, in original word order.
input_indexies = [token2idx[token] for token in tokenized_text]

In [14]:
# Show the id sequence; it has one entry per word of the sentence.
print(input_indexies)

[0, 4, 2, 7, 1, 6, 5, 3, 8]


In [15]:
import torch
import torch.nn.functional as F

In [17]:
# One-hot encode the word ids: one row per word of the sentence, one column per
# vocabulary entry.
tensor_input = torch.tensor(input_indexies)
# num_classes must be the vocabulary size, not the sentence length. The two only
# coincide for this sentence because no word repeats; with a repeated word,
# len(tokenized_text) would add unused columns.
one_hot_word_based = F.one_hot(tensor_input, num_classes=len(token2idx))
print(one_hot_word_based)
print(one_hot_word_based.shape)

tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1]])
torch.Size([9, 9])
