In [1]:
!pip install transformers


Collecting transformers
  Downloading transformers-4.46.1-py3-none-any.whl (10.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl (447 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m447.5/447.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (782 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m782.7/782.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (435 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m435.0/435.0 kB[0m [31m6.5 MB/s[

In [2]:
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertModel
from tqdm import tqdm
import argparse
import os
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Location of the tab-separated news corpus on S3.
s3_path = 's3://hugging-face-text-multiclass-text-classification-bucket/training_data/newsCorpora.csv'

# We want to predict the CATEGORY of a headline from its TITLE, so only those
# two columns are needed. `names` labels all eight fields of each row, and
# `usecols` makes the parser skip the other six instead of loading every
# column and dropping most of them afterwards.
df = pd.read_csv(
    s3_path,
    sep='\t',
    names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'],
    usecols=['TITLE', 'CATEGORY'],
)


# Lookup table: single-letter symbol found in the CATEGORY column -> readable label.
my_dict = dict(
    e='Entertainment',
    b='Business',
    t='Science',
    m='Health',
)


def update_cat(x):
    """Return the readable category label for the single-letter symbol `x`.

    Raises KeyError when `x` is not one of the keys of `my_dict`.
    """
    label = my_dict[x]
    return label

# Replace every single-letter symbol in CATEGORY with its readable label.
df['CATEGORY'] = df['CATEGORY'].apply(update_cat)

print(df)

                                                    TITLE  CATEGORY
0       Fed official says weak data caused by weather,...  Business
1       Fed's Charles Plosser sees high bar for change...  Business
2       US open: Stocks fall after Fed official hints ...  Business
3       Fed risks falling 'behind the curve', Charles ...  Business
4       Fed's Plosser: Nasty Weather Has Curbed Job Gr...  Business
...                                                   ...       ...
422414  Surgeons to remove 4-year-old's rib to rebuild...    Health
422415  Boy to have surgery on esophagus after battery...    Health
422416  Child who swallowed battery to have reconstruc...    Health
422417  Phoenix boy undergoes surgery to repair throat...    Health
422418  Phoenix boy undergoes surgery to repair throat...    Health

[422419 rows x 2 columns]


In [None]:
# # Optional smoke test: train on a 5% sample of the dataset just to make sure the model can train

# df = df.sample(frac=0.05, random_state=1)

# df = df.reset_index(drop=True)

# df

In [7]:
# Category label -> integer code; filled in lazily by encode_cat().
encode_dict = dict()


def encode_cat(x):
    """Return the integer code for `x`, registering a new code on first sight.

    A value not yet present in `encode_dict` is stored with the dictionary's
    current size as its code, so codes are handed out as 0, 1, 2, ... in the
    order values are first encountered. A value already present keeps the
    code it was given earlier.
    """
    # The default argument is evaluated before the call, so a new key receives
    # the size of the dictionary prior to insertion.
    return encode_dict.setdefault(x, len(encode_dict))

In [8]:
# Add an ENCODE_CAT column holding the integer code of each row's CATEGORY.
df['ENCODE_CAT'] = df['CATEGORY'].apply(encode_cat)


In [9]:
# Take a look at the data after this transformation: the frame should now
# carry an ENCODE_CAT column next to TITLE and CATEGORY.

df

Unnamed: 0,TITLE,CATEGORY,ENCODE_CAT
0,"Fed official says weak data caused by weather,...",Business,0
1,Fed's Charles Plosser sees high bar for change...,Business,0
2,US open: Stocks fall after Fed official hints ...,Business,0
3,"Fed risks falling 'behind the curve', Charles ...",Business,0
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,Business,0
...,...,...,...
422414,Surgeons to remove 4-year-old's rib to rebuild...,Health,3
422415,Boy to have surgery on esophagus after battery...,Health,3
422416,Child who swallowed battery to have reconstruc...,Health,3
422417,Phoenix boy undergoes surgery to repair throat...,Health,3


In [10]:
# Show the label -> code pairs accumulated in encode_dict.
encode_dict.items()

dict_items([('Business', 0), ('Science', 1), ('Entertainment', 2), ('Health', 3)])

**Here’s a simple breakdown of our encoding:**

1. **Check whether `x` is in `encode_dict`:** The function `encode_cat(x)` is called once for each item, `x`, that you want to encode.

2. **Assign a new code if needed:** If `x` is not already in `encode_dict`, it is given a new number by assigning it the next available value, which is the current length of `encode_dict`. This way, each unique item gets a unique number as you go.

3. **Return the number:** The function then returns the number (or "code") assigned to `x`.

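This is a quick way to make sure each category gets a unique numeric code on the spot without any duplicates. Note that codes are handed out in order of first appearance in the data (here `Business` → 0, `Science` → 1, `Entertainment` → 2, `Health` → 3), so the mapping depends on the order of the rows.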

## Tokenization and Encoding of Sentences in General

In [11]:
pip show transformers

Name: transformers
Version: 4.46.1
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /opt/conda/lib/python3.10/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [14]:
from transformers import DistilBertTokenizer

# Load the pretrained tokenizer for the uncased DistilBERT checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Encode a sentence pair by calling the tokenizer directly. `encode_plus` is
# documented as deprecated in favour of `__call__`, which accepts the same
# options and returns the same fields.
inputs = tokenizer(
    "I love baseball and the yankees in particular",
    text_pair="I love playing my Telecaster",
    add_special_tokens=True,      # wrap/separate the two sentences with special tokens
    max_length=20,                # fixed output length for this demo
    padding='max_length',         # pad shorter inputs up to max_length
    truncation=True,              # cut longer inputs down to max_length
    return_token_type_ids=True,   # 0 for the first sentence, 1 for the second
    return_attention_mask=True    # 1 for real tokens, 0 for padding
)

print('Input IDs:', inputs['input_ids'])
print('Attention Mask:', inputs['attention_mask'])
print('Token Type IDs:', inputs['token_type_ids'])


Input IDs: [101, 1045, 2293, 3598, 1998, 1996, 11081, 1999, 3327, 102, 1045, 2293, 2652, 2026, 28803, 2121, 102, 0, 0, 0]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
Token Type IDs: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
