In [1]:
import torch
from datasets import load_dataset
import re

In [2]:
# Set the seed so that random number generation is reproducible across runs
seed = 42
torch.manual_seed(seed)
# Explicit CUDA seeding. This call is silently ignored when CUDA is not available,
# so it does not need to be changed on machines without an NVIDIA GPU (e.g. M1/M2/M3 Macs).
torch.cuda.manual_seed(seed) # for CUDA
torch.backends.cudnn.deterministic = True # for CUDNN: only use deterministic algorithms
# The flag lives on torch.backends.cudnn (not on torch.backends).
# If True, causes cuDNN to benchmark multiple convolution algorithms and select the fastest,
# which can make results differ from run to run.
torch.backends.cudnn.benchmark = False

## Data

### Question 1

In [3]:
# Question 1
# Load the news-category dataset by its identifier and show a summary of the
# training split (feature names and number of rows).
dataset_id = "heegyu/news-category-dataset"
ds = load_dataset(dataset_id)
train_split = ds['train']
print(train_split)

Dataset({
    features: ['link', 'headline', 'category', 'short_description', 'authors', 'date'],
    num_rows: 209527
})


### Question 2

In [4]:
# Question 2
# Walk over the training split and keep the headline (a plain string) of every
# article whose category is "POLITICS".
ds_train = []
for news in ds['train']:
    if news['category'] == 'POLITICS':
        ds_train.append(news['headline'])

# Sanity check: the expected number of political headlines
assert len(ds_train) == 35602

print("First headline (before processing):", ds_train[0])

First headline (before processing): Biden Says U.S. Forces Would Defend Taiwan If China Invaded


### Question 3

In [5]:
# Convert each headline to lowercase
ds_train = [headline.lower() for headline in ds_train]

# Check the result
print(ds_train[0])

# Split each headline into words on single spaces.
# split(" ") yields empty strings for leading, trailing or repeated spaces; drop them
# here so that the tokenized headlines never contain a token that the vocabulary
# (which excludes empty strings) does not know about.
# TODO: maybe use a better tokenizer (e.g. one that removes all punctuation)
ds_train = [
    [word for word in headline.split(" ") if word]
    for headline in ds_train
]

# Check the result
print(ds_train[0])

biden says u.s. forces would defend taiwan if china invaded
['biden', 'says', 'u.s.', 'forces', 'would', 'defend', 'taiwan', 'if', 'china', 'invaded']


In [6]:
# Add <EOS> at the end of every headline.
# New lists are built instead of appending in place, and headlines that already end
# with <EOS> are left as they are, so re-running this cell does not add the token twice.
# headline[-1:] (a slice) is used so that an empty headline does not raise IndexError.
ds_train = [
    headline if headline[-1:] == ['<EOS>'] else headline + ['<EOS>']
    for headline in ds_train
]

# Check the result
print(ds_train[0])

['biden', 'says', 'u.s.', 'forces', 'would', 'defend', 'taiwan', 'if', 'china', 'invaded', '<EOS>']


In [7]:
# Show the first five tokenized headlines (each one ends with <EOS>)
first_headlines = ds_train[0:5]
print(first_headlines)

[['biden', 'says', 'u.s.', 'forces', 'would', 'defend', 'taiwan', 'if', 'china', 'invaded', '<EOS>'], ['‘beautiful', 'and', 'sad', 'at', 'the', 'same', 'time’:', 'ukrainian', 'cultural', 'festival', 'takes', 'on', 'a', 'deeper', 'meaning', 'this', 'year', '<EOS>'], ['biden', 'says', "queen's", 'death', 'left', "'giant", "hole'", 'for', 'royal', 'family', '<EOS>'], ['bill', 'to', 'help', 'afghans', 'who', 'escaped', 'taliban', 'faces', 'long', 'odds', 'in', 'the', 'senate', '<EOS>'], ['mark', 'meadows', 'complies', 'with', 'justice', 'dept.', 'subpoena:', 'report', '<EOS>']]


### Question 4

In [23]:
# Flatten ds_train and extract the unique words in a single pass.
# Empty strings and any special tokens that are already present are left out here,
# because the special tokens get fixed positions in the vocabulary below.
special_tokens = ("<EOS>", "PAD")
unique_words = {
    word
    for headline in ds_train
    for word in headline
    if word and word not in special_tokens
}

# Vocabulary: <EOS> at the beginning, the regular words in sorted order, PAD at the end
word_vocab = ["<EOS>"] + sorted(unique_words) + ["PAD"]

# Dictionary representing a mapping from the words of word_vocab to integer values
word_to_int = {word: i for i, word in enumerate(word_vocab)}

print(f"<EOS> index: {word_to_int['<EOS>']}")
print(f"PAD index: {word_to_int['PAD']}")
print("Sample mapping:", list(word_to_int.items())[:10])  # Print first 10 mappings

# Dictionary representing the inverse of `word_to_int`, i.e. a mapping from integers (keys) to words (values).
int_to_word = {i: word for word, i in word_to_int.items()}
print(f"Word at first index: {int_to_word[0]}")
# The label says "last" because this line looks up the final index of the vocabulary
print(f"Word at last index: {int_to_word[len(word_vocab)-1]}")
print("Sample mapping:", list(int_to_word.items())[:10])  # Print first 10 mappings


<EOS> index: 0
PAD index: 33231
Sample mapping: [('<EOS>', 0), ('"100', 1), ('"a', 2), ('"ace', 3), ('"activism":', 4), ('"advanced"', 5), ('"aleppo"', 6), ('"all', 7), ('"all-important"', 8), ('"alone"', 9)]
Word at first index: <EOS>
Word at first index: PAD
Sample mapping: [(0, '<EOS>'), (1, '"100'), (2, '"a'), (3, '"ace'), (4, '"activism":'), (5, '"advanced"'), (6, '"aleppo"'), (7, '"all'), (8, '"all-important"'), (9, '"alone"')]


In [16]:
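# TODO: The 5 most common words

# TODO: The number of words you ended up with
# NOTE: this cell has no code yet; the output shown below is a stale dump of the
# int_to_word mapping from an earlier run, not an answer to the two prompts above.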

{0: '<EOS>',
 1: '"100',
 2: '"a',
 3: '"ace',
 4: '"activism":',
 5: '"advanced"',
 6: '"aleppo"',
 7: '"all',
 8: '"all-important"',
 9: '"alone"',
 10: '"amazing',
 11: '"america',
 12: '"black',
 13: '"but"',
 14: '"can\'t',
 15: '"central',
 16: '"conservatives',
 17: '"d"s',
 18: '"debts,"',
 19: '"defense"',
 20: '"degenerates"?',
 21: '"disrupters"',
 22: '"evil',
 23: '"extreme',
 24: '"f*ck',
 25: '"fabricating',
 26: '"faithkeepers"',
 27: '"flowers',
 28: '"frustrated"',
 29: '"full',
 30: '"good"',
 31: '"hardliners",',
 32: '"i',
 33: '"i"',
 34: '"i\'m',
 35: '"immersive',
 36: '"immigration',
 37: '"impasse"',
 38: '"insiders"',
 39: '"intentionally',
 40: '"island',
 41: '"jokes,"',
 42: '"killed"',
 43: '"lemons',
 44: '"let\'s',
 45: '"libertarian"',
 46: '"living"',
 47: '"macron',
 48: '"make',
 49: '"mclaughlin',
 50: '"moderate',
 51: '"moderates",',
 52: '"more',
 53: '"most',
 54: '"nasty"',
 55: '"national',
 56: '"new',
 57: '"no"',
 58: '"oprah',
 59: '"part

Sample mapping: [('<EOS>', 0), ('"100', 1), ('"a', 2), ('"ace', 3), ('"activism":', 4), ('"advanced"', 5), ('"aleppo"', 6), ('"all', 7), ('"all-important"', 8), ('"alone"', 9)]
