### Part 1. Loading and Preprocessing Data
The following cell loads the Book Summaries dataset, and tokenizes each data item

In [1]:
import torch
import random
import numpy as np
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
from src.preprocess import clean_text
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

nltk.download('punkt')

# Read in the Book Summaries dataset
df = pd.read_csv('data/booksummaries.txt', header=None, sep='\t')
df.columns = ['wikipedia_article_id', 'freebase_id', 'title', 'author', 'pub_date', 'genre', 'summary']
# Remove extraneous features - wikipedia_article_id, freebase_id, author, pub_date
df = df.drop(labels=['wikipedia_article_id', 'freebase_id', 'author', 'pub_date'], axis=1)
# Tokenize text fields
df["tokenized_title"] = df["title"].apply(lambda x: nltk.word_tokenize(clean_text(x.lower())))
df["tokenized_summary"] = df["summary"].apply(lambda x: nltk.word_tokenize(clean_text(x.lower())))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cindyzastudil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
print('Size of dataset before preprocessing:', len(df))

# Remove any books which don't have genres
df.dropna(subset=['genre'], inplace=True)

print('Size of dataset after removing missing genres:', len(df))

# Remove any books which don't have titles
df.dropna(subset=['title'], inplace=True)

print('Size of dataset after removing missing titles:', len(df))

# Remove any books which don't have summaries
df.dropna(subset=['summary'], inplace=True)

print('Size of dataset after removing missing summaries:', len(df))
print('Size of dataset after preprocessing:', len(df))

# Remove all stop words from all summaries & titles
stop = stopwords.words('english')
df['tokenized_summary'] = df['tokenized_summary'].apply(lambda x: [word for word in x if word not in (stop)])
df['tokenized_title'] = df['tokenized_title'].apply(lambda x: [word for word in x if word not in (stop)])

print(df.head())

Size of dataset before preprocessing: 16559
Size of dataset after removing missing genres: 12841
Size of dataset after removing missing titles: 12841
Size of dataset after removing missing summaries: 12841
Size of dataset after preprocessing: 12841
                            title  \
0                     Animal Farm   
1              A Clockwork Orange   
2                      The Plague   
4            A Fire Upon the Deep   
5  All Quiet on the Western Front   

                                               genre  \
0  {"/m/016lj8": "Roman \u00e0 clef", "/m/06nbt":...   
1  {"/m/06n90": "Science Fiction", "/m/0l67h": "N...   
2  {"/m/02m4t": "Existentialism", "/m/02xlf": "Fi...   
4  {"/m/03lrw": "Hard science fiction", "/m/06n90...   
5  {"/m/098tmk": "War novel", "/m/016lj8": "Roman...   

                                             summary          tokenized_title  \
0   Old Major, the old boar on the Manor Farm, ca...           [animal, farm]   
1   Alex, a teenager living i

In [4]:
# Format the genre field
formatted_genres = []
genre_dict = dict()
for g in df['genre']:
    subg = []
    genre_dict = eval(g)
    for k in genre_dict.keys():
        subg.append(genre_dict[k])
    formatted_genres.append(subg)
df['formatted_genre'] = formatted_genres
print(df.head())

                            title  \
0                     Animal Farm   
1              A Clockwork Orange   
2                      The Plague   
4            A Fire Upon the Deep   
5  All Quiet on the Western Front   

                                               genre  \
0  {"/m/016lj8": "Roman \u00e0 clef", "/m/06nbt":...   
1  {"/m/06n90": "Science Fiction", "/m/0l67h": "N...   
2  {"/m/02m4t": "Existentialism", "/m/02xlf": "Fi...   
4  {"/m/03lrw": "Hard science fiction", "/m/06n90...   
5  {"/m/098tmk": "War novel", "/m/016lj8": "Roman...   

                                             summary          tokenized_title  \
0   Old Major, the old boar on the Manor Farm, ca...           [animal, farm]   
1   Alex, a teenager living in near-future Englan...      [clockwork, orange]   
2   The text of The Plague is divided into five p...                 [plague]   
4   The novel posits that space around the Milky ...       [fire, upon, deep]   
5   The book tells the story of Pau

In [5]:
# Select one genre for each book
genres = []
for g in df['formatted_genre']:
    genres.append(g[0])
df['genre'] = genres
print(df.head())

                            title                 genre  \
0                     Animal Farm          Roman à clef   
1              A Clockwork Orange       Science Fiction   
2                      The Plague        Existentialism   
4            A Fire Upon the Deep  Hard science fiction   
5  All Quiet on the Western Front             War novel   

                                             summary          tokenized_title  \
0   Old Major, the old boar on the Manor Farm, ca...           [animal, farm]   
1   Alex, a teenager living in near-future Englan...      [clockwork, orange]   
2   The text of The Plague is divided into five p...                 [plague]   
4   The novel posits that space around the Milky ...       [fire, upon, deep]   
5   The book tells the story of Paul Bäumer, a Ge...  [quiet, western, front]   

                                   tokenized_summary  \
0  [old, major, ,, old, boar, manor, farm, ,, cal...   
1  [alex, ,, teenager, living, near-future, en

In [6]:
df["tokenized"] = df["tokenized_title"] + df["tokenized_summary"]
encoded_genre, unique_genres = pd.factorize(df["genre"])
df["encoded_genre"] = encoded_genre
print(df.head())

                            title                 genre  \
0                     Animal Farm          Roman à clef   
1              A Clockwork Orange       Science Fiction   
2                      The Plague        Existentialism   
4            A Fire Upon the Deep  Hard science fiction   
5  All Quiet on the Western Front             War novel   

                                             summary          tokenized_title  \
0   Old Major, the old boar on the Manor Farm, ca...           [animal, farm]   
1   Alex, a teenager living in near-future Englan...      [clockwork, orange]   
2   The text of The Plague is divided into five p...                 [plague]   
4   The novel posits that space around the Milky ...       [fire, upon, deep]   
5   The book tells the story of Paul Bäumer, a Ge...  [quiet, western, front]   

                                   tokenized_summary  \
0  [old, major, ,, old, boar, manor, farm, ,, cal...   
1  [alex, ,, teenager, living, near-future, en

In [7]:
df.iloc[42]

title                                                Time out of Joint
genre                                                  Science Fiction
summary               As the novel opens, its protagonist Ragle Gum...
tokenized_title                                          [time, joint]
tokenized_summary    [novel, opens, ,, protagonist, ragle, gumm, be...
formatted_genre      [Science Fiction, Speculative fiction, Fiction...
tokenized            [time, joint, novel, opens, ,, protagonist, ra...
encoded_genre                                                        1
Name: 62, dtype: object

Now that we've loaded this dataset, we need to split the data into train, validation, and test sets. We also need to create a vocab map for words in our Onion dataset, which will map tokens to numbers. This will be useful later, since torch models can only use tensors of sequences of numbers as inputs.

In [8]:
from src.dataset import split_train_val_test, generate_vocab_map
df = df.sample(frac=1)
train_df, val_df, test_df = split_train_val_test(df, props=[.8, .1, .1])
train_vocab, reverse_vocab = generate_vocab_map(train_df)

In [9]:
# this line of code will help test your implementation
print(len(val_df))
(len(train_df) / len(df)), (len(val_df) / len(df)), (len(test_df) / len(df))

1284


(0.7999376995561094, 0.09999221244451367, 0.100070087999377)

In [10]:
from src.dataset import SummaryDataset
from torch.utils.data import RandomSampler

train_dataset = SummaryDataset(train_vocab, train_df)
val_dataset = SummaryDataset(train_vocab, val_df)
test_dataset = SummaryDataset(train_vocab, test_df)

# Now that we're wrapping our dataframes in PyTorch datsets, we can make use of PyTorch Random Samplers.
train_sampler = RandomSampler(train_dataset)
val_sampler = RandomSampler(val_dataset)
test_sampler = RandomSampler(test_dataset)

In [11]:
from torch.utils.data import DataLoader
from src.dataset import collate_fn
BATCH_SIZE = 16

train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn)
val_iterator = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, collate_fn=collate_fn)
test_iterator = DataLoader(test_dataset, batch_size=BATCH_SIZE, sampler=test_sampler, collate_fn=collate_fn)

In [12]:
# Tests the collate_fn implementation
for x, y in test_iterator:
     print('BATCH:')
     print(x)
     print('SHAPE OF X:')
     print(x.shape)
     print('Y:')
     print(y)
     print('SHAPE OF Y:')
     print(y.shape)

BATCH:
tensor([[  347,   607,   347,  ...,   733,  6896,   185],
        [  542, 11865, 11892,  ...,     0,     0,     0],
        [37283, 19699,  4044,  ...,     0,     0,     0],
        ...,
        [ 2684,  1521, 26348,  ...,     0,     0,     0],
        [11461,     1,   867,  ...,     8,   511,  4183],
        [37538, 18831, 20190,  ...,   852, 33602,     8]])
SHAPE OF X:
torch.Size([16, 300])
Y:
tensor([ 23.,  10., 116.,  23.,   1.,  10.,   1.,   1.,  23.,  18.,  56.,   5.,
          5.,   7.,   7.,   5.])
SHAPE OF Y:
torch.Size([16])
BATCH:
tensor([[ 1984, 17109,  1984,  ...,     0,     0,     0],
        [ 5227,   371,   357,  ...,     0,     0,     0],
        [    1,     1,     8,  ...,     0,     0,     0],
        ...,
        [33647,     1,  1663,  ...,  2649,   136,   137],
        [ 2331,    11,  3297,  ...,     0,     0,     0],
        [    1,  2200,     1,  ...,     1,     8,  3548]])
SHAPE OF X:
torch.Size([16, 300])
Y:
tensor([57., 26.,  7., 14., 26.,  5.,  7., 15.

### Part 2: Modeling
Let's move to modeling, now that we have dataset iterators that batch our data for us. **Go to src/model.py, and follow the instructions in the file to create a basic neural network. Then, create your model using the class, and define hyperparameters.**

In [34]:
from src.models import MultiClassificationModel

model = None
model = MultiClassificationModel(vocab_size=len(train_vocab.keys()), embedding_dim=5, hidden_dim=1, num_layers=3, bidirectional=True)

In the following cell, **instantiate the model with some hyperparameters, and select an appropriate loss function and optimizer.**

Hint: we already use sigmoid in our model. What loss functions are availible for binary classification? Feel free to look at PyTorch docs for help!

In [35]:
from torch.optim import Adam

criterion, optimizer = None, None
criterion = torch.nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)

### Part 3: Training and Evaluation
The final part of this HW involves training the model, and evaluating it at each epoch. **Fill out the train and test loops below.**

In [36]:
# returns the total loss calculated from criterion
def train_loop(model, criterion, iterator):
    model.train()
    total_loss = 0
    for x, y in tqdm(iterator):
        # Shuffle data on every epoch
        indices = torch.randperm(x.size()[0])
        x=x[indices]
        y=y[indices]
        y_pred = model(x)
        #loss = criterion(y_pred, y)
        loss = criterion(y_pred, y.long())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss
    return total_loss

# returns:
# - true: a Python boolean array of all the ground truth values
#         taken from the dataset iterator
# - pred: a Python boolean array of all model predictions.
def val_loop(model, criterion, iterator):
    true, pred = [], []
    for x, y in tqdm(iterator):
        for t in y:
            true.append(t)
        y_pred = torch.flatten(model(x).data).round()
        for p in y_pred:
            pred.append(p)
    return true, pred

We also need evaluation metrics that tell us how well our model is doing on the validation set at each epoch. **Complete the functions in src/eval.py.**

In [37]:
from src.eval_utils import accuracy
#from torcheval.metrics.functional import multiclass_f1_score
true, pred = val_loop(model, criterion, val_iterator)
#print(multiclass_f1_score(pred, true, num_classes=227))
print(accuracy(true, pred))

100%|███████████████████████████████████████████| 81/81 [00:05<00:00, 15.07it/s]

0.00014752905979387103





### Part 4: Actually training the model
Watch your model train :D You should be able to achieve a validation F-1 score of at least .8 if everything went correctly. **Feel free to adjust the number of epochs to prevent overfitting or underfitting.**

In [38]:
TOTAL_EPOCHS = 15
for epoch in range(TOTAL_EPOCHS):
    train_loss = train_loop(model, criterion, train_iterator)
    true, pred = val_loop(model, criterion, val_iterator)
    print(f"EPOCH: {epoch}")
    print(f"TRAIN LOSS: {train_loss}")
    #print(f"VAL F-1: {binary_macro_f1(true, pred)}")
    print(f"VAL ACC: {accuracy(true, pred)}")

100%|█████████████████████████████████████████| 642/642 [02:10<00:00,  4.91it/s]
100%|███████████████████████████████████████████| 81/81 [00:06<00:00, 13.17it/s]


EPOCH: 0
TRAIN LOSS: 2920.230224609375
VAL ACC: 3.4309083672993264e-05


100%|█████████████████████████████████████████| 642/642 [02:17<00:00,  4.66it/s]
100%|███████████████████████████████████████████| 81/81 [00:07<00:00, 10.56it/s]


EPOCH: 1
TRAIN LOSS: 2166.01904296875
VAL ACC: 2.4016358571095283e-05


  3%|█▏                                        | 18/642 [00:03<02:16,  4.57it/s]


KeyboardInterrupt: 

We can also look at the models performance on the held-out test set, using the same val_loop we wrote earlier.

In [None]:
true, pred = val_loop(model, criterion, test_iterator)
print(f"TEST F-1: {binary_macro_f1(true, pred)}")
print(f"TEST ACC: {accuracy(true, pred)}")

### Part 5: Analysis
Answer the following questions:
#### 1. What happens to the vocab size as you change the cutoff in the cell below? Can you explain this in the context of [Zipf's Law](https://en.wikipedia.org/wiki/Zipf%27s_law)?

In [None]:
tmp_vocab, _ = generate_vocab_map(train_df, cutoff = 1)
print('Cutoff=1:', len(tmp_vocab))
tmp_vocab, _ = generate_vocab_map(train_df, cutoff = 2)
print('Cutoff=2:', len(tmp_vocab))

### Answer:

When the cuttoff is changed from 2 to 1, the vocab size increases by approximately 40%. In essence, Zipf's Law says that the frequency of the most common word occurs around 2 times as often as the next common word for all word frequencies. Additionally, this relationship holds because the most common words are a relatively small set of a language, whereas the vast majority of a language is less commonly used in comparison (Source: https://en.wikipedia.org/wiki/Zipf%27s_law#Word_frequencies_in_natural_languages). In this context, we can explain the increase in the size of the vocab because the majority of words are infrequently used compared to the most common words. If the cutoff is reduced by half (i.e., from 1 to 2), it would make sense that the size of the vocab doubles by nearly 50%.

#### 2. Can you describe what cases the model is getting wrong in the witheld test-set?

To do this, you'll need to create a new val_train_loop (``val_train_loop_incorrect``) so it returns incorrect sequences **and** you'll need to decode these sequences back into words.
Thankfully, you've already created a map that can convert encoded sequences back to regular English: you will find the ``reverse_vocab`` variable useful.

```
# i.e. using a reversed map of {"hi": 2, "hello": 3, "UNK": 0}
# we can turn [2, 3, 0] into this => ["hi", "hello", "UNK"]
```

In [None]:
def decode_sequence(encoded):
    decoded = []
    for i in encoded:
        decoded.append(reverse_vocab[i.item()])
    return decoded

# Implement this however you like! It should look very similar to val_loop.
# Pass the test_iterator through this function to look at errors in the test set.
def val_train_loop_incorrect(model, iterator):
    incorrect = []
    labels = []
    correct = []
    for x, y in tqdm(iterator):
        y_pred = torch.flatten(model(x).data).round()
        for i in range(len(y)):
            if y_pred[i] != y[i]:
                incorrect.append(decode_sequence(x[i]))
                labels.append((y_pred[i], y[i]))
            else:
                correct.append(decode_sequence(x[i]))
    return incorrect, correct, labels

In [None]:
incorrect, correct, labels = val_train_loop_incorrect(model, test_iterator)
inc_tokens = []
c_tokens = []
for x in incorrect:
    for i in x:
        inc_tokens.append(i)
for x in correct:
    for i in x:
        c_tokens.append(i)
incorrect_freq = nltk.probability.FreqDist(inc_tokens)
print('Most common words in incorrect results:', incorrect_freq.most_common(20))
correct_freq = nltk.probability.FreqDist(c_tokens)
print('Most common words in correct results:', correct_freq.most_common(20))
print('5 sample incorrect results:')
for i in range(5):
    print('Predicted Label:', labels[i][0])
    print('Actual Label:', labels[i][1])
    print(incorrect[i])


### Answer:

One reason the classifier may be predicting the class of some test examples incorrectly is a hihg prevalence of political tokens in headlines. This makes sense in this context as both real news and The Onion headlines would both be talking about political issues (factually and satirically). Additionally, I noticed a higher frequency of the 'UNK' token in what is being predicted correctly. I'm unsure of the actual reason that this may be; however, I hypothesize that more contextual words (i.e., words with a mapping) may make it mroe difficult for the classifier to predict them correctly. It also seems like the incorrect predictions have some amount of sarcasm and ambiguous use of language which is naturally more difficult to accurately classify.