## Imports

In [1]:
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import re
import torch

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cindyzastudil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/cindyzastudil/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cindyzastudil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


## Read in the CMU Book Summary Dataset

The CMU Book Summary Dataset is a collection of 16,559 different book summaries extracted from Wikipedia.

In [2]:
# Load the tab-separated dump (it has no header row) and name the seven
# columns directly while reading.
book_summs = pd.read_csv(
    'data/booksummaries.txt',
    sep='\t',
    header=None,
    names=['wikipedia_article_id', 'freebase_id', 'title', 'author', 'pub_date', 'genre', 'summary'],
)

# Only title, genre and summary are used below; discard the identifier,
# author and publication-date columns.
book_summs = book_summs.drop(columns=['wikipedia_article_id', 'freebase_id', 'author', 'pub_date'])
print(book_summs.head())

                                       title  \
0                                Animal Farm   
1                         A Clockwork Orange   
2                                 The Plague   
3  An Enquiry Concerning Human Understanding   
4                       A Fire Upon the Deep   

                                               genre  \
0  {"/m/016lj8": "Roman \u00e0 clef", "/m/06nbt":...   
1  {"/m/06n90": "Science Fiction", "/m/0l67h": "N...   
2  {"/m/02m4t": "Existentialism", "/m/02xlf": "Fi...   
3                                                NaN   
4  {"/m/03lrw": "Hard science fiction", "/m/06n90...   

                                             summary  
0   Old Major, the old boar on the Manor Farm, ca...  
1   Alex, a teenager living in near-future Englan...  
2   The text of The Plague is divided into five p...  
3   The argument of the Enquiry proceeds by a ser...  
4   The novel posits that space around the Milky ...  


## Data Preprocessing

In [3]:
# example code taken from fast-bert
def spec_add_spaces(t: str) -> str:
    """Surround every `/`, `#` and newline character in `t` with single spaces."""
    def _pad(match):
        # Re-emit the matched character with one space on each side.
        return " " + match.group(1) + " "

    return re.sub(r"([/#\n])", _pad, t)

def rm_useless_spaces(t: str) -> str:
    """Collapse every run of two or more spaces in `t` into a single space."""
    multi_space = re.compile(" {2,}")
    return multi_space.sub(" ", t)

def replace_multi_newline(t: str) -> str:
    """Replace two or more consecutive newlines in `t` with one newline.

    Whitespace that follows each newline in such a run is consumed with it.
    """
    newline_run = re.compile(r"(\n(\s)*){2,}")
    return newline_run.sub("\n", t)

def clean_text(input_text):
    """Normalise whitespace in `input_text`.

    Steps, in order: collapse repeated newlines, pad `/`, `#` and newline
    characters with spaces, squeeze runs of spaces, then strip both ends.
    """
    collapsed = replace_multi_newline(input_text)
    padded = spec_add_spaces(collapsed)
    return rm_useless_spaces(padded).strip()

In [4]:
print('Size of dataset before preprocessing:', len(book_summs))

# Remove any books which don't have genres
book_summs.dropna(subset=['genre'], inplace=True)

print('Size of dataset after removing missing genres:', len(book_summs))

# Remove any books which don't have titles
book_summs.dropna(subset=['title'], inplace=True)

print('Size of dataset after removing missing titles:', len(book_summs))

# Remove any books which don't have summaries
book_summs.dropna(subset=['summary'], inplace=True)

print('Size of dataset after removing missing summaries:', len(book_summs))
print('Size of dataset after preprocessing:', len(book_summs))

# English stop words. `stop` stays a list as before; `stop_set` is the same
# words as a set so each membership test is a hash lookup instead of a linear
# scan over the whole list for every token of every summary.
stop = stopwords.words('english')
stop_set = set(stop)


def _tokenize_without_stopwords(text):
    """Lower-case, clean and tokenize `text`, then drop English stop words.

    Args:
        text: raw title or summary string.

    Returns:
        List of lower-cased tokens from `nltk.word_tokenize`, in their
        original order, excluding any token found in `stop_set`.
    """
    tokens = nltk.word_tokenize(clean_text(text.lower()))
    return [word for word in tokens if word not in stop_set]


# Tokenize titles & summaries (lower-cased, stop words removed) into new columns
book_summs['tokenized_title'] = book_summs['title'].apply(_tokenize_without_stopwords)
book_summs['tokenized_summary'] = book_summs['summary'].apply(_tokenize_without_stopwords)

print(book_summs.head())

Size of dataset before preprocessing: 16559
Size of dataset after removing missing genres: 12841
Size of dataset after removing missing titles: 12841
Size of dataset after removing missing summaries: 12841
Size of dataset after preprocessing: 12841
                            title  \
0                     Animal Farm   
1              A Clockwork Orange   
2                      The Plague   
4            A Fire Upon the Deep   
5  All Quiet on the Western Front   

                                               genre  \
0  {"/m/016lj8": "Roman \u00e0 clef", "/m/06nbt":...   
1  {"/m/06n90": "Science Fiction", "/m/0l67h": "N...   
2  {"/m/02m4t": "Existentialism", "/m/02xlf": "Fi...   
4  {"/m/03lrw": "Hard science fiction", "/m/06n90...   
5  {"/m/098tmk": "War novel", "/m/016lj8": "Roman...   

                                             summary          tokenized_title  \
0   Old Major, the old boar on the Manor Farm, ca...           [animal, farm]   
1   Alex, a teenager living i

In [5]:
# Format the genre field.
# Each entry is a JSON object mapping a Freebase id to a genre name, e.g.
# {"/m/06n90": "Science Fiction", ...}. Parse it with json.loads rather than
# eval: eval would execute arbitrary Python found in the data file, while
# json.loads only accepts JSON and decodes escapes such as \u00e0 per the spec.
import json

formatted_genres = [list(json.loads(g).values()) for g in book_summs['genre']]
book_summs['formatted_genre'] = formatted_genres
print(book_summs.head())

                            title  \
0                     Animal Farm   
1              A Clockwork Orange   
2                      The Plague   
4            A Fire Upon the Deep   
5  All Quiet on the Western Front   

                                               genre  \
0  {"/m/016lj8": "Roman \u00e0 clef", "/m/06nbt":...   
1  {"/m/06n90": "Science Fiction", "/m/0l67h": "N...   
2  {"/m/02m4t": "Existentialism", "/m/02xlf": "Fi...   
4  {"/m/03lrw": "Hard science fiction", "/m/06n90...   
5  {"/m/098tmk": "War novel", "/m/016lj8": "Roman...   

                                             summary          tokenized_title  \
0   Old Major, the old boar on the Manor Farm, ca...           [animal, farm]   
1   Alex, a teenager living in near-future Englan...      [clockwork, orange]   
2   The text of The Plague is divided into five p...                 [plague]   
4   The novel posits that space around the Milky ...       [fire, upon, deep]   
5   The book tells the story of Pau

## Extract data to use for training & testing model

In [6]:
# Function adapted from midterm source code (PP1)
def generate_vocab_map(df, cutoff=1):
    """Build token -> id and id -> token maps from the tokenized columns of `df`.

    Args:
        df: DataFrame with `tokenized_title` and `tokenized_summary` columns,
            each cell holding a list of tokens.
        cutoff: a token is kept only if its total count across both columns
            is strictly greater than this value.

    Returns:
        (vocab, reversed_vocab): `vocab` maps "" to 0, "UNK" to 1 and every
        kept token to a unique id from 2 upwards, in first-seen order (titles
        are scanned before summaries). `reversed_vocab` is the inverse map.
    """
    # Count every token: all titles first, then all summaries.
    frequencies = {}
    for column in ('tokenized_title', 'tokenized_summary'):
        for tokens in df[column].tolist():
            for token in tokens:
                frequencies[token] = frequencies.get(token, 0) + 1

    # Ids 0 and 1 are reserved; frequent-enough tokens are numbered from 2.
    vocab = {"": 0, "UNK": 1}
    frequent = (token for token, count in frequencies.items() if count > cutoff)
    for uid, token in enumerate(frequent, start=2):
        vocab[token] = uid

    reversed_vocab = {uid: token for token, uid in vocab.items()}
    return vocab, reversed_vocab

def generate_genre_map(genres):
    """Map each genre to its position in `genres`.

    Args:
        genres: iterable of genre names.

    Returns:
        Dict from genre name to a 0-based integer id, numbered in iteration
        order. If a name repeats, its last position is the one kept.
    """
    return {genre: index for index, genre in enumerate(genres)}

In [7]:
# Gather every distinct genre name that appears in any book's genre list.
genres = {
    name
    for book_genres in book_summs['formatted_genre']
    for name in book_genres
}
print('# of unique genres:', len(genres))
print(genres)

# of unique genres: 227
{'Alternate history', 'Humour', 'Autobiographical novel', 'Anthology', 'Marketing', 'Speculative fiction', 'Psychology', 'Fantasy of manners', 'Social sciences', 'Picture book', 'Police procedural', 'Paranormal romance', 'Romantic comedy', 'Reference', 'Elizabethan romance', 'Supernatural', 'Regency romance', 'Existentialism', 'Comedy', 'Politics', 'Young adult literature', 'Wuxia', 'Youth', 'Invasion literature', 'Autobiographical comics', 'Lost World', 'Bildungsroman', 'Modernism', 'Future history', 'Political philosophy', 'Mathematics', 'Heroic fantasy', 'Pastiche', 'Ghost story', 'Absurdist fiction', 'Albino bias', 'New Weird', 'Fictional crossover', 'Biopunk', 'Serial', 'Post-holocaust', 'Edisonade', 'Religion', 'Scientific romance', 'Utopian fiction', 'American Gothic Fiction', 'Biography', 'Cookbook', 'Erotica', 'Ergodic literature', 'Superhero fiction', 'High fantasy', 'Space opera', 'Human extinction', 'Drama', 'Gay Themed', 'Comic science fiction', 'Ba

In [8]:
# Token <-> id maps built from the tokenized title and summary columns.
train_vocab, reverse_vocab = generate_vocab_map(book_summs)

# Number the genres in sorted order. `genres` is a set of strings, and its
# iteration order can change between interpreter runs (string hashing is
# randomised), so numbering it directly would give each genre a different id
# after every kernel restart.
genre_map = generate_genre_map(sorted(genres))

In [9]:
# Shuffle the rows with a fixed seed so the row order, and therefore the
# train/val/test split made from it, is the same on every Restart & Run All.
# Note: re-running only this cell shuffles the already-shuffled frame again.
book_summs = book_summs.sample(frac=1, random_state=42)

# We only care about tokenized and formatted data from the original dataset
X = book_summs[['tokenized_title', 'tokenized_summary', 'formatted_genre']].copy()
print(X.head())

# Replace every genre name with its integer id from genre_map.
mapped_genres = [
    [genre_map[g] for g in book_genres]
    for book_genres in X['formatted_genre']
]
X['formatted_genre'] = mapped_genres
print(X.head())

            tokenized_title  \
4503    [body, jonah, boyd]   
11515           [magicians]   
42        [heart, darkness]   
7838   [machine, 's, child]   
4392         [recognitions]   

                                       tokenized_summary  \
4503   [anne, finds, manuscript, schemes, boyd, order...   
11515  [main, character, ,, sir, charles, ravenstreet...   
42     ['heart, darkness, ', opens, first, person, na...   
7838   [end, life, world, come, ,, alec, checkerfield...   
4392   [story, loosely, follows, life, wyatt, gwyon, ...   

                              formatted_genre  
4503                                [Fiction]  
11515                   [Speculative fiction]  
42           [Fiction, Novella, Roman à clef]  
7838   [Science Fiction, Speculative fiction]  
4392                                [Fiction]  
            tokenized_title  \
4503    [body, jonah, boyd]   
11515           [magicians]   
42        [heart, darkness]   
7838   [machine, 's, child]   
4392     

In [10]:
# Split the data into training, validation and testing data.
# split_train_val_test is defined in the project-local SummaryDataset module;
# its result is unpacked here into three objects.
# NOTE(review): split proportions and any shuffling are decided inside that
# helper and are not visible in this notebook - confirm there.
from SummaryDataset import split_train_val_test
X_train, X_val, X_test = split_train_val_test(X)

## TODO: Review midterm PP1 for pytorch implementation ##

In [11]:
from SummaryDataset import SummaryDataset
from torch.utils.data import RandomSampler

# Wrap each split in a SummaryDataset; all three share the same vocabulary map.
train_dataset, val_dataset, test_dataset = (
    SummaryDataset(train_vocab, split) for split in (X_train, X_val, X_test)
)

# One random-order sampler per dataset.
train_sampler, val_sampler, test_sampler = (
    RandomSampler(dataset)
    for dataset in (train_dataset, val_dataset, test_dataset)
)

In [12]:
from torch.utils.data import DataLoader
from SummaryDataset import collate_fn
BATCH_SIZE = 16

# Build the three loaders the same way: same batch size and collate function,
# each paired with its own dataset and sampler.
train_iterator, val_iterator, test_iterator = (
    DataLoader(dataset, batch_size=BATCH_SIZE, sampler=sampler, collate_fn=collate_fn)
    for dataset, sampler in (
        (train_dataset, train_sampler),
        (val_dataset, val_sampler),
        (test_dataset, test_sampler),
    )
)

In [13]:
from GenreClassifier import ClassificationModel

# One output unit per distinct genre; vocabulary size comes from the token map.
# NOTE(review): hidden_dim=1 looks unusually small - confirm it is intended.
model = ClassificationModel(
    vocab_size=len(train_vocab),
    embedding_dim=300,
    hidden_dim=1,
    output_dim=len(genres),
    num_layers=1,
    bidirectional=True,
)

In [14]:
from torch.optim import AdamW

# Loss function and optimizer; train_loop reads `optimizer` as a global.
# NOTE(review): each book can carry several genres - confirm CrossEntropyLoss
# matches the target format produced by collate_fn.
criterion = torch.nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=0.001)

In [15]:
from tqdm import tqdm
# returns the total loss calculated from criterion
def train_loop(model, criterion, iterator):
    """Run one training epoch over `iterator` and return the summed loss.

    Uses the module-level `optimizer` (created in the optimizer cell) to
    update `model`'s parameters after every batch.

    Args:
        model: network called as `model(x)` on each batch input.
        criterion: loss callable invoked as `criterion(prediction, target)`.
        iterator: iterable yielding `(x, y)` batches.

    Returns:
        Sum of the per-batch loss values as a Python float.
    """
    model.train()
    total_loss = 0.0
    for x, y in tqdm(iterator):
        optimizer.zero_grad()
        # Feed the raw model output to the loss. Rounding first has a zero
        # gradient everywhere, so backward() would give the optimizer nothing
        # to learn from.
        y_pred = model(x)
        # as_tensor accepts both lists and tensors and avoids re-copying
        # (and the copy-construct warning) when `y` is already a tensor.
        target = torch.as_tensor(y, dtype=torch.float)
        loss = criterion(torch.flatten(y_pred), target)
        loss.backward()
        optimizer.step()
        # .item() extracts a plain number so the running total does not keep
        # every batch's autograd graph alive.
        total_loss += loss.item()
    return total_loss

# returns:
# - true: a Python list of all the ground truth values
#         taken from the dataset iterator
# - pred: a Python list of all (rounded) model predictions.
def val_loop(model, criterion, iterator):
    """Collect ground-truth labels and rounded predictions over `iterator`.

    Puts `model` in evaluation mode and runs with gradient tracking disabled,
    so inference-only layers behave correctly and no autograd graph is built.
    The model is left in evaluation mode afterwards; `train_loop` switches it
    back to training mode when it starts.

    Args:
        model: network called as `model(x)` on each batch input.
        criterion: unused; kept so the signature matches `train_loop`.
        iterator: iterable yielding `(x, y)` batches.

    Returns:
        (true, pred): two lists holding, per example, the items of each `y`
        and the items of each rounded `model(x)` output, in iteration order.
    """
    model.eval()
    true, pred = [], []
    with torch.no_grad():
        for x, y in tqdm(iterator):
            true.extend(y)
            y_pred = model(x).round()
            pred.extend(y_pred)
    return true, pred

In [16]:
# Smoke-test the untrained model on the validation loader.
# The metric helpers below are disabled until they are replaced (see TODOs).
#from src.eval_utils import binary_macro_f1, accuracy
# val_loop returns the ground-truth items and the rounded model outputs.
# NOTE(review): the recorded output of this cell is an IndexError
# ("Dimension out of range ... got 227") raised during this call.
true, pred = val_loop(model, criterion, val_iterator)
#print(binary_macro_f1(true, pred)) # TODO: CHANGE METRIC CALCULATIONS
#print(accuracy(true, pred)) # TODO: CHANGE METRIC CALCULATIONS
# Prints the full list of predictions.
print(pred)

  x_t = torch.tensor(x, dtype=torch.long)
  0%|                                                    | 0/81 [00:00<?, ?it/s]


IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 227)