In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch

In [14]:
import random

# One shared seed for every random number generator used in this notebook,
# so that repeated runs draw the same numbers.
seed_val = 42

# Python's `random`, NumPy, torch (CPU) and torch (all CUDA devices), in that order.
for seed_fn in (random.seed,
                np.random.seed,
                torch.manual_seed,
                torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [15]:
# Load the raw dataset from "data.csv" (path is relative to the working directory).
data = pd.read_csv("data.csv")

# Print (rows, columns), then display the first five rows as the cell output.
print(data.shape)
data.head()

(4657, 4)


Unnamed: 0,index,title,genre,summary
0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...
1,1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ..."
2,2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...
3,3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...
4,4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...


In [16]:
# Remove the "index" column (in the preview above it simply repeats the row labels).
# Rebinding instead of `inplace=True`, together with errors="ignore", keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
data = data.drop(columns=["index"], errors="ignore")

In [17]:
# Distinct genre names, computed once; each name's position in this array
# is used as its integer class id.
genre_names = data["genre"].unique()

num_labels = len(genre_names)
print("num labels: ", num_labels)

# Forward (name -> id) and reverse (id -> name) lookup tables.
genre2id = {name: idx for idx, name in enumerate(genre_names)}
id2genre = dict(enumerate(genre_names))

num labels:  10


In [18]:
import re

def clean_text(text):
    """Normalise a raw summary into lowercase, lightly punctuated plain text.

    Steps, in order:
      1. cast to ``str`` and lowercase;
      2. remove bracketed spans such as ``[1]``;
      3. remove links (``http://``/``https://`` URLs and ``www.`` hosts);
      4. replace every character other than ASCII letters, digits and
         ``. , ? !`` with a space (this also turns newlines into spaces);
      5. remove any token that contains a digit;
      6. collapse runs of whitespace into a single space.

    Steps 2 and 3 have to run before step 4: step 4 erases ``[``, ``]``,
    ``:`` and ``/``, after which the bracket and URL patterns can no longer
    match anything.

    Args:
        text: The value to clean; non-string values are converted with ``str``.

    Returns:
        The cleaned string.
    """
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)                 # text in square brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)   # links
    text = re.sub(r'[^a-zA-Z0-9.,?!]', ' ', text)       # disallowed characters, incl. newlines
    text = re.sub(r'\w*\d\w*', '', text)                # words containing digits
    text = re.sub(r'\s+', ' ', text)                    # extra whitespace

    return text

In [19]:
# Run every summary through clean_text, overwriting the raw text,
# then preview the first rows.
data["summary"] = data["summary"].map(clean_text)
data.head()

Unnamed: 0,title,genre,summary
0,Drowned Wednesday,fantasy,drowned wednesday is the first trustee among ...
1,The Lost Hero,fantasy,"as the book opens, jason awakens on a school ..."
2,The Eyes of the Overworld,fantasy,cugel is easily persuaded by the merchant fia...
3,Magic's Promise,fantasy,the book opens with herald mage vanyel return...
4,Taran Wanderer,fantasy,taran and gurgi have returned to caer dallben...


In [20]:
# Integer class id for every row, looked up from the genre name.
data["genre_id"] = [genre2id[name] for name in data["genre"]]

In [21]:
# Class balance: number of summaries per genre, most frequent first.
data["genre"].value_counts()

thriller      1023
fantasy        876
science        647
history        600
horror         600
crime          500
romance        111
psychology     100
sports         100
travel         100
Name: genre, dtype: int64

In [22]:
By now, the data has been cleaned. The next step is to tokenize, train, and test. First, we use TextRank from Gensim to rank the first few sentences; then we train BERT and RoBERTa to compare their performance.
TextRank with spaCy:

Note that this step can be skipped.
# One frame per genre, selected by genre *name* rather than by numeric id.
# The numeric ids come from the order in which genres first appear in the CSV,
# so hard-coding 0..9 here would silently pair a variable with the wrong genre
# if the file were ever reordered.
fantasy_summaries = data[data["genre"] == "fantasy"]
science_summaries = data[data["genre"] == "science"]
crime_summaries = data[data["genre"] == "crime"]
history_summaries = data[data["genre"] == "history"]
horror_summaries = data[data["genre"] == "horror"]
thriller_summaries = data[data["genre"] == "thriller"]
psychology_summaries = data[data["genre"] == "psychology"]
romance_summaries = data[data["genre"] == "romance"]
sports_summaries = data[data["genre"] == "sports"]
travel_summaries = data[data["genre"] == "travel"]

In [51]:
from sklearn.utils import resample

def _draw_rows(frame, n_rows):
    """Return n_rows rows of frame, sampled without replacement with seed 42."""
    return resample(frame, replace=False, n_samples=n_rows, random_state=42)


# Genres with 500+ summaries contribute 300 rows each.
fantasy_downsample = _draw_rows(fantasy_summaries, 300)
science_downsample = _draw_rows(science_summaries, 300)
crime_downsample = _draw_rows(crime_summaries, 300)
history_downsample = _draw_rows(history_summaries, 300)
horror_downsample = _draw_rows(horror_summaries, 300)
thriller_downsample = _draw_rows(thriller_summaries, 300)

# The four small genres (about 100 summaries each) contribute 80 rows each.
psychology_downsample = _draw_rows(psychology_summaries, 80)
romance_downsample = _draw_rows(romance_summaries, 80)
sports_downsample = _draw_rows(sports_summaries, 80)
travel_downsample = _draw_rows(travel_summaries, 80)

In [82]:
# Stack the per-genre samples into a single training frame.
# Row labels from `data` are kept (no ignore_index).
train = pd.concat(
    [
        fantasy_downsample,
        science_downsample,
        crime_downsample,
        history_downsample,
        horror_downsample,
        thriller_downsample,
        psychology_downsample,
        romance_downsample,
        sports_downsample,
        travel_downsample,
    ]
)

In [83]:
# Per-genre row counts of the training split.
train["genre"].value_counts()

fantasy       300
science       300
crime         300
history       300
horror        300
thriller      300
psychology     80
romance        80
sports         80
travel         80
Name: genre, dtype: int64

In [84]:
# Every row of `data` whose label was not sampled into `train` becomes test data.
in_train = data.index.isin(train.index)
test = data.loc[~in_train]

In [85]:
# Per-genre row counts of the test split.
test["genre"].value_counts()

thriller      723
fantasy       576
science       347
history       300
horror        300
crime         200
romance        31
psychology     20
sports         20
travel         20
Name: genre, dtype: int64

In [86]:
# Prefer CUDA when torch reports a usable GPU; otherwise fall back to the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

if use_gpu:
    # Report how many GPUs are visible and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1070


In [87]:
# Preview the first five rows of the training split.
train.head()

Unnamed: 0,title,genre,summary,genre_id
365,The Forest House,fantasy,"in the early days of the conquest, when the r...",0
4436,Heartless,fantasy,long before she was the terror of wonderland t...,0
4609,A Hat Full of Sky,fantasy,we see you. now we are you. no real witch woul...,0
4473,Going Postal,fantasy,arch swindler moist van lipwig never believed ...,0
213,Tithe : A Modern Faerie Tale,fantasy,tithe follows the story of sixteen year old a...,0


In [88]:
# Both splits get the same treatment: drop the title column and rename the
# remaining columns to label / text / label_id.
renamed_columns = {'genre': 'label', 'summary': 'text', 'genre_id': 'label_id'}

train = train.drop(columns='title').rename(columns=renamed_columns)
test = test.drop(columns='title').rename(columns=renamed_columns)

In [89]:
# Write both splits to CSV files under data/.
import os

# to_csv does not create missing parent folders, so make sure data/ exists first.
os.makedirs('data', exist_ok=True)

# index=False: the row labels are not written to the files.
train.to_csv('data/ds1_train.csv', index=False)
test.to_csv('data/ds1_test.csv', index=False)

In [90]:
# Read the training split back from "data/ds1_train.csv" and preview the first rows.
train_data = pd.read_csv("data/ds1_train.csv")
train_data.head()

Unnamed: 0,label,text,label_id
0,fantasy,"in the early days of the conquest, when the r...",0
1,fantasy,long before she was the terror of wonderland t...,0
2,fantasy,we see you. now we are you. no real witch woul...,0
3,fantasy,arch swindler moist van lipwig never believed ...,0
4,fantasy,tithe follows the story of sixteen year old a...,0
