In [1]:
import numpy as np
import pandas as pd
import re
import random
import torch
import spacy
import pytextrank
import re
import random
import string
import networkx as nx
import nltk

from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy import spatial
from gensim.models import Word2Vec
from gensim.summarization import summarize
from transformers import LongformerTokenizer

In [2]:
# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [3]:
# Read data from a CSV file and print its shape and first few rows
data = pd.read_csv("data.csv")

print(data.shape)
data.head()

(4657, 4)


Unnamed: 0,index,title,genre,summary
0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...
1,1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ..."
2,2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...
3,3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...
4,4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...


In [4]:
# Drop the "index" column from the DataFrame
data.drop(columns = ["index"], inplace = True)

In [5]:
# Count the number of unique labels in the 'genre' column
num_labels = len(data.genre.unique())
print("num labels: ", num_labels)
# Create mappings between genre labels and their corresponding IDs
genre2id = {genre: i for i, genre in enumerate(data.genre.unique())}
id2genre = {i: genre for i, genre in enumerate(data.genre.unique())}

num labels:  10


In [6]:
def clean_text(text):

    """
    Clean the input text by removing unwanted characters, links, numbers, etc.

    Args:
        text (str): The text to be cleaned.

    Returns:
        str: The cleaned text.
    """
    text = re.sub('[^a-zA-Z0-9\.\,\?\!]', ' ', str(text).lower()) # remove all except lowercase, uppercase, digits, punctuation
    text = re.sub('\[.*?\]', '', text) # remove any text in square brackets
    text = re.sub('https?://\S+|www\.\S+', '', text) # remove any links present 
    text = re.sub('\n', ' ', text) # remove the next line character
    text = re.sub('\w*\d\w*', '', text) # remove the words contaitning numbers
    text = re.sub('\s+', ' ', text) # remove extra spaces
    
    return text

In [7]:
data["summary"] = data["summary"].apply(clean_text)
data.head()

Unnamed: 0,title,genre,summary
0,Drowned Wednesday,fantasy,drowned wednesday is the first trustee among ...
1,The Lost Hero,fantasy,"as the book opens, jason awakens on a school ..."
2,The Eyes of the Overworld,fantasy,cugel is easily persuaded by the merchant fia...
3,Magic's Promise,fantasy,the book opens with herald mage vanyel return...
4,Taran Wanderer,fantasy,taran and gurgi have returned to caer dallben...


In [8]:
data["genre"].value_counts()

thriller      1023
fantasy        876
science        647
history        600
horror         600
crime          500
romance        111
psychology     100
sports         100
travel         100
Name: genre, dtype: int64

In [9]:
def text_rank_top_four(text):
    # Split the text into sentences
    sentences = sent_tokenize(text)
    
    # If the text has no sentences, return an empty string
    if len(sentences) == 0:
        return ''
    
    # If the text has less than four sentences, return the original text
    if len(sentences) < 4:
        return text
    
    # Use gensim's summarize function, which implements TextRank. We ask for a summary that is 20% of the original length.
    summary = summarize(text, ratio=0.2)
    
    # Split the summary into sentences
    summary_sentences = summary.split('\n')
    
    # If the summary has no sentences, return an empty string
    if len(summary_sentences) == 0:
        return ''
    
    # Otherwise, return the first four sentences of the summary
    return ' '.join(summary_sentences[:4])


i=2
print(data["summary"][i])
print("\n-----------------------after TextRank---------------------\n")
print(text_rank_top_four(data["summary"][i]))

 cugel is easily persuaded by the merchant fianosther to attempt the burglary of the manse of iucounu the laughing magician. trapped and caught, he agrees that in exchange for his freedom he will undertake the recovery of a small hemisphere of violet glass, an eye of the overworld, to match one already in the wizard s possession. a small sentient alien entity of barbs and hooks, named firx, is attached to his liver to encourage his unremitting loyalty, zeal and singleness of purpose, and iucounu uses a spell to transport cugel via flying demon to the remote land of cutz. there, cugel finds two villages, one occupied by wearers of the violet lenses, the other by peasants who work on behalf of the lens wearers, in hopes of being promoted to their ranks. the lenses cause their wearers to see, not their squalid surroundings, but the overworld, a vastly superior version of reality where a hut is a palace, gruel is a magnificent feast, etc. seeing the world through rose colored glasses on a 

In [10]:
#Now we keep the first four sentences for each summary.

try:
    data["summary"] = data["summary"].apply(text_rank_top_four)
except Exception as e:
    print("An error occurred while applying text_rank_top_four:", str(e))


# Now all the data ready, need to train and test.

In [21]:
from sklearn.model_selection import train_test_split

def split_data(df, train_size=0.7, val_size=0.2, test_size=0.1, random_state=seed_val):
    # Split the data into train and remaining data
    train_data, remaining_data = train_test_split(df, train_size=train_size, random_state=random_state)

    # Calculate the remaining size after the train split
    remaining_size = val_size + test_size

    # Split the remaining data into validation and test sets
    val_data, test_data = train_test_split(remaining_data, test_size=remaining_size, random_state=random_state)

    return train_data, val_data, test_data


In [12]:
#We drop the data title, and only train it based on summary:
data = data.drop("title",axis=1)
data.head()

Unnamed: 0,genre,summary
0,fantasy,the book begins when leaf is visiting arthur a...
1,fantasy,"annabeth, seeking percy, was told in a vision ..."
2,fantasy,"trapped and caught, he agrees that in exchange..."
3,fantasy,the book opens with herald mage vanyel returni...
4,fantasy,taran and gurgi have returned to caer dallben ...


In [13]:
#First step is to tokenize.

# Create a tokenizer
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

# Tokenize the text
def tokenize_text(df, max_length=512):
    # Tokenize the 'summary' column of the DataFrame using the tokenizer's batch_encode_plus method
    # df['summary'].tolist() converts the 'summary' column into a list of strings
    # max_length specifies the maximum length of the tokenized sequences
    # padding='max_length' pads the tokenized sequences to have a length of max_length
    # truncation=True truncates the tokenized sequences if they exceed the max_length
    return tokenizer.batch_encode_plus(
        df['summary'].tolist(), 
        max_length=max_length, 
        padding='max_length', 
        truncation=True
    )

tokenized_texts = tokenize_text(data)


In [29]:
#Then prepare the dataset and dataloader
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import torch

#convert genre labels to numerical values. 
le = LabelEncoder()
data['genre'] = le.fit_transform(data['genre'])

# Splitting the dataset into train and remaining (val + test)
train_data, remaining_data = train_test_split(data, test_size=0.3, stratify=data['genre'], random_state=seed_val)

# Splitting the remaining_data into validation and test
val_data, test_data = train_test_split(remaining_data, test_size=0.333, stratify=remaining_data['genre'], random_state=seed_val)


class BookDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create the dataset
dataset = BookDataset(tokenized_texts, data['genre'].values)

# Create the dataloader
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
test

# Initialize the model and train the model

In [30]:
# Initialize the model:

from transformers import LongformerForSequenceClassification

model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096', num_labels=len(data['genre'].unique()))


Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight'

In [31]:
from torch.optim import Adam

# Initialize the optimizer
optimizer = Adam(model.parameters(), lr=1e-5)

# Move the model to the GPU
model.to('cuda')

# Training loop
for epoch in range(5):  # Number of epochs
    for batch in dataloader:
        # Move batch tensors to the same device as the model
        batch = {k: v.to('cuda') for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Optimize the model parameters
        optimizer.step()
        optimizer.zero_grad()


# Test the model

In [36]:
#Prepare the new data: Just as you did with the training data, 
#you need to tokenize and format the new data so it's suitable for the model. 
#If you have a new book summary, you can prepare it like this:


text = "This is a new book summary..."
inputs = tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
inputs = inputs.to('cuda')  # move inputs to GPU


#Make a prediction: Pass the prepared data to the model to get the predicted genre.
outputs = model(**inputs)

#Interpret the prediction: The model will output the logits for each genre. 
#You can convert these logits to probabilities using the softmax function, 
#and then get the genre with the highest probability.

probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
predicted_genre = probabilities.argmax().item()

#Remember that predicted_genre will be an integer representing the predicted genre. 
#If you've used a LabelEncoder to encode your genres, you can get the actual genre label like this:
predicted_genre_label = le.inverse_transform([predicted_genre])



# Save all the pretrained parameters

In [38]:
model.save_pretrained("LongFormer_parameters")

#Load model by using below:
#model = LongformerForSequenceClassification.from_pretrained("LongFormer_parameters")



# Now we could evaluate by using Confusion matrix

In [None]:
#Collect the model's predictions and the true labels: 
from torch.nn.functional import softmax
import numpy as np

all_preds = []
all_true = []

model.eval()
with torch.no_grad():
    for batch in test_dataloader:
        inputs, labels = batch['input_ids'].to('cuda'), batch['labels'].to('cuda')
        outputs = model(inputs)

        probabilities = softmax(outputs.logits, dim=-1)
        predictions = torch.argmax(probabilities, dim=-1)

        all_preds.extend(predictions.cpu().numpy())
        all_true.extend(labels.cpu().numpy())


In [None]:
#Create the confusion matrix:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(all_true, all_preds)

#Plot it
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()


In [14]:
def preprocess_text(text):
    # Initialize the lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    # Initialize list to hold preprocessed sentences
    preprocessed_sentences = []

    # Iterate over each sentence
    for sentence in sentences:
        # Make lower case
        sentence = sentence.lower()
        
        # Tokenize into words
        tokens = word_tokenize(sentence)
        
        # Remove stopwords and lemmatize the words
        stop_words = set(stopwords.words('english'))
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        
        # Append to the list of preprocessed sentences
        preprocessed_sentences.append(tokens)

    return sentences,preprocessed_sentences

In [29]:
def summarize_text(text):
    sentences = preprocess_text(text)[0]
    sentence_tokens = preprocess_text(text)[1]
    
    # Calculating word embedding
    w2v = Word2Vec(sentence_tokens, min_count=1)
    sentence_embeddings = [[w2v.wv[word][0] for word in words] for words in sentence_tokens]

    # Finding the maximum length of a sentence in the text
    max_len = max(len(tokens) for tokens in sentence_tokens)

    # Padding sentence embeddings with zeros to match the maximum sentence length
    sentence_embeddings = [np.pad(embedding, (0, max_len - len(embedding)), 'constant') for embedding in sentence_embeddings]

    # Calculating the similarity matrix
    similarity_matrix = np.zeros([len(sentence_tokens), len(sentence_tokens)])
    for i, row_embedding in enumerate(sentence_embeddings):
        for j, column_embedding in enumerate(sentence_embeddings):
            similarity_matrix[i][j] = 1 - spatial.distance.cosine(row_embedding, column_embedding)

    # Implementing PageRank
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph,max_iter=5000)

    # Sorting sentences
    top_sentences_with_scores = sorted([(sentence, scores[index]) for index, sentence in enumerate(sentences)], key=lambda x: x[1], reverse=True)

    # Get top 4 sentences (maintaining the original order)
    top_sentences = [sentence for sentence, score in top_sentences_with_scores[:4]]
    top_sentences.sort(key = lambda x: sentences.index(x))

    # Joining sentences
    summary = ' '.join(top_sentences)

    return top_sentences

        
# Test the function
summarize_text(data['summary'][0])

NameError: name 'preprocess_text' is not defined

In [124]:
data = data_cp

In [125]:
len(sent_tokenize(data['summary'][5]))

1

In [117]:
for i in range(len(data)):
    try:
        # Apply the summarize_text function and store the result in the new column
        data.loc[i, 'summary'] = summarize_text(data.loc[i, 'summary'])
    except Exception as e:
        # If an error occurs, print the error message and skip this row
        print(f"Error in row {i}: {e}")


Error in row 262: you must first build vocabulary before training the model


In [None]:
def preprocess_sentences(sentences):
    # Lowercase and remove punctuation
    sentences_clean = [re.sub(r'[^\w\s]', '', sentence.lower()) for sentence in sentences]

    # Load list of stopwords
    stop_words = stopwords.words('english')

    # Tokenize sentences and remove stopwords
    sentence_tokens = [[word for word in sentence.split(' ') if word not in stop_words] for sentence in sentences_clean]

    return sentence_tokens
data['summary'] = data['summary'].apply(preprocess_sentences)
print(data['summary'][0])

#Tokenizing every paragraph into a list of sentences
data['summary'] = data['summary'].apply(sent_tokenize)


In [13]:
data['summary'] = data['summary'].apply(gensim_summarizer)

' drowned wednesday is the first trustee among the morrow days who is on arthur s side and wishes the will to be fulfilled. she appears as a leviathan whale and suffers from gluttony. the book begins when leaf is visiting arthur and they are discussing the invitation that drowned wednesday sent him. arthur had been admitted to hospital because of the damage done to his leg when he attempted to enter tuesday s treasure tower. suddenly, the hospital room becomes flooded with water as the two are transported to the border sea of the house. leaf is snatched away by a large ship with green sails, known as the flying mantis, while arthur remains in his bed. when the medallion given him by the immortal called the mariner apparently fails to summon help, arthur is without hope. eventually, a buoy marking the pirate elishar feverfew s treasure floats toward him. as soon as arthur opens it, his hand is marked with a bloody red colour. arthur now has the red hand, by which feverfew marks whoever 

Below are all previous code.

In [None]:
text = data['summary'][0]
# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("en_core_web_sm")
# add PyTextRank to the spaCy pipeline
nlp.add_pipe("textrank")

doc = nlp(text)
# examine the top-ranked phrases in the document
for phrase in doc._.phrases:
    print(phrase.text)
    print(phrase.rank, phrase.count)
    print(phrase.chunks)

In [24]:
fantasy_summaries = data[data["genre"] == id2genre[0]]
science_summaries = data[data["genre"] == id2genre[1]]
crime_summaries = data[data["genre"] == id2genre[2]]
history_summaries = data[data["genre"] == id2genre[3]]
horror_summaries = data[data["genre"] == id2genre[4]]
thriller_summaries = data[data["genre"] == id2genre[5]]
psychology_summaries = data[data["genre"] == id2genre[6]]
romance_summaries = data[data["genre"] == id2genre[7]]
sports_summaries = data[data["genre"] == id2genre[8]]
travel_summaries = data[data["genre"] == id2genre[9]]

In [32]:
# Downsample the fantasy_summaries dataset
fantasy_downsample = resample(fantasy_summaries,
                              replace=False,
                              n_samples=300,
                              random_state=42)

science_downsample = resample(science_summaries,
                              replace=False,
                              n_samples=300,
                              random_state=42)

crime_downsample = resample(crime_summaries,
                              replace=False,
                              n_samples=300,
                              random_state=42)

history_downsample = resample(history_summaries,
                              replace=False,
                              n_samples=300,
                              random_state=42)

horror_downsample = resample(horror_summaries,
                              replace=False,
                              n_samples=300,
                              random_state=42)

thriller_downsample = resample(thriller_summaries,
                              replace=False,
                              n_samples=300,
                              random_state=42)

psychology_downsample = resample(psychology_summaries,
                              replace=False,
                              n_samples=80,
                              random_state=42)

romance_downsample = resample(romance_summaries,
                              replace=False,
                              n_samples=80,
                              random_state=42)

sports_downsample = resample(sports_summaries,
                              replace=False,
                              n_samples=80,
                              random_state=42)

travel_downsample = resample(travel_summaries,
                              replace=False,
                              n_samples=80,
                              random_state=42)

In [26]:
## Concatenate it VERTICALLY
train = pd.concat([fantasy_downsample, science_downsample, crime_downsample, history_downsample, horror_downsample, thriller_downsample, psychology_downsample, romance_downsample, sports_downsample, travel_downsample])

In [27]:
train["genre"].value_counts()

genre
fantasy       300
science       300
crime         300
history       300
horror        300
thriller      300
psychology     80
romance        80
sports         80
travel         80
Name: count, dtype: int64

In [28]:
test = data.loc[~data.index.isin(train.index)]

In [29]:
test["genre"].value_counts()

genre
thriller      723
fantasy       576
science       347
history       300
horror        300
crime         200
romance        31
psychology     20
sports         20
travel         20
Name: count, dtype: int64

In [3]:
# If there's a GPU available...
import torch
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1070
