In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on Drive can be accessed through the regular filesystem.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os

# Project folder on the mounted Drive; relative file paths used later in the
# notebook are resolved against it once it becomes the working directory.
ROOT_PATH = "/content/drive/MyDrive/ADA/"

root_entries = os.listdir(ROOT_PATH)
print(root_entries)  # what the project folder contains

os.chdir(ROOT_PATH)  # make the project folder the working directory

cwd_entries = os.listdir(".")
print(cwd_entries)  # same listing, now seen from the working directory

['book_film_summaries.csv', '.ipynb_checkpoints', 'bookfilm_summaries_with_similarity.csv']
['book_film_summaries.csv', '.ipynb_checkpoints', 'bookfilm_summaries_with_similarity.csv']


In [4]:
!pip install langchain-core langchain-community -U sentence_transformers



In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sentence_transformers import SentenceTransformer
from langchain_core.embeddings import Embeddings
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import torch

In [6]:
class CustomEmbeddingsChunking(Embeddings):
    """Embeddings wrapper that copes with long texts by word-level chunking.

    Wraps a model exposing ``encode(...)`` and ``max_seq_length`` behind the
    ``Embeddings`` interface and adds helpers to split a long text into
    overlapping chunks and to average the chunk embeddings into one vector.
    """

    def __init__(self, model):
        """Store the model and remember its maximum sequence length.

        Args:
            model: Object providing ``encode`` and ``max_seq_length``.
        """
        self.model = model
        # NOTE(review): max_seq_length is presumably a *token* limit, whereas
        # chunk_text counts whitespace-separated words. A chunk of this many
        # words may tokenize to more than the model accepts -- confirm that
        # losing the tail of such chunks is acceptable.
        self.max_length = model.max_seq_length

    def chunk_text(self, text, max_tokens, overlap=50):
        """Split ``text`` into overlapping windows of whitespace-separated words.

        Args:
            text: The text to split.
            max_tokens: Maximum number of words per chunk (must be >= 1).
            overlap: Number of words shared between consecutive chunks.

        Returns:
            A list of chunk strings; empty when ``text`` contains no words.

        Raises:
            ValueError: If ``max_tokens`` is smaller than 1.
        """
        if max_tokens < 1:
            raise ValueError("max_tokens must be at least 1")

        words = text.split()
        # Each window starts `max_tokens - overlap` words after the previous
        # one. With overlap >= max_tokens that stride is zero or negative,
        # which would never reach the end of the text, so always advance by
        # at least one word.
        step = max(1, max_tokens - overlap)

        chunks = []
        for start in range(0, len(words), step):
            end = start + max_tokens
            chunks.append(' '.join(words[start:end]))
            if end >= len(words):
                break  # this window already reaches the last word
        return chunks

    def embed_documents(self, texts, batch_size=32):
        """Embed a list of texts and return one embedding per text.

        All texts are handed to the model in a single ``encode`` call so that
        ``batch_size`` actually controls batching (encoding the texts one at a
        time would make every batch a single item).

        Args:
            texts: Iterable of strings to embed.
            batch_size: Batch size forwarded to ``model.encode``.

        Returns:
            A list with one embedding per input text, in input order; an empty
            list when ``texts`` is empty.
        """
        texts = list(texts)
        if not texts:
            return []
        encoded = self.model.encode(texts, batch_size=batch_size, show_progress_bar=False)
        # Callers expect a list of per-text embeddings rather than one 2-D array.
        return list(encoded)

    def embed_query(self, text):
        """Embed a single query string and return its embedding."""
        return self.model.encode([text])[0]

    def embed_text_pair(self, text_1, text_2):
        """Embed two (possibly long) texts and return their embeddings.

        Each text is chunked with ``chunk_text`` using ``self.max_length`` as
        the window size, every chunk is embedded, and the chunk embeddings are
        averaged element-wise.

        Args:
            text_1: First text.
            text_2: Second text.

        Returns:
            A tuple ``(embedding_1, embedding_2)`` of averaged embeddings.
        """
        chunks_1 = self.chunk_text(text_1, self.max_length)
        chunks_2 = self.chunk_text(text_2, self.max_length)

        return (
            np.mean(self.embed_documents(chunks_1), axis=0),
            np.mean(self.embed_documents(chunks_2), axis=0),
        )

## Load Model

In [7]:
from transformers import pipeline

# Run on the GPU when the runtime has one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Sentence-embedding model used for the book/film similarity scores.
model = SentenceTransformer('all-mpnet-base-v2').to(device)

# Sentiment classifier applied to the summaries, placed on the same device.
sentiment_analysis = pipeline(
    "sentiment-analysis",
    model="siebert/sentiment-roberta-large-english",
    device=device,
)



## Load Data

In [8]:
# Read the paired book/film summaries and discard the 'Unnamed: 0' column
# (presumably an index that was written out with the CSV).
summaries_df = pd.read_csv('./book_film_summaries.csv').drop(columns='Unnamed: 0')

summaries_df.head()

Unnamed: 0,movie_id,book_title,movie_name,book_summary,film_summary
0,196176,The Siege of Trencher's Farm,Straw Dogs,"George Magruder, an American professor of Eng...","David Sumner , a timid American mathematician,..."
1,6703617,Journey to the West,The Forbidden Kingdom,The novel has 100 chapters. These can be divi...,"In this film, which is based loosely on the an..."
2,24416479,The Beast Master,Beastmaster 2: Through the Portal of Time,"It tells the story of Hosteen Storm, an ex-so...","Dar, the Beastmaster , is back and now he has ..."
3,9384481,The Blessing,Count Your Blessings,It is set in the post-war World War II period...,While visiting Grace Allingham in wartime Lond...
4,22224559,The Last Song,The Last Song,Veronica “Ronnie” Miller’s life was turned up...,"At seventeen, Veronica ""Ronnie"" Miller remain..."


## Run NLP Analysis

In [9]:
def cosine_similarity_from_text_pair(text_1, text_2):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are embedded through the module-level ``embeddings`` object;
    the two resulting vectors are compared with ``cosine_similarity``.
    """
    vector_1, vector_2 = embeddings.embed_text_pair(text_1, text_2)
    # cosine_similarity works on 2-D inputs, so present each vector as one row.
    row_1 = vector_1.reshape(1, -1)
    row_2 = vector_2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_1, row_2)
    return similarity_matrix[0][0]

def sentiment_of_text(text):
    """Score the overall sentiment of a (possibly long) text.

    The text is split into 300-word chunks with the module-level
    ``embeddings.chunk_text`` helper, each chunk is classified with the
    module-level ``sentiment_analysis`` pipeline, and the signed scores of the
    chunks are averaged.

    Args:
        text: The text to analyse.

    Returns:
        A ``(label, score)`` tuple. ``score`` is the mean of the per-chunk
        signed scores (``+score`` for a ``POSITIVE`` chunk, ``-score``
        otherwise) and ``label`` is ``"positive"``, ``"negative"`` or
        ``"neutral"`` according to the sign of ``score``. A text that yields
        no chunks returns ``("neutral", 0.0)`` instead of dividing by zero.
    """
    # Chunk the text if necessary
    chunks = embeddings.chunk_text(text=text, max_tokens=300)
    if not chunks:
        # Nothing to classify; avoid a ZeroDivisionError in the average below.
        return "neutral", 0.0

    # Same tokenizer settings for every chunk: pad, and cut anything that
    # still exceeds 512 tokens after word-level chunking.
    tokenizer_kwargs = {'padding': True, 'truncation': True, 'max_length': 512}

    chunk_scores = []
    for chunk in chunks:
        result = sentiment_analysis(chunk, **tokenizer_kwargs)[0]  # Result for this chunk
        score = result['score'] if result['label'] == 'POSITIVE' else -result['score']
        chunk_scores.append(score)

    # Average sentiment score across chunks
    overall_score = sum(chunk_scores) / len(chunk_scores)
    if overall_score > 0:
        overall_sentiment = "positive"
    elif overall_score < 0:
        overall_sentiment = "negative"
    else:
        overall_sentiment = "neutral"

    return overall_sentiment, overall_score

In [10]:
# Shared embedder built around the loaded model; cosine_similarity_from_text_pair
# and sentiment_of_text look it up through the global name `embeddings`.
embeddings = CustomEmbeddingsChunking(model)

In [11]:
# Register `progress_apply` on pandas objects so the row-wise jobs show a bar.
tqdm.pandas()


def _film_book_similarity(row):
    """Cosine similarity between one row's film summary and book summary."""
    return cosine_similarity_from_text_pair(row['film_summary'], row['book_summary'])


summaries_df['similarity'] = summaries_df.progress_apply(_film_book_similarity, axis=1)
summaries_df.head()

100%|██████████| 840/840 [02:20<00:00,  5.99it/s]


Unnamed: 0,movie_id,book_title,movie_name,book_summary,film_summary,similarity
0,196176,The Siege of Trencher's Farm,Straw Dogs,"George Magruder, an American professor of Eng...","David Sumner , a timid American mathematician,...",0.451876
1,6703617,Journey to the West,The Forbidden Kingdom,The novel has 100 chapters. These can be divi...,"In this film, which is based loosely on the an...",0.559595
2,24416479,The Beast Master,Beastmaster 2: Through the Portal of Time,"It tells the story of Hosteen Storm, an ex-so...","Dar, the Beastmaster , is back and now he has ...",0.344548
3,9384481,The Blessing,Count Your Blessings,It is set in the post-war World War II period...,While visiting Grace Allingham in wartime Lond...,0.669707
4,22224559,The Last Song,The Last Song,Veronica “Ronnie” Miller’s life was turned up...,"At seventeen, Veronica ""Ronnie"" Miller remain...",0.852735


In [12]:
# Classify every film summary; each row yields a (label, score) pair.
film_sentiment_pairs = summaries_df.progress_apply(
    lambda row: sentiment_of_text(row['film_summary']), axis=1
)

# Transpose the sequence of pairs into one tuple of labels and one of scores.
film_labels, film_scores = zip(*film_sentiment_pairs)
summaries_df['film_sentiment'] = film_labels
summaries_df['film_sentiment_score'] = film_scores

display(summaries_df.head())

# Checkpoint the intermediate result before the book summaries are processed.
summaries_df.to_csv('bookfilm_summaries_with_similarity_and_film_sentiment.csv', index=False)

  0%|          | 3/840 [00:00<02:54,  4.81it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 840/840 [02:57<00:00,  4.73it/s]


Unnamed: 0,movie_id,book_title,movie_name,book_summary,film_summary,similarity,film_sentiment,film_sentiment_score
0,196176,The Siege of Trencher's Farm,Straw Dogs,"George Magruder, an American professor of Eng...","David Sumner , a timid American mathematician,...",0.451876,positive,0.333608
1,6703617,Journey to the West,The Forbidden Kingdom,The novel has 100 chapters. These can be divi...,"In this film, which is based loosely on the an...",0.559595,positive,0.995482
2,24416479,The Beast Master,Beastmaster 2: Through the Portal of Time,"It tells the story of Hosteen Storm, an ex-so...","Dar, the Beastmaster , is back and now he has ...",0.344548,positive,0.997562
3,9384481,The Blessing,Count Your Blessings,It is set in the post-war World War II period...,While visiting Grace Allingham in wartime Lond...,0.669707,positive,0.99607
4,22224559,The Last Song,The Last Song,Veronica “Ronnie” Miller’s life was turned up...,"At seventeen, Veronica ""Ronnie"" Miller remain...",0.852735,positive,0.978184


In [13]:
# Classify every book summary; each row yields a (label, score) pair.
book_sentiment_pairs = summaries_df.progress_apply(
    lambda row: sentiment_of_text(row['book_summary']), axis=1
)

# Transpose the sequence of pairs into one tuple of labels and one of scores.
book_labels, book_scores = zip(*book_sentiment_pairs)
summaries_df['book_sentiment'] = book_labels
summaries_df['book_sentiment_score'] = book_scores

summaries_df.head()

100%|██████████| 840/840 [04:08<00:00,  3.38it/s]


Unnamed: 0,movie_id,book_title,movie_name,book_summary,film_summary,similarity,film_sentiment,film_sentiment_score,book_sentiment,book_sentiment_score
0,196176,The Siege of Trencher's Farm,Straw Dogs,"George Magruder, an American professor of Eng...","David Sumner , a timid American mathematician,...",0.451876,positive,0.333608,positive,0.99448
1,6703617,Journey to the West,The Forbidden Kingdom,The novel has 100 chapters. These can be divi...,"In this film, which is based loosely on the an...",0.559595,positive,0.995482,positive,0.995404
2,24416479,The Beast Master,Beastmaster 2: Through the Portal of Time,"It tells the story of Hosteen Storm, an ex-so...","Dar, the Beastmaster , is back and now he has ...",0.344548,positive,0.997562,positive,0.998812
3,9384481,The Blessing,Count Your Blessings,It is set in the post-war World War II period...,While visiting Grace Allingham in wartime Lond...,0.669707,positive,0.99607,positive,0.996607
4,22224559,The Last Song,The Last Song,Veronica “Ronnie” Miller’s life was turned up...,"At seventeen, Veronica ""Ronnie"" Miller remain...",0.852735,positive,0.978184,positive,0.998164


In [14]:
# Write the current state of the table to CSV in the working directory,
# omitting the DataFrame index.
summaries_df.to_csv('bookfilm_summaries_with_similarity_and_sentiment.csv', index=False)

In [15]:
# Number of whitespace-separated words in each film summary
film_lengths = [len(summary.split()) for summary in summaries_df['film_summary']]

# Number of whitespace-separated words in each book summary
book_lengths = [len(summary.split()) for summary in summaries_df['book_summary']]

# Longest film summary and longest book summary, in words
print(max(film_lengths))
print(max(book_lengths))

2484
4719


In [18]:
# Preview the first 50 rows of the table.
summaries_df.head(50)

Unnamed: 0,movie_id,book_title,movie_name,book_summary,film_summary,similarity,film_sentiment,film_sentiment_score,book_sentiment,book_sentiment_score
0,196176,The Siege of Trencher's Farm,Straw Dogs,"George Magruder, an American professor of Eng...","David Sumner , a timid American mathematician,...",0.451876,positive,0.333608,positive,0.99448
1,6703617,Journey to the West,The Forbidden Kingdom,The novel has 100 chapters. These can be divi...,"In this film, which is based loosely on the an...",0.559595,positive,0.995482,positive,0.995404
2,24416479,The Beast Master,Beastmaster 2: Through the Portal of Time,"It tells the story of Hosteen Storm, an ex-so...","Dar, the Beastmaster , is back and now he has ...",0.344548,positive,0.997562,positive,0.998812
3,9384481,The Blessing,Count Your Blessings,It is set in the post-war World War II period...,While visiting Grace Allingham in wartime Lond...,0.669707,positive,0.99607,positive,0.996607
4,22224559,The Last Song,The Last Song,Veronica “Ronnie” Miller’s life was turned up...,"At seventeen, Veronica ""Ronnie"" Miller remain...",0.852735,positive,0.978184,positive,0.998164
5,1231856,For Love of the Game,For Love of the Game,"On the second to last day of the season, Chap...",The Detroit Tigers travel to New York to play ...,0.710734,positive,0.995503,positive,0.997806
6,1273569,The Scarlet Pimpernel,The Elusive Pimpernel,"The Scarlet Pimpernel is set in 1792, during ...","During the French Revolution, the Scarlet Pimp...",0.854241,positive,0.995429,positive,0.994325
7,23470411,Nevada,Nevada,"A young boy, Meade Slaughter, works along wit...",A feared gunfighter named Nevada breaks his f...,0.586853,positive,0.993382,positive,0.336075
8,23470411,Nevada,Nevada,"Ben Ide, restless with the rancher life, move...",A feared gunfighter named Nevada breaks his f...,0.746282,positive,0.993382,negative,-0.998015
9,9088886,The Queen of the Damned,Queen of the Damned,Part One follows several different people ove...,Vampire Lestat is awakened from decades of slu...,0.689192,positive,0.994485,positive,0.994574


In [16]:
# from transformers import pipeline
# sentiment_analysis = pipeline("sentiment-analysis",model="siebert/sentiment-roberta-large-english")
# print(sentiment_analysis("I love this!"))

In [17]:
# inputs = sentiment_tokenizer("I love this!", return_tensors="pt", truncation=True, padding=True).to(device)

# with torch.no_grad():
#     outputs = sentiment_model(**inputs)
#     logits = outputs.logits

# # Step 3: Convert logits to probabilities
# probs = torch.softmax(logits, dim=-1)
# print(probs)