In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import spacy
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, AdamW

In [2]:
df=pd.read_csv('processed_data.csv')

#  Encoding the target labels (genres)

In [3]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
df['genre_list_encoded'] = list(mlb.fit_transform(df['genre_list']))


In [4]:
df.head()

Unnamed: 0,original_title,genre_list,overview,genre_list_encoded
0,Toy Story,"['Animation', 'Comedy', 'Family']","Led by Woody, Andy's toys live happily in his ...","[1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
1,Jumanji,"['Adventure', 'Fantasy', 'Family']",When siblings Judy and Peter discover an encha...,"[1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
2,Grumpier Old Men,"['Romance', 'Comedy']",A family wedding reignites the ancient feud be...,"[1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
3,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']","Cheated on, mistreated and stepped on, the wom...","[1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, ..."
4,Father of the Bride Part II,['Comedy'],Just when George Banks has recovered from his ...,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# Extracting named entities from the movie summaries

In [13]:
nlp = spacy.load('en_core_web_sm')

def extract_named_entities(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents]

df['named_entities'] = df['overview'].apply(extract_named_entities)


# Embedding movie summaries using BERT

Now, we'll use BERT to create embeddings for the movie summaries. First, we'll import the necessary libraries and load the BERT model.

In [6]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('./temp-bert')

Some weights of the model checkpoint at ./temp-bert were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
littledf=df.iloc[:3].copy(deep=True)

In [18]:
littledf

Unnamed: 0,original_title,genre_list,overview,genre_list_encoded
0,Toy Story,"['Animation', 'Comedy', 'Family']","Led by Woody, Andy's toys live happily in his ...","[1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
1,Jumanji,"['Adventure', 'Fantasy', 'Family']",When siblings Judy and Peter discover an encha...,"[1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
2,Grumpier Old Men,"['Romance', 'Comedy']",A family wedding reignites the ancient feud be...,"[1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."


### the loop method (don't run it!)

In [19]:
import torch

def get_bert_embedding(text):
    input_ids = tokenizer.encode(text, return_tensors='pt')
    with torch.no_grad():
        outputs = model(input_ids)
    return outputs[1].numpy()

littledf['overview_embedding'] = littledf['overview'].apply(get_bert_embedding)


In [23]:
def get_bert_embeddings(texts, tokenizer, model, batch_size=64, device='cuda'):
    model.to(device)
    model.eval()

    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]

        input_ids = tokenizer.batch_encode_plus(
            batch_texts,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=128
        ).to(device)

        with torch.no_grad():
            outputs = model(**input_ids)
        
        batch_embeddings = outputs[1].cpu().numpy()
        embeddings.extend(batch_embeddings)
    
    return np.array(embeddings)

# Get embeddings for the entire dataset
overview_embeddings = get_bert_embeddings(littledf['overview'].tolist(), tokenizer, model)
littledf['overview_embedding'] = list(overview_embeddings)


AssertionError: Torch not compiled with CUDA enabled

In [21]:
littledf['overview_embedding'].values[0].shape

(1, 768)

# Fine-tuning the BERT model
First, we need to split the data into training and validation sets.

In [None]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)


Next, we'll create the PyTorch Dataset and DataLoader for both the training and validation sets.

In [None]:
from transformers import BertForSequenceClassification, AdamW
from torch.optim import lr_scheduler
import torch.nn as nn
import numpy as np

# Instantiate the fine-tuned BERT model
num_labels = len(mlb.classes_)
model = BertForSequenceClassification.from_pretrained('./temp-bert/pytorch_model.bin', num_labels=num_labels)

# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set up optimizer, criterion, and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
criterion = nn.BCEWithLogitsLoss()
scheduler = lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, targets)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1} Loss: {total_loss / len(train_dataloader)}")


# Making predictions

After training the model, we'll create a function to make predictions on new summaries.

In [None]:
def predict_genre(text, model, tokenizer, threshold=0.5):
    input_ids = tokenizer.encode(text, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(input_ids)
    logits = outputs.logits
    probabilities = torch.sigmoid(logits)
    predictions = (probabilities > threshold).cpu().numpy()
    return mlb.inverse_transform(predictions)


In [None]:
new_summary = "A group of intergalactic criminals is forced to work together to stop a fanatical warrior from taking control of the universe."
predicted_genres = predict_genre(new_summary, model, tokenizer)
print(predicted_genres)


# in a nutshell: how is the data vectorized?

## Vectorization of input features (X)

The input features are the movie overviews. We use the BERT tokenizer to tokenize the movie overviews and then create an embedding for each overview. The BERT tokenizer converts each overview into a sequence of tokens, which are then represented by their corresponding IDs. The resulting vector has a fixed length (in this case, 512) and is padded or truncated as needed.

Here's how the tokenization and embedding process works:

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_overview(overview):
    return tokenizer.encode_plus(
        overview,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

overview = "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene."
tokenized_overview = tokenize_overview(overview)

print(f"Tokenized overview: {tokenized_overview['input_ids']}")
print(f"Attention mask: {tokenized_overview['attention_mask']}")


## Vectorization of target labels (y)
The target labels are the movie genres. Since a movie can have multiple genres, we use one-hot encoding to represent the genre labels. One-hot encoding creates a binary vector for each genre, where the element corresponding to the genre's index is set to 1, and all other elements are set to 0.

Here's how the one-hot encoding process works:

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

# Let's assume we have the following genre list
genre_list = [['Animation', 'Comedy', 'Family'],
              ['Adventure', 'Fantasy', 'Action'],
              ['Science Fiction', 'Action', 'Thriller']]

# Fit the MultiLabelBinarizer to the genre list and transform it
genre_encoded = mlb.fit_transform(genre_list)

print(f"Encoded genre list: {genre_encoded}")
print(f"Genre classes: {mlb.classes_}")


# probabilistics recommender with a bunching parameter

In [None]:
import heapq
from scipy.spatial.distance import cosine

In [None]:
def compute_similarity(overview1, overview2):
    tokenized_overview1 = tokenize_overview(overview1)['input_ids'].to(device)
    tokenized_overview2 = tokenize_overview(overview2)['input_ids'].to(device)
    
    with torch.no_grad():
        embedding1 = model.bert(tokenized_overview1)[1].cpu().numpy()
        embedding2 = model.bert(tokenized_overview2)[1].cpu().numpy()
        
    return 1 - cosine(embedding1, embedding2)


In [None]:
def recommend_movies(movie_names, bunching, df):
    movie_indices = df[df['original_title'].isin(movie_names)].index.tolist()
    
    if len(movie_indices) != len(movie_names):
        print("Some movies were not found in the dataset.")
        return
    
    similarity_scores = []
    
    for index, row in df.iterrows():
        if index not in movie_indices:
            total_similarity = sum([compute_similarity(row['overview'], df.loc[i]['overview']) for i in movie_indices])
            adjusted_similarity = total_similarity / len(movie_indices)
            similarity_scores.append((index, adjusted_similarity))
    
    if bunching > 0:
        # Recommend similar movies
        top_indices = heapq.nlargest(5, similarity_scores, key=lambda x: x[1])
    else:
        # Recommend dissimilar movies
        top_indices = heapq.nsmallest(5, similarity_scores, key=lambda x: x[1])
    
    recommended_movies = df.loc[[index for index, _ in top_indices]]
    
    return recommended_movies


In [None]:
# Example
movie_names = ['Toy Story', 'Jumanji', 'Heat', 'GoldenEye', 'The American President']
bunching = 0.5

recommended_movies = recommend_movies(movie_names, bunching, df)
print(recommended_movies[['original_title', 'overview', 'genre_list']])
