# Load the data and pre-trained BERT tokenizer
 This loads the movie data from a CSV file and pre-trained BERT tokenizer from the bert-base-uncased model.

In [2]:
import pandas as pd
from transformers import BertTokenizer
from sklearn.preprocessing import MultiLabelBinarizer

df = pd.read_csv('../../data/processed/processed_data.csv')

In [3]:
mlb = MultiLabelBinarizer()
genre_binary = mlb.fit_transform(df['genre_list'])
print('Shape of genre_binary array:', genre_binary.shape)

Shape of genre_binary array: (44372, 43)


In [4]:
type_columns = [f'TYPE_{type_name}' for type_name in mlb.classes_]

In [5]:
# Create a new DataFrame by concatenating the original grouped data with the binary matrix as columns
final_data = pd.concat([df, pd.DataFrame(genre_binary, columns=type_columns)], axis=1)

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the saved fine-tuned BERT model
This loads the saved fine-tuned BERT model from file.

In [9]:
from transformers import BertForSequenceClassification

model_path = 'models/fine_tuned_model.pth'
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=len(mlb.classes_))

OSError: models/fine_tuned_model.pth is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.

# 3. Load the spaCy model for keyword extraction
This loads the spaCy model for English language text.

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')

# 4. Define the pipeline steps
This defines the pipeline function, which takes an overview string as input and returns a list of predicted genres.

In [None]:
def pipeline(overview: str) -> list:
    # Extract keywords
    doc = nlp(overview)
    keywords = extract_keywords(doc, num_keywords=5)

    # Classify the movie
    X = [overview]
    X_tokenized = tokenizer.batch_encode_plus(X, max_length=512, padding=True, truncation=True, return_tensors='pt')
    outputs = model(X_tokenized['input_ids'], attention_mask=X_tokenized['attention_mask'])
    predicted_genres = mlb.inverse_transform((outputs.logits > 0).detach().cpu().numpy())[0]
    
    return predicted_genres


# 5. Extract keywords from the overview
This extracts the top 5 keywords from the movie overview using spaCy.

In [None]:
doc = nlp(overview)
keywords = extract_keywords(doc, num_keywords=5)

## 6. Classify the movie based on the overview

In [None]:
X = [overview]
X_tokenized = tokenizer.batch_encode_plus(X, max_length=512, padding=True, truncation=True, return_tensors='pt')
outputs = model(X_tokenized['input_ids'], attention_mask=X_tokenized['attention_mask'])
predicted_genres = mlb.inverse_transform((outputs.logits > 0).detach().cpu().numpy())[0]

## 7. Run the pipeline on a movie overview

In [None]:
overview = 'A young boy in a small town discovers a mysterious egg that hatches into a friendly dragon. But when a mean-spirited hunter sets out to capture the dragon, the boy and his new friend must evade the hunter and protect the dragon.'
predicted_genres = pipeline(overview)
print('Predicted genres:', predicted_genres)