In [10]:
import pandas as pd
import os
import nltk
from pathlib import Path
from transformers import BertTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [11]:
# Load the per-movie review corpora into a DataFrame
corpora_csv = 'movie_corpora.csv'
df = pd.read_csv(corpora_csv)

In [12]:
# Preview the first three rows to sanity-check the loaded columns
df.iloc[:3]

Unnamed: 0,Movie Name,Corpus
0,10 Cloverfield Lane 2016,movie full suspense makes guess real happens w...
1,10 Things I Hate About You 1999,first day new school cameron falls bianca stra...
2,12 Angry Men 1957,excellent courtroom drama unique twist instead...


In [13]:
# Fetch the tokenizer that belongs to the uncased BERT-base checkpoint
tokenizer_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

In [14]:
# Tokenize the 'Corpus' column into BERT token IDs.
def encode_corpus(text):
    """Encode one corpus string as BERT token IDs.

    Special tokens ([CLS] / [SEP]) are added and the result is truncated
    to at most 512 IDs.
    """
    return tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=512)


# Empty cells in the CSV are loaded by pandas as NaN (a float), which the
# tokenizer cannot encode; coerce them to empty strings so one missing
# review does not abort the whole column.
df['Tokenized Corpus'] = df['Corpus'].fillna('').astype(str).apply(encode_corpus)

In [15]:
# Persist the DataFrame, now carrying the 'Tokenized Corpus' column, to a new CSV file.
# The row index is left out of the file.
# NOTE: each token-ID list is written as its string form, so it has to be
# parsed back into a list if this file is reloaded later.
tokenized_csv = 'tokenized_movie_reviews.csv'
df.to_csv(tokenized_csv, index=False)

In [16]:
# Step 2: Loading Pretrained Model
from transformers import BertModel
import torch

In [17]:
# Instantiate BERT-base (uncased) from its pre-trained checkpoint
model_checkpoint = 'bert-base-uncased'
model = BertModel.from_pretrained(model_checkpoint)

In [18]:
# Step 3: Embedding Extraction

# Convert token IDs to one rectangular tensor.
# tokenizer.encode(..., truncation=True) caps each row at 512 tokens but does
# not pad, so rows can differ in length and torch.tensor() on the ragged list
# would raise. Right-pad every row with the tokenizer's [PAD] id instead.
token_rows = [
    torch.tensor(ids, dtype=torch.long)
    for ids in df['Tokenized Corpus'].tolist()
]
input_ids = torch.nn.utils.rnn.pad_sequence(
    token_rows,
    batch_first=True,
    padding_value=tokenizer.pad_token_id,
)

# 1 for real tokens, 0 for padding, so self-attention ignores the padded
# positions and they cannot influence the embeddings.
attention_mask = (input_ids != tokenizer.pad_token_id).long()

# Forward pass through the model to get embeddings (inference only, so no gradients)
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

In [20]:
# Take the last layer's hidden state at position 0 of every sequence -- the
# [CLS] token, since special tokens were added during encoding -- and
# convert it to a NumPy array with one row per movie.
cls_hidden = outputs.last_hidden_state[:, 0, :]
embeddings = cls_hidden.numpy()

In [21]:
# Step 4: Similarity Computation
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
# Pairwise cosine similarity: entry (i, j) compares the embedding in row i
# with the embedding in row j
similarity_matrix = cosine_similarity(X=embeddings)

In [23]:
# Step 5: Clustering or Visualization
from sklearn.cluster import KMeans

In [24]:
# Cluster the movies into three groups based on their embeddings.
# n_init is set explicitly: leaving it unset triggers scikit-learn's
# FutureWarning about the default changing from 10 to 'auto'. Pinning it to
# 10 keeps the current behaviour and makes results stable across versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
clusters = kmeans.fit_predict(embeddings)

  super()._check_params_vs_input(X, default_n_init=10)


In [25]:
# Step 6: Interpretation and Evaluation
# Inspect the cluster assignments to judge whether the grouped movies are actually similar

# Step 7: Application
# The similarity analysis can feed downstream uses (e.g., movie recommendations)

# Attach each movie's cluster label to its row in the DataFrame
df['Cluster'] = clusters

# Write the labelled DataFrame to disk, leaving out the row index
clustered_csv = 'movies_with_clusters.csv'
df.to_csv(clustered_csv, index=False)