# Create Embeddings for movies in Movielens dataset
- Using texts from Title, Genres and Tags
- Embeddings are created with the `all-MiniLM-L12-v2` Sentence Transformer model

In [1]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Record the interpreter version alongside the results for reproducibility.
from platform import python_version

interpreter_version = python_version()
print('Python: ' + interpreter_version)

Python: 3.11.12


In [3]:
# Which MovieLens variant to load; used as the sub-folder name when the CSV paths are built.
movielens_dataset = "full"  # alternative: "small"

## Step 1: Load all necessary movielens datasets

In [4]:
print("Loading datasets...")
# Both CSVs live in the folder of the selected MovieLens variant.
data_dir = f'../../movielens_data/{movielens_dataset}'
try:
    movies_df = pd.read_csv(f'{data_dir}/movies.csv')
    tags_df = pd.read_csv(f'{data_dir}/tags.csv')
except FileNotFoundError as e:
    # Point at the folder that was actually searched, then re-raise so that
    # "Run All" stops here with a traceback. Calling exit() inside a notebook
    # shuts the kernel down instead of reporting the error.
    print(f"Error: {e}. Please ensure movies.csv and tags.csv exist under '{data_dir}'.")
    raise
print("movies.csv and tags.csv loaded successfully.")

Loading datasets...
movies.csv and tags.csv loaded successfully.


In [5]:
# Preview the first five rows of the movies table.
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Preview the first five rows of the tags table.
tags_df.head(n=5)

Unnamed: 0,userId,movieId,tag,timestamp
0,22,26479,Kevin Kline,1583038886
1,22,79592,misogyny,1581476297
2,22,247150,acrophobia,1622483469
3,34,2174,music,1249808064
4,34,2174,weird,1249808102


## Step 2: Aggregate the tags for each movie

In [7]:
print("Aggregating tags...")
# Only the movie id and the tag text are needed from here on.
tags_df = tags_df[['movieId', 'tag']]


def _join_text_tags(tag_values):
    """Join the string entries of ``tag_values`` with ', ', skipping non-strings (e.g. NaN)."""
    return ', '.join(value for value in tag_values if isinstance(value, str))


print("Aggregating tags with data cleaning...")
# One array of distinct tags per movie, then one comma-separated string per movie.
unique_tags_per_movie = tags_df.groupby('movieId')['tag'].unique()
aggregated_tags = unique_tags_per_movie.apply(_join_text_tags)

Aggregating tags...
Aggregating tags with data cleaning...


In [8]:
# Name the values 'tags' and move the movieId index into a regular column,
# giving a two-column frame that can be merged on 'movieId'.
aggregated_tags_df = aggregated_tags.rename('tags').reset_index()
print("Tag aggregation complete.")
print("Example of aggregated tags:")
aggregated_tags_df.head(n=5)

Tag aggregation complete.
Example of aggregated tags:


Unnamed: 0,movieId,tags
0,1,"children, Disney, animation, pixar, funny, Pix..."
1,2,"Robin Williams, fantasy, time travel, animals,..."
2,3,"comedinha de velhinhos engraÃƒÂ§ada, comedinha..."
3,4,"characters, slurs, based on novel or book, chi..."
4,5,"Fantasy, pregnancy, remake, family, Steve Mart..."


## Step 3: Merge tags with the main movies DataFrame

In [9]:
print("\nMerging tags into movies DataFrame...")
# Left merge keeps every movie; movies without any tag get NaN in 'tags',
# which is then replaced by an empty string.
movies_v2_df = (
    movies_df
    .merge(aggregated_tags_df, on='movieId', how='left')
    .assign(tags=lambda merged: merged['tags'].fillna(''))
)
print("Merge complete.")
print("DataFrame head with new 'tags' column:")
print(movies_v2_df.head())


Merging tags into movies DataFrame...
Merge complete.
DataFrame head with new 'tags' column:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  \
0  Adventure|Animation|Children|Comedy|Fantasy   
1                   Adventure|Children|Fantasy   
2                               Comedy|Romance   
3                         Comedy|Drama|Romance   
4                                       Comedy   

                                                tags  
0  children, Disney, animation, pixar, funny, Pix...  
1  Robin Williams, fantasy, time travel, animals,...  
2  comedinha de velhinhos engraÃƒÂ§ada, comedinha...  
3  characters, slurs, based on novel or book, chi...  
4  Fantasy, pregnancy, remake

## Step 3b: Build the enriched text for embedding

In [10]:
def create_enhanced_text(row):
    """
    Build the single text string that is embedded for one movie.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            'title', 'genres' and 'tags'. 'genres' is expected to be a
            '|'-separated string.

    Returns:
        A string of the form "Title: <title>. Genres: <genres>. Tags: <tags>",
        with the '|' separators in genres replaced by ', '.
    """
    def _as_text(value):
        # Strings pass through untouched. None and float NaN (presumably how
        # pandas represents a missing cell; confirm against the data) become
        # '' instead of crashing on .replace or rendering as "nan".
        if isinstance(value, str):
            return value
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    title = _as_text(row['title'])
    genres = _as_text(row['genres']).replace('|', ', ')
    tags = _as_text(row['tags'])
    # Combine all three pieces of information
    return f"Title: {title}. Genres: {genres}. Tags: {tags}"

# axis=1 hands each row (not each column) to create_enhanced_text; the
# returned strings are stored in a new 'text_for_embedding' column.
movies_v2_df['text_for_embedding'] = movies_v2_df.apply(create_enhanced_text, axis=1)

In [11]:
print("\nCreated enhanced text for embedding.")
print("Examples of complete texts:")

# movieId of each spot-checked film, in the order they are printed.
example_movie_ids = [
    1208,    # Apocalypse Now (1979)
    79132,   # Inception (2010)
    164179,  # Arrival (2016)
    1300,    # My Life as a Dog (Mitt liv som hund) (1985)
    79592,   # Other Guys, The (2010)
]

for example_id in example_movie_ids:
    id_mask = movies_v2_df['movieId'] == example_id
    # .iloc[0] takes the first matching row; it raises if the id is absent.
    example_text = movies_v2_df.loc[id_mask, 'text_for_embedding'].iloc[0]
    print("\n" + example_text)



Created enhanced text for embedding.
Examples of complete texts:

Title: Apocalypse Now (1979). Genres: Action, Drama, War. Tags: dark, descent into madness, Disturbing , hallucinatory, insanity, psychological, Visually Striking, too long, 100 Greatest Movies, Marlon Brando, Oscar (Best Cinematography), Oscar Nominee: Best Picture, New Hollywood, Tumey's DVDs, book was better, classic, cult of masculinity, Long, Nudity (Topless), Vietnam war, disturbing, surreal, Vietnam, adventure, based on a book, Dark, military, violent, boring, sound editing, war, Francis Ford Coppola, Vietnam War, chaos, anti-war, Martin Sheen, Robert Duvall, Joseph Conrad, drama, guerra, gore, movie to see, masculinity, mission, ominous, sweeping, tense, visceral, atmospheric, sound effects, soundtrack, Rain, Dennis Hopper, acting, adapted from:book, based on novel, Best War Films, downward spiral, epic, Francis Ford Copolla, horrors of war, moral transformation, psychology, war drama, Albert Hall, army, Author:

## Step 4: Load the pre-trained Sentence Transformer model
'all-MiniLM-L12-v2' is a great general-purpose model. It's fast and provides high-quality embeddings (384 dimensions).

In [12]:
print("\nLoading the Sentence Transformer model ('all-MiniLM-L12-v2')...")
# On first use the weights are fetched automatically.
model_name = 'all-MiniLM-L12-v2'
model = SentenceTransformer(model_name)
print("Model loaded successfully.")


Loading the Sentence Transformer model ('all-MiniLM-L12-v2')...
Model loaded successfully.


## Step 5: Generate the embeddings

In [13]:
print("\nGenerating embeddings for all movies. This might take a moment...")
# The encoder is fed a plain Python list of strings, one per movie row.
texts_to_encode = movies_v2_df['text_for_embedding'].to_list()

# One vector per input text, returned in input order.
embeddings = model.encode(texts_to_encode, show_progress_bar=True)

print("Embeddings generated successfully.")
# Report (number of texts, embedding dimension).
embedding_shape = embeddings.shape
print(f"Shape of the embeddings array: {embedding_shape}")


Generating embeddings for all movies. This might take a moment...


Batches:   0%|          | 0/2738 [00:00<?, ?it/s]

Embeddings generated successfully.
Shape of the embeddings array: (87585, 384)


## Step 6: Add the embeddings to the DataFrame

In [14]:
# Store one vector per DataFrame row. Iterating the embeddings array yields
# its rows, so each cell of the new column holds one row vector.
movies_v2_df['movie_embedding'] = [row_vector for row_vector in embeddings]

print("\nFinal DataFrame with movie embeddings:")
# Only the head is shown; pandas abbreviates each vector in the printout.
preview_columns = ['movieId', 'title', 'movie_embedding']
print(movies_v2_df[preview_columns].head())


Final DataFrame with movie embeddings:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                     movie_embedding  
0  [-0.059929866, -0.050398782, 0.1151624, -0.023...  
1  [-0.025187103, 0.038099773, 0.05451808, -0.057...  
2  [-0.073277004, -0.04749465, -0.051540796, -0.0...  
3  [-0.10815668, -0.067874506, -0.033551343, -0.0...  
4  [-0.035251174, -0.011975601, -0.030163156, -0....  


In [15]:
#
# Sanity check: verify the embedding dimension and the row alignment
#
first_embedding = movies_v2_df['movie_embedding'].iloc[0]
print(f"\nSanity check: The dimension of a single movie embedding is: {len(first_embedding)}")

# Fail loudly instead of silently skipping the success message.
assert len(first_embedding) == embeddings.shape[1], (
    f"Embedding dimension {len(first_embedding)} does not match "
    f"model output dimension {embeddings.shape[1]}"
)
# The similarity lookup indexes `embeddings` by DataFrame row position,
# so there must be exactly one embedding per DataFrame row.
assert len(movies_v2_df) == embeddings.shape[0], (
    f"{len(movies_v2_df)} movies but {embeddings.shape[0]} embeddings"
)
print("Dimension matches the model output. Ready for similarities")


Sanity check: The dimension of a single movie embedding is: 384
Dimension matches the model output. Ready for similarities


## Step 7: Recommend movies similar to a specific movie

In [16]:
def find_similar_movies(movie_title, top_n=5):
    """
    Finds and prints the top N most similar movies for a given movie title.

    Similarity is the cosine similarity between the query movie's row in the
    module-level ``embeddings`` array and every row of that array. Row ``i``
    of ``embeddings`` is assumed to belong to row ``i`` (by position) of
    ``movies_v2_df``.

    Args:
        movie_title: Exact value to look up in ``movies_v2_df['title']``.
            If several rows share the title, the first one is used.
        top_n: Number of similar movies to print. Values below 0 are
            treated as 0.

    Returns:
        None. Results, or a "not found" message, are printed.
    """
    print("-" * 50)
    print(f"Finding top {top_n} similar movies for: '{movie_title}'")

    # Step 1: Locate the movie by row *position*. Index labels only coincide
    # with positions while the DataFrame keeps its default 0..n-1 index, and
    # `embeddings` can only be addressed by position.
    title_matches = (movies_v2_df['title'] == movie_title).to_numpy()

    if not title_matches.any():
        print(f"Movie '{movie_title}' not found in the dataset.")
        return

    movie_pos = int(title_matches.argmax())  # position of the first match
    movie_vector = embeddings[movie_pos].reshape(1, -1)

    # Step 2: Calculate cosine similarity between this movie and all others
    similarity = cosine_similarity(movie_vector, embeddings)[0]

    # Step 3: Rank positions by descending score. A stable sort on the negated
    # scores keeps tied movies in their original row order.
    ranked_positions = (-similarity).argsort(kind='stable')

    # Step 4: Remove the query movie explicitly. It is not guaranteed to rank
    # first: another movie with an identical text ties with it.
    ranked_positions = ranked_positions[ranked_positions != movie_pos]
    top_positions = ranked_positions[:max(top_n, 0)]

    # Step 5: Display the results
    for pos in top_positions:
        similar_movie_title = movies_v2_df['title'].iloc[pos]
        print(f"  - {similar_movie_title} (Similarity: {similarity[pos]:.4f})")
    print("-" * 50)


# Titles to spot-check. Each is compared for exact equality against the
# 'title' column, so the spelling (including the year) must match.
movies_to_analyze = [
    'Apocalypse Now (1979)',
    'Arrival (2016)',
    'Inception (2010)',
    'My Life as a Dog (Mitt liv som hund) (1985)',
    'Other Guys, The (2010)',
]

# Print the recommendations for each title in turn.
for title in movies_to_analyze:
    find_similar_movies(title)


--------------------------------------------------
Finding top 5 similar movies for: 'Apocalypse Now (1979)'
  - [REC] 4: Apocalypse (2014) (Similarity: 0.6855)
  - The Apocalypse (1997) (Similarity: 0.6747)
  - X-Men: Apocalypse (2016) (Similarity: 0.6577)
  - LA Apocalypse (2014) (Similarity: 0.6536)
  - Music and Apocalypse (2019) (Similarity: 0.6533)
--------------------------------------------------
--------------------------------------------------
Finding top 5 similar movies for: 'Arrival (2016)'
  - Arrival, The (1996) (Similarity: 0.7140)
  - Approaching the Unknown (2016) (Similarity: 0.6965)
  - The Arrival (2017) (Similarity: 0.6893)
  - Alien Expedition (2018) (Similarity: 0.6681)
  - Conspiracy Encounters (2016) (Similarity: 0.6620)
--------------------------------------------------
--------------------------------------------------
Finding top 5 similar movies for: 'Inception (2010)'
  - The Edge of Dreaming (2010) (Similarity: 0.6904)
  - 2012: An Awakening (2009) (Sim

## Conclusion
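The results were a mixed bag: the recommendations often follow literal words in the title rather than a film's themes. For 'My Life as a Dog (Mitt liv som hund) (1985)', a poignant coming-of-age drama, the model recommended movies about dogs. It completely missed the metaphorical "vibe" and instead latched onto the literal keyword "Dog". The same pattern appears for 'Apocalypse Now (1979)', whose top matches are all other titles containing "Apocalypse".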