## YouTube Link to the Video
### https://youtu.be/gtymDEKRr4A

## Connect the Colab File with Google Drive

In [None]:
# Colab-only helper for attaching Google Drive to this runtime.
from google.colab import drive
# Mount Drive at /content/gdrive; the dataset paths used below live under this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
#Import all the required packages
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the ml-25m CSV files into Pandas dataframes.
# All three files are read from the same folder, so the folder is defined once:
# if you saved the dataset somewhere else, change DATA_DIR only.
# The path should start with '/content/gdrive/MyDrive' and lead to the folder
# that contains the CSV files.
DATA_DIR = '/content/gdrive/MyDrive/datasets/ml-25m'

movies = pd.read_csv(f'{DATA_DIR}/movies.csv')
genome_tags = pd.read_csv(f'{DATA_DIR}/genome-tags.csv')
genome_scores = pd.read_csv(f'{DATA_DIR}/genome-scores.csv')

In [None]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [None]:
genome_tags

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s
...,...,...
1123,1124,writing
1124,1125,wuxia
1125,1126,wwii
1126,1127,zombie


In [None]:
genome_scores

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.06250
3,1,4,0.07575
4,1,5,0.14075
...,...,...,...
15584443,206499,1124,0.11000
15584444,206499,1125,0.04850
15584445,206499,1126,0.01325
15584446,206499,1127,0.14025


In [None]:
# Attach the tag name from genome_tags to every row of genome_scores by joining on tagId.
# A left join keeps every score row even if its tagId has no match in genome_tags.
merged_genome = pd.merge(genome_scores, genome_tags, how='left', on='tagId')

In [None]:
merged_genome

Unnamed: 0,movieId,tagId,relevance,tag
0,1,1,0.02875,007
1,1,2,0.02375,007 (series)
2,1,3,0.06250,18th century
3,1,4,0.07575,1920s
4,1,5,0.14075,1930s
...,...,...,...,...
15584443,206499,1124,0.11000,writing
15584444,206499,1125,0.04850,wuxia
15584445,206499,1126,0.01325,wwii
15584446,206499,1127,0.14025,zombie


In [None]:
# Keep only the (movie, tag) rows whose relevance score is above the threshold.
# Note: this is a score cutoff, not a fixed number of tags per movie, so each
# movie can end up with any number of tags (including none).
RELEVANCE_THRESHOLD = 0.5
top_tags = (
    merged_genome[merged_genome['relevance'] > RELEVANCE_THRESHOLD]
    .reset_index(drop=True)  # renumber rows 0..n-1 without mutating in place
)

In [None]:
top_tags

Unnamed: 0,movieId,tagId,relevance,tag
0,1,11,0.58025,3d
1,1,19,0.66250,action
2,1,29,0.89375,adventure
3,1,30,0.67625,affectionate
4,1,61,0.61750,animal movie
...,...,...,...,...
614418,206499,972,0.60600,storytelling
614419,206499,992,0.51225,suprisingly clever
614420,206499,1008,0.52500,talky
614421,206499,1014,0.59775,teen movie


In [None]:
# Collapse the tags of each movie in `top_tags` into a single ', '-separated string per `movieId`
grouped_tags = (
    top_tags
    .groupby('movieId')['tag']
    .agg(', '.join)
    .reset_index()
)

# Left-join the tag strings onto `movies` (movies without any tag get a null `tag`),
# then keep only the columns used in the rest of the notebook
wanted_columns = ['movieId', 'title', 'genres', 'tag']
final_df = movies.merge(grouped_tags, on='movieId', how='left')[wanted_columns]
final_df

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"3d, action, adventure, affectionate, animal mo..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,"action, adaptation, adventure, animals, bad cg..."
2,3,Grumpier Old Men (1995),Comedy|Romance,"comedy, crappy sequel, destiny, good, good seq..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"adaptation, adultery, based on a book, chase, ..."
4,5,Father of the Bride Part II (1995),Comedy,"catastrophe, chase, comedy, cute, cute!, desti..."
...,...,...,...,...
62418,209157,We (2018),Drama,
62419,209159,Window of the Soul (2001),Documentary,
62420,209163,Bad Poems (2018),Comedy|Drama,
62421,209169,A Girl Thing (2001),(no genres listed),


In [None]:
def add_genres_to_tag(row):
    """Return the row's tag string extended with its genres.

    The '|' separators in ``row['genres']`` are replaced with ','. When
    ``row['tag']`` is null the converted genres are returned on their own;
    otherwise they are appended to the existing tag string after a ','.
    """
    existing_tags = row['tag']
    genres_csv = row['genres'].replace("|", ",")
    if pd.isnull(existing_tags):
        return genres_csv
    return existing_tags + "," + genres_csv

# Overwrite the `tag` column with tags + genres, computed row by row.
# Note: this replaces `tag` in place, so running this cell a second time
# appends the genres again.
final_df['tag'] = final_df.apply(add_genres_to_tag, axis=1)

In [None]:
final_df

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"3d, action, adventure, affectionate, animal mo..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,"action, adaptation, adventure, animals, bad cg..."
2,3,Grumpier Old Men (1995),Comedy|Romance,"comedy, crappy sequel, destiny, good, good seq..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"adaptation, adultery, based on a book, chase, ..."
4,5,Father of the Bride Part II (1995),Comedy,"catastrophe, chase, comedy, cute, cute!, desti..."
...,...,...,...,...
62418,209157,We (2018),Drama,Drama
62419,209159,Window of the Soul (2001),Documentary,Documentary
62420,209163,Bad Poems (2018),Comedy|Drama,"Comedy,Drama"
62421,209169,A Girl Thing (2001),(no genres listed),(no genres listed)


In [None]:
# Pull the movie titles and the per-movie tag lists out of final_df as plain Python lists
titles = final_df['title'].tolist()

# Only the whole string is stripped before splitting on ",";
# the individual pieces produced by the split are not trimmed here.
tag_strings = final_df['tag'].str.strip()
tags = tag_strings.str.split(",").tolist()


# Create a bag of words representation of the movie tags
def create_bow(tag_list):
    """Build a binary bag-of-words dict for one movie.

    Args:
        tag_list: A list of tag strings. A float (e.g. NaN standing in for a
            missing tag string) is accepted and treated as "no tags".

    Returns:
        A dict mapping each distinct tag to 1. Tags are stripped of
        surrounding whitespace first, so "action" and " action" count as the
        same tag; tokens that are empty after stripping are ignored.
    """
    bow = {}
    if isinstance(tag_list, float):
        return bow
    for tag in tag_list:
        # Tag strings joined with ", " but split on "," leave a leading space
        # on every token except the first; normalise so they share one key.
        cleaned = tag.strip()
        if cleaned:
            bow[cleaned] = 1
    return bow

In [None]:
# Build one bag-of-words dict per movie, in the same order as `tags`
bags_of_words = list(map(create_bow, tags))

In [None]:
# Turn the bags of words into a movie-by-tag table: one row per title, one column per distinct tag
tag_df = pd.DataFrame(bags_of_words, index=titles)
# A tag missing from a movie's bag shows up as NaN; record it as 0 instead
tag_df = tag_df.fillna(0)

In [None]:
tag_df

Unnamed: 0,3d,action,adventure,affectionate,animal movie,animals,animated,animation,based on book,buddy movie,...,ghosts,great ending,capitalism,cynical,mentor,casino,future,fighting the system,foul language,drugs
Toy Story (1995),1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jumanji (1995),0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Waiting to Exhale (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
We (2018),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Window of the Soul (2001),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bad Poems (2018),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Girl Thing (2001),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Calculate the cosine similarity matrix between the movies.
# The result is stored under its own name: assigning it to `cosine_similarity`
# would overwrite the imported function and make this cell fail when re-run.
similarity_matrix = cosine_similarity(tag_df)

# Create a dataframe with the cosine similarity scores, labelled by movie title on both axes
similarity_df = pd.DataFrame(similarity_matrix, index=tag_df.index, columns=tag_df.index)

In [None]:
# Ask the user for a movie they like.
# Surrounding whitespace (e.g. a trailing tab or space from a paste) is removed
# so that the title can match the dataset's titles exactly.
movie = input('Enter a movie you like: ').strip()

Enter a movie you like: Grumpier Old Men (1995)	


In [None]:
# Check if the movie exists in the dataset
if movie not in similarity_df.index:
    print(f"Movie '{movie}' not found in the dataset. Please check the title and try again.")
else:
    # Similarity of the chosen movie to every movie, looked up by title
    scores = similarity_df.loc[movie]
    if isinstance(scores, pd.DataFrame):
        # The title occurs more than once in the index, so .loc returned
        # one row per occurrence; use the first one.
        scores = scores.iloc[0]

    # Remove the chosen movie itself by label (rather than assuming it sorts
    # first, which ties at the top score would break), then keep the 10 best.
    top_10 = scores.drop(labels=movie).sort_values(ascending=False).head(10)

    # Print the top 10 most similar movies to the movie
    print(f'Top 10 similar movies to {movie}:')
    print(top_10)