In [3]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import seaborn as sns

In [4]:
movie_df = pd.read_csv('./data/movies.csv')
movie_df.keys()

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [5]:
movie_df.describe(
)

Unnamed: 0,id,popularity,vote_average,vote_count
count,10000.0,10000.0,10000.0,10000.0
mean,161243.505,34.697267,6.62115,1547.3094
std,211422.046043,211.684175,0.766231,2648.295789
min,5.0,0.6,4.6,200.0
25%,10127.75,9.15475,6.1,315.0
50%,30002.5,13.6375,6.6,583.5
75%,310133.5,25.65125,7.2,1460.0
max,934761.0,10436.917,8.7,31917.0


In [6]:
movie_df.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [7]:
movie_df.isnull().sum()

id                    0
title                 0
genre                 3
original_language     0
overview             13
popularity            0
release_date          0
vote_average          0
vote_count            0
dtype: int64

In [8]:
movie_df.dropna(inplace=True)
movie_df = movie_df[ ['id', 'title', 'genre', 'overview' ] ]#= movie_df['id'], movie_df['title'], movie_df['genres'], movie_df['overview'], movie_df['popularity']

In [9]:
movie_df.head()

Unnamed: 0,id,title,genre,overview
0,278,The Shawshank Redemption,"Drama,Crime",Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance","Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Drama,Crime","Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,"Drama,History,War",The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,"Drama,Crime",In the continuing saga of the Corleone crime f...


In [10]:
movie_df['tags'] = movie_df['overview']  + movie_df['genre']
movie_df.drop(columns=['overview', 'genre'], inplace=True)
movie_df

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...
...,...,...,...
9995,10196,The Last Airbender,"The story follows the adventures of Aang, a yo..."
9996,331446,Sharknado 3: Oh Hell No!,The sharks take bite out of the East Coast whe...
9997,13995,Captain America,"During World War II, a brave, patriotic Americ..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,A man named Farmer sets out to rescue his kidn...


In [11]:
cv = CountVectorizer(max_features=10000, stop_words='english') # max_features=10000, stop_words='english' , converts text into matrix of token counts

In [12]:
vector = cv.fit_transform(movie_df['tags'].values.astype('U')).toarray() # converts text into matrix of token counts
vector.shape

(9985, 10000)

In [13]:
similarity = cosine_similarity(vector)
similarity.shape

(9985, 9985)

In [14]:
similarity[2].argsort()[:-11:-1] # top 10 similar movies
for i in range(0, 5):
    print(movie_df['title'][i])
    print(list(movie_df['title'][similarity[i].argsort()[-2:-12:-1]]))
    print()

The Shawshank Redemption
['Brubaker', 'Maggie Simpson in The Longest Daycare', 'A Prophet', 'Nurse 3-D', 'Cool Hand Luke', 'Chapter 27', 'Empire Records', 'Synecdoche, New York', 'Bad Trip', 'Dark Passage']

Dilwale Dulhania Le Jayenge
['To All the Boys: P.S. I Still Love You', 'Missing in Action', 'The Cameraman', 'Life', 'The Graduate', 'The Broken Circle Breakdown', 'Lifeforce', 'The Awful Truth', 'My Girl 2', "It's All About Karma"]

The Godfather
['The Godfather: Part II', 'Crimson Rivers II: Angels of the Apocalypse', 'Joker', 'Bomb City', 'Nurse 3-D', 'Felon', 'Rope', 'Burn After Reading', 'Youth in Revolt', 'The Big Heat']

Schindler's List
['Resistance', 'The Counterfeiters', 'A Hijacking', 'Storks', 'Deerskin', 'A Hidden Life', 'Below Her Mouth', 'A Man Escaped', 'Black Book', 'Courage Under Fire']

The Godfather: Part II
['The Godfather', 'The Godfather: Part III', 'Nurse 3-D', 'D-Railed', 'Criminal', 'Barton Fink', 'Fracchia The Human Beast', 'The Goldfinch', 'Timbuktu', 'L

In [15]:
distance_godfather =  sorted(list(enumerate(similarity[2])), key=lambda x: x[1], reverse=True)[1:11] # top 10 similar movies
distance_godfather

[(4, 0.4763305116224667),
 (7416, 0.35634832254989923),
 (153, 0.33946736991660215),
 (2624, 0.32732683535398854),
 (9512, 0.31497039417435607),
 (2412, 0.3118047822311618),
 (330, 0.30860669992418377),
 (5008, 0.30304576336566325),
 (779, 0.29957234475763905),
 (7046, 0.29957234475763905)]

In [16]:
# # use clip to encode text embeddings
# from transformers import CLIPProcessor, CLIPModel

# # Load the CLIP model and processor
# model_name = "openai/clip-vit-base-patch32"
# processor = CLIPProcessor.from_pretrained(model_name)
# model = CLIPModel.from_pretrained(model_name)

# # Prepare inputs
# image_path = "path_to_your_image.jpg"
# text = "a photo of a cat"
# inputs = processor(text, images=image_path, return_tensors="pt", padding=True)

# # Get embeddings
# outputs = model(**inputs)
# text_embedding = outputs.text_embeds  # Embedding for the text
# image_embedding = outputs.image_embeds  # Embedding for the image

# # Print embeddings
# print("Text Embedding:", text_embedding)
# print("Image Embedding:", image_embedding)


In [17]:
def recommend(movies):
    index = movie_df[movie_df['title'] == movies].index[0]
    distance_godfather =  sorted(list(enumerate(similarity[index])), key=lambda x: x[1], reverse=True)[1:11] 
    for i in distance_godfather:
        print(movie_df.iloc[i[0]].title)


In [18]:
recommend('The Godfather')

The Godfather: Part II
Blood Ties
Joker
Bomb City
Gotti
Felon
Rope
Batman: The Killing Joke
The Big Heat
The Outsider


In [19]:
pickle.dump(movie_df, open('movie_df.pkl', 'wb'))
pickle.dump(movie_df, open ('similarity.pkl', 'wb'))

In [20]:
pickle.load(open('movie_df.pkl', 'rb'))

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...
...,...,...,...
9995,10196,The Last Airbender,"The story follows the adventures of Aang, a yo..."
9996,331446,Sharknado 3: Oh Hell No!,The sharks take bite out of the East Coast whe...
9997,13995,Captain America,"During World War II, a brave, patriotic Americ..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,A man named Farmer sets out to rescue his kidn...
