In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns

In [2]:
def get_title_from_index(index):
    """Return the title of the first row of ``df`` whose index label equals ``index``.

    Reads the module-level DataFrame ``df``. Raises ``IndexError`` when no
    row carries that label.
    """
    row_mask = df.index == index
    matching_titles = df.loc[row_mask, "title"]
    return matching_titles.values[0]

def get_index_from_title(title):
    """Return the row index label of the first movie in ``df`` titled ``title``.

    The label is taken from ``df.index`` rather than from a CSV column named
    "index", so the lookup agrees with ``get_title_from_index`` (which also
    matches on ``df.index``) and does not require the file to carry a
    redundant "index" column.

    NOTE(review): callers use the result to pick a row of the similarity
    matrix, which is positional; this presumes ``df`` keeps its default
    0..n-1 index -- confirm if the frame is ever filtered or re-sorted.

    Parameters
    ----------
    title : str
        Exact title to look up (case-sensitive equality match).

    Raises
    ------
    IndexError
        If no row has that title (same exception type as before, with a
        readable message).
    """
    title_mask = (df["title"] == title).to_numpy()
    matching_labels = df.index[title_mask]
    if len(matching_labels) == 0:
        raise IndexError(f"no movie titled {title!r} found in df")
    return matching_labels[0]

In [3]:
# Load the movie metadata; the path is relative to the notebook's working directory.
df = pd.read_csv('movie_dataset.csv')

In [14]:
# Text columns that are combined into the per-movie feature string.
features = ['keywords', 'cast', 'genres', 'director']
for feature in features:
    # Assign the filled column back instead of calling
    # df[feature].fillna('', inplace=True): that form is chained assignment
    # through an inplace method, which warns on recent pandas 2.x and leaves
    # df unchanged when Copy-on-Write is enabled.
    df[feature] = df[feature].fillna('')

In [15]:
def combine_features(row):
    """Join a movie's keywords, cast, genres and director into one string.

    The four fields are concatenated in that order, separated by single
    spaces. Missing values (NaN / None) are treated as empty strings and
    non-string values are converted with ``str`` so the result is always a
    string; previously any failure was swallowed by a bare ``except`` and the
    function silently returned ``None``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row as ``pd.Series``)
        Must provide the keys 'keywords', 'cast', 'genres' and 'director'.

    Returns
    -------
    str

    Raises
    ------
    KeyError
        If one of the four fields is absent from ``row``.
    """
    parts = []
    for column in ('keywords', 'cast', 'genres', 'director'):
        value = row[column]
        # Keep the separator layout stable even when a field is missing.
        parts.append('' if pd.isna(value) else str(value))
    return " ".join(parts)

# Apply combine_features to every row (axis=1) and store the result as a new column.
df["combined_features"] = df.apply(combine_features, axis=1)
# Bare expression so the notebook displays the new column.
df["combined_features"]

0       culture clash future space war space colony so...
1       ocean drug abuse exotic island east india trad...
2       spy based on novel secret agent sequel mi6 Dan...
3       dc comics crime fighter terrorist secret ident...
4       based on novel mars medallion space travel pri...
                              ...                        
4798    united states\u2013mexico barrier legs arms pa...
4799     Edward Burns Kerry Bish\u00e9 Marsha Dietlein...
4800    date love at first sight narration investigati...
4801     Daniel Henney Eliza Coupe Bill Paxton Alan Ru...
4802    obsession camcorder crush dream girl Drew Barr...
Name: combined_features, Length: 4803, dtype: object

In [16]:
# Bag-of-words step: learn the vocabulary from the combined text and build
# the token-count matrix, one row per entry of df['combined_features'].
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['combined_features'])


In [18]:
# Called with a single argument, cosine_similarity compares every row of
# count_matrix with every other row, so entry [i, j] is the similarity
# between row i and row j.
cosine_sim = cosine_similarity(count_matrix)
# Bare expression so the notebook displays the matrix.
cosine_sim

array([[1.        , 0.10540926, 0.12038585, ..., 0.        , 0.        ,
        0.        ],
       [0.10540926, 1.        , 0.0761387 , ..., 0.03651484, 0.        ,
        0.        ],
       [0.12038585, 0.0761387 , 1.        , ..., 0.        , 0.11145564,
        0.        ],
       ...,
       [0.        , 0.03651484, 0.        , ..., 1.        , 0.        ,
        0.04264014],
       [0.        , 0.        , 0.11145564, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.04264014, 0.        ,
        1.        ]])

In [50]:
# Title the recommendations are based on; it is looked up by exact match
# against df.title in get_index_from_title.
movie_users_like = "Iron Man"

In [51]:
# Resolve the chosen title to the index used to select its row of cosine_sim.
movie_index = get_index_from_title(movie_users_like)

In [52]:
# Pair every position in the chosen movie's similarity row with its score,
# giving a list of (position, similarity) tuples.
similar_movies = [
    (position, score)
    for position, score in enumerate(cosine_sim[movie_index])
]
similar_movies

[(0, 0.15396007178390023),
 (1, 0.10954451150103323),
 (2, 0.08340576562282992),
 (3, 0.03922322702763681),
 (4, 0.16000000000000003),
 (5, 0.08000000000000002),
 (6, 0.04588314677411235),
 (7, 0.40166320883712187),
 (8, 0.04264014327112209),
 (9, 0.16329931618554525),
 (10, 0.20000000000000004),
 (11, 0.08000000000000002),
 (12, 0.08340576562282992),
 (13, 0.08000000000000002),
 (14, 0.2267786838055363),
 (15, 0.03922322702763681),
 (16, 0.3951316644589048),
 (17, 0.08528028654224418),
 (18, 0.10954451150103323),
 (19, 0.12792042981336627),
 (20, 0.16329931618554525),
 (21, 0.08164965809277262),
 (22, 0.04588314677411235),
 (23, 0.044721359549995794),
 (24, 0.07071067811865475),
 (25, 0.0),
 (26, 0.37796447300922725),
 (27, 0.16329931618554525),
 (28, 0.20412414523193156),
 (29, 0.08528028654224418),
 (30, 0.15689290811054724),
 (31, 0.4800000000000002),
 (32, 0.03779644730092272),
 (33, 0.264575131106459),
 (34, 0.0),
 (35, 0.17457431218879393),
 (36, 0.16329931618554525),
 (37, 0.04

In [53]:
# Rank the (position, similarity) pairs from most to least similar.
sorted_similar_movies = sorted(
    similar_movies,
    key=lambda pair: pair[1],
    reverse=True,
)

In [54]:
# Bare expression so the notebook displays the ranked (position, similarity) pairs.
sorted_similar_movies

[(68, 1.0000000000000007),
 (79, 0.6047431568147635),
 (31, 0.4800000000000002),
 (7, 0.40166320883712187),
 (16, 0.3951316644589048),
 (26, 0.37796447300922725),
 (85, 0.3411211461689767),
 (182, 0.30792014356780045),
 (511, 0.30792014356780045),
 (203, 0.30237157840738177),
 (4401, 0.30000000000000004),
 (101, 0.2984810028978546),
 (174, 0.27852424952911653),
 (33, 0.264575131106459),
 (46, 0.2599734734478726),
 (242, 0.25584085962673253),
 (122, 0.25021729686848976),
 (94, 0.24000000000000005),
 (232, 0.24000000000000005),
 (39, 0.23533936216582085),
 (64, 0.23533936216582085),
 (169, 0.23533936216582085),
 (607, 0.23533936216582085),
 (661, 0.23533936216582085),
 (3623, 0.22941573387056174),
 (14, 0.2267786838055363),
 (126, 0.2267786838055363),
 (788, 0.2267786838055363),
 (870, 0.2267786838055363),
 (91, 0.22360679774997896),
 (1740, 0.22283440581246222),
 (2442, 0.21908902300206642),
 (166, 0.21821789023599242),
 (507, 0.21821789023599242),
 (205, 0.21320071635561044),
 (2390, 0

In [55]:
# Print the titles of the first 52 ranked entries (fewer if the list is shorter).
for ranked_index, _score in sorted_similar_movies[:52]:
    print(get_title_from_index(ranked_index))

Iron Man
Iron Man 2
Iron Man 3
Avengers: Age of Ultron
The Avengers
Captain America: Civil War
Captain America: The Winter Soldier
Ant-Man
X-Men
X2
The Helix... Loaded
X-Men: First Class
The Incredible Hulk
X-Men: The Last Stand
X-Men: Days of Future Past
Fantastic Four
X-Men Origins: Wolverine
Guardians of the Galaxy
The Wolverine
TRON: Legacy
X-Men: Apocalypse
Captain America: The First Avenger
Sky Captain and the World of Tomorrow
Zathura: A Space Adventure
Made
Man of Steel
Thor: The Dark World
Deadpool
Superman II
Independence Day: Resurgence
Kick-Ass 2
Southland Tales
G.I. Joe: Retaliation
Independence Day
Sherlock Holmes: A Game of Shadows
Red Sonja
Stargate: The Ark of Truth
The Lost World: Jurassic Park
Jurassic World
Sherlock Holmes
Blade: Trinity
Dragonball Evolution
A Scanner Darkly
Superman Returns
Spy Kids 3-D: Game Over
The Black Hole
The Amazing Spider-Man 2
Green Lantern
The Core
The Shadow
RoboCop 3
Beastmaster 2: Through the Portal of Time
