#### Importing libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#### Helper methods

In [2]:
def get_title_from_index(index):
    """Look up a movie title by its row label in the global ``df``.

    Args:
        index: Value compared against ``df.index`` with ``==``.

    Returns:
        The ``"title"`` value of the first matching row.

    Raises:
        IndexError: If no row of ``df`` has that index label.
    """
    matching_titles = df.loc[df.index == index, "title"]
    # Take the first hit; indexing an empty result raises IndexError.
    return matching_titles.values[0]

def get_index_from_title(title):
    """Look up the ``"index"`` column value for a movie title in the global ``df``.

    The match is an exact ``==`` comparison against the ``"title"`` column.

    Args:
        title: Title string to search for.

    Returns:
        The ``"index"`` value of the first row whose title equals ``title``.

    Raises:
        IndexError: If no row of ``df`` has that title.
    """
    title_matches = df["title"] == title
    # Reads the column literally named "index" (not the DataFrame's own index).
    return df.loc[title_matches, "index"].values[0]

def combine_features(row):
    """Join the selected text fields of one movie row into a single string.

    The fields are concatenated in the order keywords, cast, genres, director,
    separated by single spaces. Missing values (``None`` / NaN) are treated as
    empty strings and any other non-string value is converted with ``str``, so
    the function no longer depends on the caller having filled nulls first.
    For rows whose fields are all strings the result is identical to plain
    ``+`` concatenation with ``" "`` separators.

    Args:
        row: Mapping-like object (e.g. a DataFrame row or dict) supporting
            ``row[column_name]`` for the four field names.

    Returns:
        str: The space-separated combination of the four fields.

    Raises:
        KeyError: If ``row`` lacks one of the four fields.
    """
    # Keep this tuple in sync with the notebook's `features` list.
    columns = ("keywords", "cast", "genres", "director")
    return " ".join(
        "" if pd.isna(row[column]) else str(row[column])
        for column in columns
    )

#### Step 1: Read CSV File

In [3]:
# Load the movie metadata into the DataFrame used by the rest of the notebook.
# The path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="movie_dataset.csv")

#### Step 2: Select Features

In [4]:
# Text columns whose contents are merged to describe each movie.
features = [
    "keywords",
    "cast",
    "genres",
    "director",
]

#### Step 3: Create a column in DF which combines all selected features

In [5]:
# Replace nulls in every selected column with "" so string concatenation
# in combine_features cannot fail on missing values.
for column_name in features:
    df[column_name] = df[column_name].fillna(value="")

# Build one combined text string per row (row-wise apply).
df["combined_features"] = df.apply(combine_features, axis="columns")

#### Step 4: Create count matrix from this new combined column

In [6]:
# Fit a default-configured CountVectorizer on the combined text and
# transform that same text into the count matrix in one step.
cv = CountVectorizer()
count_matrix = cv.fit_transform(raw_documents=df["combined_features"])

#### Step 5: Compute the Cosine Similarity based on the count_matrix

In [7]:
# Title of the movie to base the recommendations on.
movie_user_likes = "Avatar"

# Pairwise cosine similarity between the rows of count_matrix.
cosine_sim = cosine_similarity(count_matrix)

#### Step 6: Get index of this movie from its title

In [8]:
# Resolve the liked title to the value used to select its similarity row.
movie_index = get_index_from_title(movie_user_likes)

# Pair every similarity score in that row with its position: (position, score).
similar_movies = [
    (position, score)
    for position, score in enumerate(cosine_sim[movie_index])
]

#### Step 7: Get a list of similar movies in descending order of similarity score

In [9]:
# Copy the (position, score) pairs, then order the copy by score, highest
# first. The original similar_movies list is left untouched.
sorted_similar_movies = list(similar_movies)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)

#### Step 8: Print titles of the 5 most similar movies


In [10]:
# How many of the highest-scoring titles to print.
TOP_N = 5

# sorted_similar_movies holds (position, score) pairs in descending score
# order, so a slice of the first TOP_N entries replaces the manual counter
# and break. The recorded output shows the liked movie itself as the first
# entry.
for movie_position, _score in sorted_similar_movies[:TOP_N]:
    print(get_title_from_index(movie_position))

Avatar
Guardians of the Galaxy
Aliens
Star Wars: Clone Wars: Volume 1
Star Trek Into Darkness
