## **Content-Based Recommendation System on a Movie Dataset**

# **Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Trying out CountVectorizer and cosine_similarity

In [None]:
# Toy corpus: two short "documents" made of the same two words.
text = [
    "mumbai pune mumbai",
    "pune pune mumbai",
]

# Turn each document into a vector of word counts.
cv = CountVectorizer()
count_matrix = cv.fit_transform(text)

# Pairwise cosine similarity between the count vectors.
similarity_scores = cosine_similarity(count_matrix)
print(similarity_scores)

# Reading the printed matrix:
#   row 0, col 0: text[0] vs text[0] -> 1   (a text is identical to itself)
#   row 0, col 1: text[0] vs text[1] -> 0.8 (cosine of the angle between the two vectors)
#   row 1, col 0: text[1] vs text[0] -> 0.8 (the measure is symmetric)
#   row 1, col 1: text[1] vs text[1] -> 1

[[1.  0.8]
 [0.8 1. ]]


# **Required functions**

In [None]:
def get_index_from_title(title):
    """Return the "index" column value of the movie whose title is ``title``.

    The lookup is an exact match against the "title" column of the
    module-level DataFrame ``df``. If several rows share the title, the
    first matching row is used.

    Args:
        title: Movie title to look up.

    Returns:
        The value stored in the "index" column of the first matching row.

    Raises:
        IndexError: If no row in ``df`` has that title.
    """
    matches = df.loc[df["title"] == title, "index"]
    if matches.empty:
        # Give a readable message instead of the bare out-of-bounds error
        # that ``.values[0]`` raises on an empty selection.
        raise IndexError(f"No movie titled {title!r} found in the dataset")
    return matches.values[0]

def get_title_from_index(index):
    """Return the title of the row of ``df`` whose row label equals ``index``.

    ``df.index`` here is the DataFrame's own row index, not the column
    named "index". An IndexError is raised when no row label matches.
    """
    row_mask = df.index == index
    matching_titles = df.loc[row_mask, "title"]
    return matching_titles.values[0]

def get_releasedate_from_index(index):
    """Return the "release_date" of the row of ``df`` whose row label equals ``index``.

    ``df.index`` here is the DataFrame's own row index, not the column
    named "index". An IndexError is raised when no row label matches.
    """
    row_mask = df.index == index
    matching_dates = df.loc[row_mask, "release_date"]
    return matching_dates.values[0]

# **Reading the Dataset**

In [None]:
# Load the movie metadata CSV from GitHub into the module-level DataFrame
# that the helper functions read from.
DATASET_URL = 'https://raw.githubusercontent.com/chinmayee521/Data/main/movie_dataset.csv'
df = pd.read_csv(DATASET_URL)
print(df.columns)

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')


# **Selecting Features**

In [None]:
# Text columns that are merged into one string per movie.
features = [
    "keywords",
    "cast",
    "genres",
    "director",
]

In [None]:
# Build one text column per movie that concatenates all selected features.

# Some feature cells are NaN (e.g. in "keywords"); replace them with empty
# strings so every feature can be joined as text.
for feature in features:
    df[feature] = df[feature].fillna('')


def combinefeatures(row):
    """Join the selected feature columns of one row into a single string.

    Args:
        row: One row of ``df`` (a Series), as passed by ``df.apply(..., axis=1)``.

    Returns:
        The values of the columns listed in ``features``, in that order,
        separated by single spaces.

    Raises:
        TypeError: If any selected value is not a string. This fails loudly
            instead of storing None in the combined column.
    """
    return " ".join(row[feature] for feature in features)


# Apply the function to every row; axis=1 passes each row individually.
df["combined_features"] = df.apply(combinefeatures, axis=1)

print(df['combined_features'])

0       culture clash future space war space colony so...
1       ocean drug abuse exotic island east india trad...
2       spy based on novel secret agent sequel mi6 Dan...
3       dc comics crime fighter terrorist secret ident...
4       based on novel mars medallion space travel pri...
                              ...                        
4798    united states\u2013mexico barrier legs arms pa...
4799     Edward Burns Kerry Bish\u00e9 Marsha Dietlein...
4800    date love at first sight narration investigati...
4801     Daniel Henney Eliza Coupe Bill Paxton Alan Ru...
4802    obsession camcorder crush dream girl Drew Barr...
Name: combined_features, Length: 4803, dtype: object


# **Create count matrix for new combined_features column**

In [None]:
# Fit a fresh vectorizer on the combined text of every movie. This rebinds
# `cv` and `count_matrix`, which the toy example earlier in the notebook also used.
cv = CountVectorizer()
count_matrix = cv.fit_transform(df["combined_features"])

# **Computing Cosine Similarity based on the count_matrix**

In [None]:
# Pairwise cosine similarity between all movies: row i holds movie i's score
# against every movie, including itself.
cos_similarity = cosine_similarity(count_matrix) 

In [None]:
# Title of the movie to find recommendations for. It is matched exactly
# against the "title" column by get_index_from_title.
user_input_movie = "Interstellar"

# **Get index of user_input_movie from its title**

In [None]:
 # Resolve the chosen title to the index used to select its row of similarity scores.
 # NOTE(review): this cell starts with a stray space; IPython appears to tolerate it,
 # but it would be an IndentationError in a plain Python script - confirm and dedent.
 movie_index = get_index_from_title(user_input_movie)

In [None]:
# Explanatory note written as a bare string, so Jupyter echoes its repr as the cell output.
# NOTE(review): the step 3 example is missing a comma between (2,0.8) and (1,0.5).
'''
return indices of all similar movies of movie_index of user_input_movie in desc order
step 1- get the full row based on movie_index [1,0.5,0.8]
step 2- enumerate the list- keep track of indices of all the elements [(0,1),(1,0.5),(2,0.8)] list of tuples
step 3- sort the list of tuples based on similarity scores [(0,1),(2,0.8)(1,0.5)] for mv0 most similar mv is mv2,next similar is mv1
mv  0   1   2
  ____________
0|  1 0.5  0.8    mv 0 -> index returned - 0,2,1
1|
2|
'''

'\nreturn indices of all similar movies of movie_index of user_input_movie in desc order\nstep 1- get the full row based on movie_index [1,0.5,0.8]\nstep 2- enumerate the list- keep track of indices of all the elements [(0,1),(1,0.5),(2,0.8)] list of tuples\nstep 3- sort the list of tuples based on similarity scores [(0,1),(2,0.8)(1,0.5)] for mv0 most similar mv is mv2,next similar is mv1\nmv  0   1   2\n  ____________\n0|  1 0.5  0.8    mv 0 -> index returned - 0,2,1\n1|\n2|\n'

In [None]:
# Pair every similarity score in the chosen movie's row with its position,
# giving a list of (movie position, score) tuples.
similar_movies = [
    (position, score)
    for position, score in enumerate(cos_similarity[movie_index])
]


In [None]:
# Rank the (position, score) pairs from most to least similar.
sorted_similar_movies = sorted(
    similar_movies,
    key=lambda pair: pair[1],  # order by the score, the second tuple element
    reverse=True,              # highest score first
)

In [None]:
# Show only the top of the ranking; printing the whole list (one tuple per
# movie in the dataset) floods the notebook output.
print(sorted_similar_movies[:10])

[(95, 1.0000000000000007), (123, 0.32732683535398854), (2375, 0.32102894156205125), (125, 0.31622776601683794), (270, 0.31426968052735443), (3439, 0.31426968052735443), (149, 0.30792014356780045), (634, 0.3019405424385589), (108, 0.29629629629629634), (3688, 0.2765204519281134), (1358, 0.27498597046143514), (43, 0.2694301256218254), (2433, 0.2694301256218254), (1446, 0.26419797463373906), (3, 0.264197974633739), (93, 0.2592592592592593), (813, 0.2592592592592593), (14, 0.2545875386086578), (673, 0.2545875386086578), (870, 0.2545875386086578), (3679, 0.25197631533948484), (257, 0.24077170617153845), (583, 0.24077170617153845), (2862, 0.24077170617153845), (1296, 0.2381448361039201), (577, 0.23570226039551584), (720, 0.23570226039551584), (2068, 0.23103442669455732), (10, 0.23094010767585035), (363, 0.23094010767585035), (3373, 0.22680460581325726), (212, 0.22645540682891915), (365, 0.22645540682891915), (922, 0.22222222222222224), (545, 0.2182178902359924), (1286, 0.2182178902359924), (

In [None]:
# Keep the 15 best matches, excluding the queried movie itself.
# The movie is removed by its index instead of by slicing off position 0, so
# re-running this cell gives the same 15 entries rather than dropping one more
# movie each time, and it does not rely on the queried movie sorting first.
sorted_similar_movies = [
    (idx, score)
    for idx, score in sorted_similar_movies
    if idx != movie_index
][:15]

In [None]:
# Translate each ranked (index, score) pair into the movie's title.
# Despite its name, this list holds titles, not indices.
sorted_movie_indices = [
    get_title_from_index(movie_idx)
    for movie_idx, _score in sorted_similar_movies
]
print(sorted_movie_indices)

['Midnight Special', 'The Matrix Reloaded', 'The Martian', 'The Terminator', 'Armageddon', 'The Matrix', 'Terminator Genisys', 'Dear Frankie', 'Austin Powers: The Spy Who Shagged Me', 'Terminator Salvation']


In [None]:
# Look up the release date of each ranked movie, in the same order as the ranking.
sorted_movie_releasedates = [
    get_releasedate_from_index(movie_idx)
    for movie_idx, _score in sorted_similar_movies
]
print(sorted_movie_releasedates)

['2014-11-05', '2003-11-05', '2016-02-18', '2003-05-15', '2015-09-30', '1984-10-26', '1998-07-01', '1999-03-30', '2015-06-23', '2004-05-18', '1999-06-08', '2009-05-20', '1987-07-23', '2011-05-18', '2012-07-16', '2003-07-02', '1978-12-13', '2013-06-12', '2002-07-26', '1980-12-04', '2011-11-25', '2011-09-28', '2003-12-25', '2013-08-16', '1983-06-17', '2004-08-12', '2011-09-08', '2007-02-09', '2006-06-28', '2001-06-29', '2001-04-12', '2004-05-26', '1997-07-11', '2009-03-19', '2000-11-13', '2013-08-01', '2010-02-19', '2003-01-01', '2002-07-03', '1986-11-25', '2006-10-15', '2015-01-21', '2011-05-24', '2006-10-19', '2013-12-18', '2008-03-11', '2012-09-21', '2014-02-07', '2007-04-05', '2013-02-21', '2012-02-01', '2005-06-08', '1980-10-02', '2009-06-12', '1997-06-13', '2007-12-14', '2006-09-22', '1985-07-03', '2007-11-21', '1968-04-10', '1988-07-16', '2014-10-17', '2012-08-28', '2007-01-19', '1994-12-16', '2003-07-11', '2011-03-08', '1990-05-25', '1979-12-06', '1995-12-01', '1983-06-03', '2009

# **Print first 50 similar movies**

In [None]:
# Print the titles of at most the first 50 entries of the ranking.
# The previous counter-based loop broke only after i exceeded 50, so it
# printed 51 titles; slicing caps the output at exactly 50.
# If sorted_similar_movies was already truncated by an earlier cell, fewer
# titles are printed.
for movie_idx, _score in sorted_similar_movies[:50]:
    print(get_title_from_index(movie_idx))

Interstellar
The Matrix Revolutions
Midnight Special
The Matrix Reloaded
The Martian
The Terminator
Armageddon
The Matrix
Terminator Genisys
Dear Frankie
Austin Powers: The Spy Who Shagged Me
Terminator Salvation
Superman IV: The Quest for Peace
The Tree of Life
The Dark Knight Rises
Terminator 3: Rise of the Machines
Superman
Man of Steel
Austin Powers in Goldmember
Superman II
Take Shelter
Real Steel
Big Fish
About Time
Superman III
AVP: Alien vs. Predator
Contagion
Death at a Funeral
Superman Returns
A.I. Artificial Intelligence
The Other Side of Heaven
The Day After Tomorrow
Contact
Knowing
Unbreakable
Snowpiercer
The Killer Inside Me
Flywheel
Men in Black II
Star Trek IV: The Voyage Home
The Astronaut Farmer
Ex Machina
X-Men: First Class
The Prestige
Her
Stargate: The Ark of Truth
House at the End of the Street
Mr. Peabody & Sherman
Sunshine
Snitch
Chronicle


.

.

.

Dataset from Kaggle (TMDB5000 DATASET): https://www.kaggle.com/tmdb/tmdb-movie-metadata