## **EXTRACTING ZIP FILE**

In [None]:
from zipfile import ZipFile

# Open the movie_recommender archive read-only and unpack every member into
# the current working directory; the `with` block closes the archive afterwards.
with ZipFile('/content/drive/My Drive/movie_recommender.zip', mode='r') as archive:
    archive.extractall()

## **BASIC INTRO**

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

text = ["London Paris London", "Paris Paris London"]

# CountVectorizer counts how many times each word occurs in each sentence.
cv = CountVectorizer()

count_matrix = cv.fit_transform(text)
print(count_matrix.toarray())
# Output: one row per sentence. Row 1 is [2 1]: "London" occurs twice and "Paris" once in sentence 1.
# Row 2 is [1 2]: in sentence 2 the frequency of "London" is 1 and the frequency of "Paris" is 2.

# Now compute the cosine similarity between the two count vectors (one vector per sentence).
similarity_scores = cosine_similarity(count_matrix)
print("\n Similarity score blw 2 sentences: ")
print(similarity_scores)

# Output: row 1 is [1.0, 0.8]. Sentence 1 compared with itself scores 1.0, and sentence 1 compared
# with sentence 2 scores 0.8. Row 2 reads the same way from the point of view of sentence 2.

[[2 1]
 [1 2]]

 Similarity score blw 2 sentences: 
[[1.  0.8]
 [0.8 1. ]]


# **WORKING ON REAL DATASET**

In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
## Step 1: Read the CSV file into a DataFrame.
df = pd.read_csv('/content/movie_dataset.csv')

## Step 2: Select the key features.

# Show the available column names, then list the columns that Step 3 combines.
print(df.columns)
features = ['keywords', 'cast', 'genres', 'director']

##Step 3: Create a column in DF which combines all selected features as a single string. 
def combine_features(row):
  """Join the row's keywords, cast, genres and director into one string.

  The four values are separated by single spaces. Every value must be a
  ``str``; a non-string value such as a NaN float raises ``TypeError``.
  """
  selected = (row["keywords"], row["cast"], row["genres"], row["director"])
  return " ".join(selected)
  
# Apply combine_features to every row of the dataframe (axis=1 passes one row
# at a time).
# The raw dataset still contains NaN values in the selected columns, so this
# call is expected to fail with a TypeError. The error is caught and displayed
# instead of being left unhandled, so the demonstration no longer aborts a
# "Restart & Run All". To find out which rows cause the trouble, wrap the body
# of combine_features in a try/except block that prints the offending row.
try:
  df["combined_features"] = df.apply(combine_features, axis = 1)
  print(df["combined_features"].head())
except TypeError as error:
  print("combine_features failed because of missing (NaN) values:", error)


In [42]:
###### helper functions. Use them when needed #######
def get_title_from_index(index):
    """Return the title of the row of the global ``df`` whose index label equals ``index``.

    Only the first matching row is used. An ``IndexError`` is raised when no
    row carries that label, because the selection is then empty.
    """
    matching_titles = df.loc[df.index == index, "title"]
    return matching_titles.values[0]

def get_index_from_title(title):
    """Return the ``index`` column value of the first row of the global ``df`` titled ``title``.

    The comparison is an exact match on the ``title`` column. An
    ``IndexError`` is raised when no row matches, because the selection is
    then empty.
    """
    # NOTE(review): this reads the "index" *column*, while get_title_from_index
    # matches on df.index; the two agree only if that column mirrors the
    # DataFrame's own index -- confirm for the dataset in use.
    is_match = df["title"] == title
    return df.loc[is_match, "index"].values[0]

##################################################

## Step 1: Read the CSV file into a DataFrame (re-read so this cell starts from the raw data).
df = pd.read_csv('/content/movie_dataset.csv')

## Step 2: Select the key features.

# Show the available column names, then list the columns that Step 3 combines.
print(df.columns)
features = ['keywords', 'cast', 'genres', 'director']

##Step 3: Create a column in DF which combines all selected features as a single string. 
def combine_features(row):
  """Join the row's keywords, cast, genres and director into one string.

  The four values are separated by single spaces.

  Returns:
    The combined string, or ``None`` when a value is not a string (for
    example a NaN float); in that case the offending row is printed so it can
    be inspected without stopping the surrounding ``apply``.

  Raises:
    KeyError: if one of the four columns is missing from ``row``. Only the
      concatenation failure is handled here, so a misspelled or absent column
      is reported instead of being silently swallowed.
  """
  try:
    return row["keywords"] + " " + row["cast"] + " " + row["genres"] + " "+ row["director"]
  except TypeError:
    # str + non-str (e.g. NaN) cannot be concatenated: show the troublesome row.
    print("Error: ", row)
    return None

# The errors seen so far are caused by NaN values, so replace every NaN in the
# selected feature columns with an empty string before combining them.
df[features] = df[features].fillna('')

# Build the combined column row by row (axis=1 hands the function one row at a time).
df["combined_features"] = df.apply(combine_features, axis=1)
print(df["combined_features"].head())
# This now works because the NaN values in the selected columns were replaced with empty strings.


Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')


In [41]:
# Number of titles to list at the end, and how many entries of each
# intermediate list to show (the full lists have one entry per movie).
TOP_N = 50
PREVIEW_COUNT = 10

## Step 4: Create count matrix from this new combined column
cv = CountVectorizer()
count_matrix = cv.fit_transform(df["combined_features"])

## Step 5: Compute the Cosine Similarity based on the count_matrix
cosine_sim = cosine_similarity(count_matrix)

# Prediction: given a movie title, find the movies most similar to it.
movie_user_likes = "Avatar"

## Step 6: Get index of this movie from its title
movie_index = get_index_from_title(movie_user_likes)
print("Movie Index: ", movie_index)

# Row `movie_index` of cosine_sim holds this movie's similarity to every movie.
# Scores are rounded to two decimals *before* ranking, so movies whose scores
# agree to two decimals keep their dataset order (sorted() is stable).
similar_mov_array = cosine_sim[movie_index]
similar_mov_array = [round(i, 2) for i in similar_mov_array]
print(similar_mov_array[:PREVIEW_COUNT])

# Pair every score with its row position: [(0, 1.0), (1, 0.3), (2, 0.5), ...]
similar_movies = list(enumerate(similar_mov_array))
print(similar_movies[:PREVIEW_COUNT])

## Step 7: Get a list of similar movies in descending order of similarity score
sorted_similar_movies = sorted(similar_movies, key = lambda x: x[1], reverse=True)
print(sorted_similar_movies[:PREVIEW_COUNT])

## Step 8: Print titles of first 50 movies
# Slicing yields exactly TOP_N entries; the previous counter check (`i > 50`
# after incrementing) let a 51st title through.
for movie_position, _score in sorted_similar_movies[:TOP_N]:
  print(get_title_from_index(movie_position))


Avatar
Guardians of the Galaxy
Aliens
Star Wars: Clone Wars: Volume 1
Star Trek Into Darkness
Star Trek Beyond
Alien
Lockout
Jason X
Moonraker
The Helix... Loaded
Gravity
Planet of the Apes
Galaxy Quest
Jupiter Ascending
The Wolverine
Alien³
Silent Running
Zathura: A Space Adventure
Cargo
Trekkies
Star Trek
Lost in Space
Babylon A.D.
Wing Commander
Oblivion
The Fifth Element
Titan A.E.
AVP: Alien vs. Predator
Dragonball Evolution
The Empire Strikes Back
John Carter
Superman Returns
Starship Troopers
Divergent
Soldier
The Abyss
Memoirs of an Invisible Man
The Astronaut's Wife
The Black Hole
Machete Kills
Damnation Alley
The Ice Pirates
Captain America: Civil War
Oz: The Great and Powerful
Men in Black
The Time Machine
Star Trek: Insurrection
Space Cowboys
The One
Sheena
