In [6]:
# Modules
import pandas as pd
import numpy as np
from collections import Counter

In [7]:
# Load the data into a dataframe
df = pd.read_csv('data_prml.csv')

In [8]:
# For Title and tmdbId
df.dropna(inplace=True)

# For genre, Director, Stars and Keywords
df = df[df['genres'] != "['(no genres listed)']"]
df = df[df['Director'] != "['']"]
df = df[df['Stars'] != "['']"]
df = df[df['Keywords'] != "['']"]

# Year
all_years = []
for i in df['Year']:
    all_years.append(i)

# Corresponding counts and filtering
counts = Counter(all_years)
# Get years in which atleast 10 movies have released
filtered_counts = {key: value for key, value in counts.items() if value >= 10}
filtered_keys = filtered_counts.keys()
indii = []
for i in range(len(all_years)):
    if all_years[i] in filtered_keys:
        indii.append(i)
df = df.iloc[indii]

In [9]:
# Visualisation of dataset
df.sample(5)

Unnamed: 0,movieId,genres,Title,Year,imdbId,tmdbId,Director,Stars,Keywords
311,353,"['Action', 'Crime', 'Fantasy', 'Thriller']","Crow, The",1994,109506,9495.0,['Alex Proyas'],"['Brandon Lee', 'Rochelle Davis', 'Ernie Hudso...","['man', 'brutally', 'murdered', 'comes', 'back..."
1220,1620,"['Crime', 'Drama', 'Mystery', 'Thriller']",Kiss the Girls,1997,119468,9437.0,['Gary Fleder'],"['Morgan Freeman', 'Ashley Judd', 'Cary Elwes'...","['police', 'hunting', 'serial', 'kidnapper', '..."
5342,8907,"['Animation', 'Children', 'Comedy']",Shark Tale,2004,307453,10555.0,"['Bibo Bergeron', 'Vicky Jenson', 'Rob Letterm...","['Will Smith', 'Robert De Niro', 'Renée Zellwe...","['son', 'gangster', 'shark', 'boss', 'accident..."
6506,53466,"['Adventure', 'Crime', 'Thriller']",Nancy Drew,2007,479500,14043.0,['Andrew Fleming'],"['Emma Roberts', 'Craig Gellis', 'Rich Cooper'...","['teen', 'detective', 'nancy', 'drew', 'accomp..."
2014,2683,"['Action', 'Adventure', 'Comedy']",Austin Powers: The Spy Who Shagged Me,1999,145660,817.0,['Jay Roach'],"['Mike Myers', 'Heather Graham', 'Michael York...","['dr', 'evil', 'back', 'invented', 'new', 'tim..."


## Text Vectorisation
* In this model I will be using text vectorisation to compute similarity between movies
* Approach :
1) Keep relevant columns and club them to make a paragraph which has most of the context of the movie
2) I have already removed stopwords during the pre-processing so I will not be doing that here
3) I will be using stemming and lemmatization methods to make similarity checking even more meaningful and easier
4) Will compare both the methods and display the results
5) Finally I will be applying text vectorisation

* movieId, imdbId, tmdbId -> Irrelevant unique for each movie = dropping
* genres -> Keeping
* Title -> Keeping
* Year -> dropping. Consider a movie has a sequel in like next 10 years that would decrese similarity and like as these are like in a span on 25 years it really doesn't matter
* Director, stars -> Keeping
* Keywords -> Keeping

In [10]:
# Converting each column into usable format
# genres
Genre = []
for i in df['genres']:
    i = i.replace('[', '').replace(']', '').replace(',', '').replace('"', '').replace("'", '')
    i = i.lower()
    Genre.append(i)
df['genres'] = Genre

# Director
dir = []
for i in df['Director']:
    i = i.replace('[', '').replace(']', '').replace('"', '').replace("'", '')
    dd = ''
    le = len(i.split(', '))
    for j in i.split(', '):
        le -= 1
        j = j.replace(' ', '')
        j = j.lower()
        dd += j
        if(le != 0):
            dd += ' '
    dir.append(dd)
df['Director'] = dir

# Stars
sta = []
for i in df['Stars']:
    i = i.replace('[', '').replace(']', '').replace('"', '').replace("'", '')
    dd = ''
    le = len(i.split(', '))
    for j in i.split(', '):
        le -= 1
        j = j.replace(' ', '')
        j = j.lower()
        dd += j
        if(le != 0):
            dd += ' '
    sta.append(dd)
df['Stars'] = sta

* Stemming is a process that stems or removes last few characters from a word, often leading to incorrect meanings and spelling.

In [11]:
import nltk
from nltk.stem import PorterStemmer

# Initialize the Porter stemmer
stemmer = PorterStemmer()

kw1 = []

for i in df['Keywords']:
    i = i.replace('[', '').replace(']', '').replace('"', '').replace("'", '')
    dd = ''
    le = len(i.split(', '))
    for j in i.split(', '):
        le -= 1
        j = stemmer.stem(j)
        dd += j
        if(le != 0):
            dd += ' '
    kw1.append(dd)

ModuleNotFoundError: No module named 'nltk'

* Lemmatization considers the context and converts the word to its meaningful base form, which is called Lemma.

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

kw2 = []

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Define a function to convert the part of speech tag returned by nltk.pos_tag to WordNet format
def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun if the part of speech is not recognized
    
for i in df['Keywords']:
    i = i.replace('[', '').replace(']', '').replace('"', '').replace("'", '')
    dd = ''
    le = len(i.split(', '))
    for j in i.split(', '):
        tag = get_wordnet_pos(j)
        le -= 1
        j = lemmatizer.lemmatize(j, pos = tag)
        dd += j
        if(le != 0):
            dd += ' '
    kw2.append(dd)

In [None]:
from sklearn.metrics import accuracy_score

# Checking performance similarity
accuracy = accuracy_score(kw1, kw2)
print("percentage difference :", accuracy * 100)

percentage difference : 0.609115731989078


In [None]:
# Just take any of the two
df['Keywords'] = kw2

# Drop the unwanted columns
df = df.drop(columns =['movieId', 'Year', 'imdbId', 'tmdbId'])

In [None]:
# checking
df

Unnamed: 0,genres,Title,Director,Stars,Keywords
0,adventure animation children comedy fantasy,Toy Story,johnlasseter,tomhanks timallen donrickles jimvarney,cowboy doll profoundly threatened jealous new ...
1,adventure children fantasy,Jumanji,joejohnston,robinwilliams jonathanhyde kirstendunst bradle...,two kid find play magical board game release m...
2,comedy romance,Grumpier Old Men,howarddeutch,waltermatthau jacklemmon sophialoren ann-margret,john max resolve save beloved bait shop turnin...
3,comedy drama romance,Waiting to Exhale,forestwhitaker,whitneyhouston angelabassett lorettadevine lel...,based terry mcmillans novel film follows four ...
4,comedy,Father of the Bride Part II,charlesshyer,stevemartin dianekeaton martinshort kimberlywi...,george bank must deal daughter pregnancy also ...
...,...,...,...,...,...
9737,action animation comedy fantasy,Black Butler: Book of the Atlantic,noriyukiabe stephenhoff,brynapprill dawnm,bennett justin briner jessica cavanagh young l...
9738,animation comedy fantasy,No Game No Life: Zero,atsukoishizuka,alexandrabedford jessicaboone ricardocontreras...,adaption sixth light novel series follows stor...
9739,drama,Flint,bruceberesford,robmorrow marinireland lyndiegreenwood queenla...,woman deal toxic water scandal flint michigan ...
9740,action animation,Bungo Stray Dogs: Dead Apple,takuyaigarashi,brianbeacock raychase luciendodge carriekeranen,armed detective agency investigates bizarre se...


In [None]:
# add the columns to create a big paragraph of content
df["content"] = df["genres"] + " " + df["Director"]+ ' ' + df["Stars"] + " " + df["Keywords"]

# Once added just drop those columns
df = df.drop(columns =['genres', 'Director', 'Stars', 'Keywords'])

# Final data
df

Unnamed: 0,Title,content
0,Toy Story,adventure animation children comedy fantasy jo...
1,Jumanji,adventure children fantasy joejohnston robinwi...
2,Grumpier Old Men,comedy romance howarddeutch waltermatthau jack...
3,Waiting to Exhale,comedy drama romance forestwhitaker whitneyhou...
4,Father of the Bride Part II,comedy charlesshyer stevemartin dianekeaton ma...
...,...,...
9737,Black Butler: Book of the Atlantic,action animation comedy fantasy noriyukiabe st...
9738,No Game No Life: Zero,animation comedy fantasy atsukoishizuka alexan...
9739,Flint,drama bruceberesford robmorrow marinireland ly...
9740,Bungo Stray Dogs: Dead Apple,action animation takuyaigarashi brianbeacock r...


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Each movie being represented in the form of a vector
# Each dimension being a word from content

# Initialize CountVectorizer
cv = CountVectorizer(max_features = 5000)

# Fit the CountVectorizer and transform the text data
vectors = cv.fit_transform(df['content']).toarray()

# Retrieve the feature names generated by CountVectorizer
feature_names = cv.get_feature_names_out()

In [None]:
# Top 10 words most frequent in content
print(feature_names[:10])

['aaroneckhart' 'abandoned' 'abbiecornish' 'abducted' 'abelferrara'
 'abigailbreslin' 'ability' 'able' 'aboard' 'abraham']


In [None]:
# Representation
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# To calculate how close the vectors are :
similarity = cosine_similarity(vectors)

In [None]:
# DataFrame representation of cosine similarity
similarity_df = pd.DataFrame(similarity)
similarity_df
import pickle
pickle.dump(similarity,open('similarity_n.pkl','wb'))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9512,9513,9514,9515,9516,9517,9518,9519,9520,9521
0,1.000000,0.147442,0.094281,0.057166,0.071067,0.051434,0.058926,0.158114,0.052705,0.084667,...,0.196116,0.171499,0.051434,0.047140,0.066010,0.205738,0.235702,0.000000,0.149071,0.114332
1,0.147442,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.233126,0.093250,0.037450,...,0.057831,0.050572,0.000000,0.041703,0.000000,0.091003,0.104257,0.000000,0.000000,0.000000
2,0.094281,0.000000,1.000000,0.097014,0.060302,0.000000,0.100000,0.000000,0.000000,0.000000,...,0.055470,0.048507,0.000000,0.040000,0.028006,0.043644,0.100000,0.000000,0.000000,0.194029
3,0.057166,0.000000,0.097014,1.000000,0.073127,0.000000,0.121268,0.000000,0.000000,0.043561,...,0.067267,0.058824,0.052926,0.097014,0.033962,0.052926,0.181902,0.153393,0.000000,0.117647
4,0.071067,0.000000,0.060302,0.073127,1.000000,0.000000,0.075378,0.000000,0.000000,0.000000,...,0.083624,0.073127,0.065795,0.120605,0.000000,0.065795,0.075378,0.095346,0.000000,0.146254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9517,0.205738,0.091003,0.043644,0.052926,0.065795,0.047619,0.054554,0.000000,0.048795,0.039193,...,0.181568,0.158777,0.047619,0.043644,0.030557,1.000000,0.163663,0.000000,0.138013,0.105851
9518,0.235702,0.104257,0.100000,0.181902,0.075378,0.000000,0.062500,0.111803,0.000000,0.000000,...,0.069338,0.121268,0.054554,0.050000,0.175035,0.163663,1.000000,0.000000,0.158114,0.181902
9519,0.000000,0.000000,0.000000,0.153393,0.095346,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.069007,0.063246,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000
9520,0.149071,0.000000,0.000000,0.000000,0.000000,0.069007,0.000000,0.070711,0.070711,0.056796,...,0.087706,0.230089,0.069007,0.000000,0.044281,0.138013,0.158114,0.000000,1.000000,0.076696


In [None]:
# Recommendation based on user input
movie_name = input("Enter movie name : ")

Enter movie name : Toy Story


In [None]:
# Get the index of the movie
index = df[df['Title'] == movie_name].index

# considering there might be newer versions of movies with same name
# Consider the latest one
index = index[-1]

In [None]:
# Get similar
similar_movies = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])

# Considering only the top 5 movies excluding the movie itself
top_similar_movies = similar_movies[1:6]
top_similar_movies

[(2337, 0.44232586846469146),
 (7234, 0.4082482904638631),
 (1393, 0.3689323936863109),
 (5419, 0.36514837167011077),
 (6349, 0.3600411499115478)]

In [None]:
# Split the enumerate tuple
scores = []
movie_names = []
for i, score in top_similar_movies:
    movie_names.append(df.iloc[i].Title)
    scores.append(score)

# Convert it into a dataframe 
Recommendations = pd.DataFrame()
Recommendations['MOVIE'] = movie_names
Recommendations['recommendation_score'] = scores

# Visualisation
Recommendations = Recommendations.reset_index(drop=True)
Recommendations.index = Recommendations.index + 1
Recommendations

Unnamed: 0,MOVIE,recommendation_score
1,Toy Story 2,0.442326
2,Toy Story 3,0.408248
3,Small Soldiers,0.368932
4,"Twelve Tasks of Asterix, The (Les douze travau...",0.365148
5,TMNT (Teenage Mutant Ninja Turtles),0.360041
