In [None]:
#Importing Libraries

import pandas as pd
import numpy as np

In [None]:
# Load the two TMDB 5000 CSV exports and take a first look at each table.

credits = pd.read_csv("tmdb_5000_credits.csv")
movies = pd.read_csv("tmdb_5000_movies.csv")

# Report the shape of both tables first, then preview the leading rows of each.
for table_label, table in (("credits:", credits), ("movies:", movies)):
    print(table_label, table.shape)
for table in (credits, movies):
    print(table.head())

In [None]:
# The credits table names its key "movie_id" while the movies table calls it "id".
# Rename the credits key so that both tables can be joined on the shared "id" column.
# Note: index=str only changes the row labels of the renamed frame; the
# column-on-column merge below ignores row labels and builds a fresh index.

Credits_Column_Renamed = credits.rename(index=str, columns={"movie_id": "id"})
merged_movies = pd.merge(movies, Credits_Column_Renamed, on="id")
print(merged_movies.head())

In [None]:
# The merge leaves three title columns (title_x, title_y, original_title); the
# notebook treats them as redundant and keeps only original_title.
# The status, production_countries and homepage columns are not used later, so they are dropped as well.

movies_cleaned = merged_movies.drop(
    columns=["title_x", "title_y", "production_countries", "status", "homepage"]
)
print(movies_cleaned.head())
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" after the report).
movies_cleaned.info()

# Content Based Recommender System
I am creating a recommender system based on the `overview` column of the movies, i.e., if the user gives us a movie's name, we can suggest movies that have similar plot summaries.

For this I am going to apply the TF-IDF statistical method (see: https://www.geeksforgeeks.org/understanding-tf-idf-term-frequency-inverse-document-frequency/) to the overview column to generate a matrix of important words. Then, using the sigmoid kernel, I generate a matrix holding the degree of similarity between every pair of movies.

`tfv.fit_transform()` fits the `TfidfVectorizer` to the preprocessed 'overview' text and transforms it into a TF-IDF matrix. This matrix represents the importance of each word or n-gram in each document (movie overview) within the dataset.
The printed matrix can be read in the following way:

- The first value in each tuple (i, j) is the row index (document index, i.e., movie index).
- The second value is the column index corresponding to a unique word or n-gram in the vocabulary.
- The third value is the TF-IDF score for the term at that row (document) and column (word or n-gram).

**Note:** This representation is a compact way of storing the TF-IDF matrix: only the non-zero entries are stored, which saves memory.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over 1- to 3-grams: accents are stripped, English stop words
# are removed, and a term must occur in at least 3 overviews to be kept.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)

# Replace missing overviews with an empty string before vectorizing. Casting a
# missing value straight to str would produce the literal word "nan", which the
# vectorizer would then treat as genuine overview text.
overview_text = movies_cleaned['overview'].fillna('').astype(str)

# Fitting the TF-IDF on the 'overview' text
tfv_matrix = tfv.fit_transform(overview_text)
print(tfv_matrix)
print(tfv_matrix.shape)

# Sigmoid kernel

The sigmoid kernel is a type of kernel function commonly used in machine learning, particularly in the context of support vector machines (SVMs) and kernelized methods. Kernels are mathematical functions that measure the similarity between pairs of data points. The sigmoid kernel is defined as:
$$K(x, y) = \tanh\left(\alpha \, x^{T} y + c\right)$$
Here:
- $x$ and $y$ are the input vectors.
- $\alpha$ is a scaling parameter.
- $c$ is a constant.

The $\tanh$ function is the hyperbolic tangent, and $x^{T} y$ is the dot product of the vectors $x$ and $y$. The purpose of the $\tanh$ function is to squash the output into the range $(-1, 1)$.
In the context of similarity or kernelized methods, the sigmoid kernel measures the similarity between two vectors based on the hyperbolic tangent of a scaled dot product. It is often used when the input data is not linearly separable, and the kernelized method aims to find a decision boundary in a higher-dimensional space.

The sigmoid kernel is used here to capture non-linear relationships and measure the similarity between pairs of movies based on their TF-IDF representations of overviews. The resulting sig matrix is used for identifying similar movies. Higher values in the matrix indicate higher similarity between the corresponding movies.

In [None]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Score every overview vector against every other one with the sigmoid kernel.
# Row i of "sig" belongs to movie i; entry j in that row is its similarity to
# movie j, and a higher value implies a higher similarity.
sig = sigmoid_kernel(X=tfv_matrix, Y=tfv_matrix)

# Show the first row: the similarity of the first movie (index 0) to all
# movies in the dataset.
print(sig[0])

# Mapping indices and movie names

In [None]:
# Build a lookup Series whose index is the 'original_title' column of
# movies_cleaned and whose values are the matching row labels of movies_cleaned.
# Series.drop_duplicates() compares the *values* (the row labels, which are all
# distinct), so it never removes a repeated title. De-duplicate on the index
# instead, keeping the first occurrence, so each title maps to a single integer.
title_to_row = pd.Series(movies_cleaned.index, index=movies_cleaned['original_title'])
indices = title_to_row[~title_to_row.index.duplicated(keep='first')]
print(indices)

# Take one movie's row of "sig" and sort it in descending order of score to get
# the movies most similar to it. The row is looked up by title rather than
# through a hard-coded row number, and only the first few entries are printed
# instead of one (index, score) pair per movie.
PREVIEW_ROWS = 10
newlyweds_idx = indices['Newlyweds']
print(newlyweds_idx)
print(sig[newlyweds_idx])
newlyweds_scores = list(enumerate(sig[newlyweds_idx]))
print(newlyweds_scores[:PREVIEW_ROWS])
print(sorted(newlyweds_scores, key=lambda x: x[1], reverse=True)[:PREVIEW_ROWS])

In [None]:
def give_recomendations(title, sig=sig, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Label looked up in the module-level ``indices`` Series.
    sig : array-like, optional
        Pairwise similarity matrix; the row used is the one at the position
        stored in ``indices`` for ``title``. Defaults to the module-level
        ``sig`` as it existed when this function was defined.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        ``original_title`` values from ``movies_cleaned`` for the ``top_n``
        highest-scoring other movies, best match first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series of row positions
    # rather than a single integer; fall back to the first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Exclude the queried movie explicitly instead of assuming it sorts first:
    # with tied scores a different movie could occupy the first slot, and the
    # query itself would then show up among its own recommendations.
    sig_scores = [(i, score) for i, score in enumerate(sig[idx]) if i != idx]
    # Stable descending sort by similarity score.
    sig_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [i for i, _ in sig_scores[:max(top_n, 0)]]

    # Titles of the most similar movies, best match first.
    return movies_cleaned['original_title'].iloc[movie_indices]

In [None]:
# Example query: list the recommendations returned for 'Avatar'.
avatar_recommendations = give_recomendations('Avatar')
print(avatar_recommendations)