libraries

In [None]:
import pandas as pd
import pandas as pd 
import numpy as np
import warnings
from ast import literal_eval
import os
import matplotlib.pyplot as plt
import pickle

#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
#Import cosine_similarity from scikit-learn
from sklearn.metrics.pairwise import cosine_similarity
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

warnings.filterwarnings("ignore")


load the movie dataset using pandas.

In [None]:
movies=pd.read_csv('tmdb.csv')

movie file dataframe

In [None]:
movies.head(5)

getting description of information present in csv file

In [None]:
movies.describe()

showing structure of elements in the csv files

In [None]:
movies.shape

Displays the number of entries, the non-null count, and the data type of each column.

In [None]:
movies.info()

count the number of missing values in each column 

In [None]:
movies.isnull().sum()

In [None]:
# Plain-text overview of the raw dataset: first rows, dimensions,
# per-column dtypes / non-null counts, and missing values per column.
print(movies.head(5))
print(movies.shape)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() appended a stray "None" line after the report.
movies.info()
print(movies.isnull().sum())

# POPULARITY FILTERING

In [None]:
# Columns used by the popularity / rating based recommender.
# .copy() makes this an independent DataFrame: later cells assign new and
# transformed columns to it, and doing that on a bare selection of `movies`
# is what pandas flags with SettingWithCopyWarning (hidden here because
# warnings are globally ignored).
demographic_data = movies[['id','original_title', 'cast','crew','genres','keywords','overview','vote_count','vote_average','popularity']].copy()
demographic_data.head(5)

Extract cast, crew, keywords and genres: parse the stringified features into their corresponding Python objects.

In [None]:
# Columns whose cells are parsed from text into Python objects.
demographic_features = ['cast', 'crew', 'keywords', 'genres']
for feature in demographic_features:
    # literal_eval turns each cell's string into the Python literal it spells out.
    # NOTE(review): literal_eval raises on non-string cells such as NaN —
    # presumably these four columns have no missing values; confirm.
    demographic_data[feature] = demographic_data[feature].apply(literal_eval)

In [None]:
def get_list(x):
    """Return up to the first three ``'name'`` values found in *x*.

    *x* is expected to be a list of dicts. Anything that is not a list
    yields an empty list; elements that are not dicts, or that have no
    ``'name'`` key, are skipped.
    """
    # Missing or malformed data: nothing to extract
    if not isinstance(x, list):
        return []

    found = [
        entry['name']
        for entry in x
        if isinstance(entry, dict) and 'name' in entry
    ]

    # Slicing already copes with lists shorter than three elements
    return found[:3]


def get_director(x):
    """Return the name of the first crew entry whose job is ``'Director'``.

    Args:
        x: Parsed ``crew`` value, expected to be a list (or tuple) of dicts
            carrying ``'job'`` and ``'name'`` keys.

    Returns:
        The director's name, or ``np.nan`` when *x* is not a list/tuple or
        contains no usable director entry.
    """
    # Same policy as get_list: missing or malformed data (e.g. a NaN cell)
    # means "no director" rather than a TypeError when iterating.
    if not isinstance(x, (list, tuple)):
        return np.nan

    for member in x:
        # Skip anything that is not a dict with both keys instead of raising
        # TypeError / KeyError on a single bad entry.
        if not isinstance(member, dict) or 'name' not in member:
            continue
        if member.get('job') == 'Director':
            return member['name']
    return np.nan

Define features: new director, cast, genres and keywords

In [None]:
# Derive the director from the full crew entries first: the loop below
# overwrites 'crew' with plain name lists from get_list, after which the
# 'job' field that get_director reads is no longer available.
demographic_data['director'] = demographic_data['crew'].apply(get_director)
demographic_features = ['cast','crew' ,'keywords', 'genres']
for feature in demographic_features:
    # Replace each cell with the list of names returned by get_list.
    demographic_data[feature] = demographic_data[feature].apply(get_list)

viewing dataframe of data

In [None]:
demographic_data.head(2)

store obtained data to a csv

Recommendation based on highest ratings calculated using total votes

In [None]:
demographic_data.to_csv(r'highest_movies_database.csv', index=False)

mean vote across the whole report

In [None]:
mean_vote= demographic_data['vote_average'].mean()
mean_vote

minimum_vote, the minimum votes required

In [None]:
minimum_vote= demographic_data['vote_count'].quantile(0.7)
minimum_vote

For a movie to be featured, it must have at least 581 votes (the 70th-percentile vote count computed above); movies that do not meet this criterion are filtered out.

In [None]:
# Movies that reach the minimum vote count.
# Filter first, then copy: the previous order copied the entire frame only to
# discard most rows, and left q_movies as a filtered selection that the next
# cells assign a 'score' column to. Copying the filtered result gives an
# independent frame with the same rows.
q_movies = demographic_data.loc[demographic_data['vote_count'] >= minimum_vote].copy()
q_movies.shape

Calculating the weighted-rating metric for each of the 1442 qualified movies using the IMDB formula

In [None]:
def weighted_rating(x, m=minimum_vote, C=mean_vote):
    """Blend a movie's own average rating with the overall mean rating.

    The row's ``'vote_average'`` is weighted by ``v / (v + m)`` and ``C`` by
    ``m / (v + m)``, where ``v`` is the row's ``'vote_count'``.

    Args:
        x: Row-like object indexable by ``'vote_count'`` and ``'vote_average'``.
        m: Vote-count weight given to ``C``; the default is the value of
            ``minimum_vote`` at the time this function was defined.
        C: Rating the result is pulled towards; the default is the value of
            ``mean_vote`` at the time this function was defined.

    Returns:
        The weighted score for the row.
    """
    votes = x['vote_count']
    rating = x['vote_average']

    # Both weights share the same denominator
    total = votes + m
    own_share = votes / total
    mean_share = m / total

    return (own_share * rating) + (mean_share * C)

New feature 'score' and calculate its value with weighted_rating()

In [None]:
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)

top 10 highest rated movies

In [None]:
q_movies = q_movies.sort_values('score', ascending=False)
q_movies[['id','original_title', 'vote_count', 'vote_average', 'score']].head(10)

plot of top 10 movies based on weighted score

In [None]:
plt.figure(figsize=(12,8))

plt.barh(q_movies['original_title'].head(10),q_movies['score'].head(10), align='center',color='pink')
plt.gca().invert_yaxis()
plt.xlabel("Weighted Score", weight='bold')
plt.title("Best Rated Movies",weight='bold')

top 10 movies based on popularity

In [None]:
pop= demographic_data.sort_values('popularity', ascending=False)

plt.figure(figsize=(12,8))

plt.barh(pop['original_title'].head(10),pop['popularity'].head(10), align='center',color='pink')
plt.gca().invert_yaxis()
plt.xlabel("Popularity Score", weight='bold')
plt.title("Most Popular Movies",weight='bold')

In [None]:
# Columns persisted for the "best rated" recommendations.
q_best_rated = q_movies[['id','original_title','overview','score','crew','director']]

# A context manager flushes and closes the file deterministically; the bare
# open(...) previously passed to pickle.dump was never closed explicitly.
with open('best_rated_movies.pkl', 'wb') as pkl_file:
    pickle.dump(q_best_rated, pkl_file)
q_best_rated

In [None]:
# Columns persisted for the "most popular" recommendations.
q_most_popular = pop[['id','original_title','overview','popularity','crew','director']]

# A context manager flushes and closes the file deterministically; the bare
# open(...) previously passed to pickle.dump was never closed explicitly.
with open('most_popular_movies.pkl', 'wb') as pkl_file:
    pickle.dump(q_most_popular, pkl_file)
q_most_popular


# CONTENT-BASED FILTERING

In [None]:
# Working frame for the content-based recommender.
# NOTE(review): 'director' is selected from `movies` here, but the only place
# this notebook creates a 'director' column is on demographic_data — confirm
# that tmdb.csv already ships one, otherwise this selection raises KeyError.
# .copy() makes the frame independent of `movies`, because the following
# cells add and overwrite columns on it.
data_content_based = movies[['id','title','overview','cast','genres','keywords','director']].copy()
data_content_based.head()

In [None]:
# Build the free-text "tags" document of each movie from overview, genres and
# director. Each part is NaN-filled and the parts are joined with spaces:
#   * plain `a + b + c` glued the last word of one field onto the first word
#     of the next, producing tokens that match nothing else;
#   * a NaN in any single field turned the whole sum into NaN, discarding the
#     fields that were present.
# NOTE(review): assumes the three columns hold strings at this point — confirm.
tag_parts = [data_content_based[column].fillna('') for column in ('overview', 'genres', 'director')]
data_content_based['tags'] = (tag_parts[0] + ' ' + tag_parts[1] + ' ' + tag_parts[2]).str.strip()
data_content_based

TF-IDF Vectorizer

Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a', 'an'

In [None]:
tfidf = TfidfVectorizer(stop_words='english')

Replace NaN with an empty string

In [None]:
data_content_based['tags'] = data_content_based['tags'].fillna('')

Creating the required TF-IDF matrix by fitting and transforming the data

In [None]:
tfidf_matrix = tfidf.fit_transform(data_content_based['tags'].apply(lambda x: np.str_(x)))
tfidf_matrix.shape

Converting the TF-IDF matrix to a pandas DataFrame to view the per-word weights. As seen above, 20979 unique words are used to describe the 4803 movies.

In [None]:
# todense() expands the sparse TF-IDF matrix into a dense one.
# NOTE(review): the dense table has one cell per (movie, vocabulary word) and
# all of it is written to CSV — memory use and file size grow with both
# dimensions; confirm that is acceptable for this dataset.
doc_term_matrix = tfidf_matrix.todense()
# One row per movie, labelled with its full 'tags' text; one column per word.
data_frame_database = pd.DataFrame(doc_term_matrix, columns=tfidf.get_feature_names_out(), index=data_content_based.tags)
data_frame_database.to_csv('movies_database_tfidf.csv', index=True)

In [None]:
data_frame_database.head()

In [None]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim

dataframe of the similarity matrix with rows and columns as movie titles

In [None]:
similarity_matrix = pd.DataFrame(cosine_sim, columns=data_content_based.title, index=data_content_based.title)
similarity_matrix.head()

Obtaining the top 10 movies similar to the movie from a given title

In [None]:
# Reverse lookup: movie title -> row label in data_content_based.
# Series.drop_duplicates() compares the *values* (the row labels), so it never
# removed repeated titles from the index. Drop repeated index entries instead,
# keeping the first occurrence, so that indices[title] always yields a single
# label rather than a Series when two movies share a title.
indices = pd.Series(data_content_based.index, index=data_content_based['title'])
indices = indices[~indices.index.duplicated(keep='first')]
indices

Function that takes in movie title as input and outputs most similar movies

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=12):
    """Return the titles of the movies most similar to *title*.

    Args:
        title: Movie title to look up in the module-level ``indices`` mapping.
        cosine_sim: Square similarity matrix indexed by row position; the
            default is the ``cosine_sim`` that existed when this function
            was defined.
        top_n: Number of titles to return (default 12).

    Returns:
        A Series of titles from ``data_content_based``, most similar first.
        The queried movie itself is never part of the result.

    Raises:
        KeyError: If *title* is not present in ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]
    # A title shared by several movies makes the lookup return a Series;
    # fall back to the first match instead of indexing with the whole Series.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Rank every movie by its similarity to the requested one. sorted() is
    # stable, so equal scores keep their original row order.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Exclude the movie itself explicitly. Skipping the first ranked entry is
    # only correct when the movie ranks first, which fails when another row
    # ties with it (e.g. identical feature text) and appears earlier.
    movie_indices = [row for row, _score in ranked if row != idx][:top_n]

    return data_content_based['title'].iloc[movie_indices]

In [None]:
get_recommendations('The Godfather', cosine_sim)

In [None]:
# Persist the content-based frame. Context managers close both file handles
# deterministically; the bare open(...) calls were never closed explicitly.
with open('movies_list_part1.pkl', 'wb') as pkl_file:
    pickle.dump(data_content_based, pkl_file)
# Read the file back as a round-trip check; the trailing expression keeps the
# notebook display of the loaded object.
with open('movies_list_part1.pkl', 'rb') as pkl_file:
    reloaded_movies_part1 = pickle.load(pkl_file)
reloaded_movies_part1

In [None]:
# Persist the TF-IDF similarity matrix. Context managers close both file
# handles deterministically; the bare open(...) calls were never closed.
with open('similarity_part1.pkl', 'wb') as pkl_file:
    pickle.dump(cosine_sim, pkl_file)
# Read the file back as a round-trip check; the trailing expression keeps the
# notebook display of the loaded object.
with open('similarity_part1.pkl', 'rb') as pkl_file:
    reloaded_similarity_part1 = pickle.load(pkl_file)
reloaded_similarity_part1

The above has recommendations based on genre

Modification of the above code: recommendations based on the top 3 actors, the director, the top 3 genres and the top 3 keywords of the given movie.

In [None]:
data_content_based.head()

cleaning data i.e all lower case and remove spaces between names

In [None]:
def clean_data(x):
    """Normalise names to lower case with every space removed.

    A list is cleaned element by element, a string is cleaned directly, and
    any other value (for example a missing director) becomes ``''``.
    """
    def _squash(text):
        # "Tom Hanks" -> "tomhanks"
        return str.lower(text.replace(" ", ""))

    if isinstance(x, list):
        return [_squash(item) for item in x]
    if isinstance(x, str):
        return _squash(x)
    return ''

Cleaning the data in `features`. The variable name `features` might need to change because it is already being used in the demographic filtering mode.

In [None]:
# Columns normalised with clean_data (lower-cased, spaces removed).
features = ['cast', 'keywords', 'director', 'genres']
for feature in features:
    # Overwrites each column in place with its cleaned version.
    data_content_based[feature] = data_content_based[feature].apply(clean_data)

In [None]:
def create_combined_features(x):
    """Join a row's keywords, cast, director and genres into one string.

    ``'keywords'``, ``'cast'`` and ``'genres'`` are each space-joined;
    ``'director'`` is used as is. The four pieces are then separated by
    single spaces, in that order.
    """
    pieces = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(pieces)
data_content_based['combined_features'] = data_content_based.apply(create_combined_features, axis=1)

creating a word vector of the entire corpus and provides the frequency of the each word in the document.

In [None]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(data_content_based['combined_features'])
count_matrix.shape

(movies,unique words )

Converting count matrix to Pandas Dataframe to view word frequencies

In [None]:
# todense() expands the sparse count matrix into a dense one.
# NOTE(review): as with the TF-IDF export, the dense table has one cell per
# (movie, vocabulary word) and all of it is written to CSV — confirm the
# memory and file size are acceptable.
doc_term_matrix = count_matrix.todense()
# One row per movie, labelled with its 'combined_features' text; one column per word.
data_frame_database2 = pd.DataFrame(doc_term_matrix, columns=count.get_feature_names_out(), index=data_content_based.combined_features)
# os.path.join with a single argument returns that path unchanged.
data_frame_database2.to_csv(os.path.join(r'movies_database_countmatrix.csv'), index=True)

In [None]:
data_frame_database2.head()

Computing Similarity Score using Cosine Similarity

In [None]:
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
cosine_sim2

Creating dataframe of the similarity matrix.

In [None]:
sim2 = pd.DataFrame(cosine_sim2, columns=data_content_based.title, index=data_content_based.title)
sim2.head()

Resetting the index of our main DataFrame and constructing the reverse mapping as before

In [None]:
# reset_index() without drop=True keeps the previous index as an 'index' column.
data_content_based = data_content_based.reset_index()
# Reverse lookup: movie title -> row position. Repeated titles are dropped
# (first occurrence kept) so that indices[title] is always a single position;
# with duplicates left in, the lookup returns a Series for shared titles.
indices = pd.Series(data_content_based.index, index=data_content_based['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
get_recommendations('The Godfather', cosine_sim2)

In [None]:
# Persist the enriched content-based frame; the context manager closes the
# file handle deterministically instead of leaving it to garbage collection.
with open('movies_list_part2.pkl', 'wb') as pkl_file:
    pickle.dump(data_content_based, pkl_file)

In [None]:
# Read the pickle back as a round-trip check. The context manager closes the
# file handle; the trailing expression keeps the notebook display.
with open('movies_list_part2.pkl', 'rb') as pkl_file:
    reloaded_movies_part2 = pickle.load(pkl_file)
reloaded_movies_part2

In [None]:
# Persist the count-based similarity matrix; the context manager closes the
# file handle deterministically instead of leaving it to garbage collection.
with open('similarity_part2.pkl', 'wb') as pkl_file:
    pickle.dump(cosine_sim2, pkl_file)

In [None]:
# Read the pickle back as a round-trip check. The context manager closes the
# file handle; the trailing expression keeps the notebook display.
with open('similarity_part2.pkl', 'rb') as pkl_file:
    reloaded_similarity_part2 = pickle.load(pkl_file)
reloaded_similarity_part2

# from main_mrs

In [None]:
cv=CountVectorizer(max_features=10000, stop_words='english')

In [None]:
cv

In [None]:
new_data=data_content_based

In [None]:
vector=cv.fit_transform(new_data['tags'].values.astype('U')).toarray()

In [None]:
vector.shape

In [None]:
similarity=cosine_similarity(vector)

In [None]:
similarity

In [None]:
new_data[new_data['title']=="The Godfather"].index[0]

In [None]:
# Look the row up by title instead of hard-coding row 2: that constant only
# refers to "The Godfather" if the movie happens to sit at that position in
# the loaded dataset.
godfather_index = new_data[new_data['title']=="The Godfather"].index[0]
# Rank all rows by similarity to that movie, highest first.
distance = sorted(enumerate(similarity[godfather_index]), reverse=True, key=lambda pair: pair[1])
# Print the 12 best-ranked titles (the queried row is not filtered out).
for i in distance[0:12]:
    print(new_data.iloc[i[0]].title)

In [None]:
def recommand(movies):
    """Print the 12 titles ranked most similar to the given movie title.

    Args:
        movies: Title to look up in ``new_data['title']``; the first matching
            row is used.

    The ranking comes from the module-level ``similarity`` matrix. The queried
    row itself is not filtered out of the printed list.

    Raises:
        IndexError: If no row of ``new_data`` has that title.
    """
    row_label = new_data[new_data['title'] == movies].index[0]

    # Pair every row position with its similarity score, best first
    scored = list(enumerate(similarity[row_label]))
    scored.sort(key=lambda pair: pair[1], reverse=True)

    for position, _score in scored[:12]:
        print(new_data.iloc[position].title)

In [None]:
recommand("The Godfather")

In [None]:
# Persist the movie frame; the context manager closes the file handle
# deterministically instead of leaving it to garbage collection.
with open('movies_list.pkl', 'wb') as pkl_file:
    pickle.dump(new_data, pkl_file)

In [None]:
# Persist the similarity matrix; the context manager closes the file handle
# deterministically instead of leaving it to garbage collection.
with open('similarity.pkl', 'wb') as pkl_file:
    pickle.dump(similarity, pkl_file)