In [2]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import pickle
from functools import reduce

title_principals = "/content/drive/MyDrive/Projet_2/title.principals.tsv"
title_akas = "/content/drive/MyDrive/Projet_2/title.akas.tsv"
title_basics = "/content/drive/MyDrive/Projet_2/title.basics.tsv"
title_crew = "/content/drive/MyDrive/Projet_2/title.crew.tsv"
name_basics = "/content/drive/MyDrive/Projet_2/name.basics.tsv"
title_ratings = "/content/drive/MyDrive/Projet_2/title.ratings.tsv"
tmdb = '/content/drive/MyDrive/Projet_2/tmdb_full.csv'



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# # Filtering:

# title_info
title_info = pd.read_csv(title_basics, sep='\t', usecols= ['tconst', 'originalTitle', 'titleType', 'genres', 'startYear'])

# # directors
directors = pd.read_csv(title_crew, sep='\t', usecols= ['tconst', 'directors'])

# # average rating
average_rating = pd.read_csv(title_ratings, sep='\t', usecols=['tconst', 'averageRating', 'numVotes'])


In [4]:
# # more_info (tmdb)
more_info = pd.read_csv(tmdb, sep=',', usecols= ['imdb_id', 'original_language', 'overview'])

# changing 'imdb_id' column's name to 'tconst'
more_info = more_info.rename(columns={'imdb_id': 'tconst'})

In [5]:
# # merging:

# list of dataframe to merge
dfs = [title_info, directors, average_rating, more_info]

# merging dataframes
merged_df = reduce(lambda left, right: pd.merge(left, right, on='tconst'), dfs)

In [6]:
merged_df_top10K = merged_df.nlargest(10000, 'numVotes')

In [7]:
director_name = pd.read_csv(name_basics, sep='\t', usecols= ['nconst', 'primaryName'])

In [8]:
# # Retrieving Directors names instead of Ids

# Create a dictionary mapping IDs to directors names
director_name_dict = dict(zip(director_name['nconst'], director_name['primaryName']))

# Define the function to get director names
def get_director_names(ids_str):
    ids_list = ids_str.split(',')
    names_list = [director_name_dict.get(id, '') for id in ids_list]
    return ' '.join(names_list)

# Apply the function to create the director_names column
merged_df_top10K['director_names'] = merged_df_top10K['directors'].apply(get_director_names)


In [9]:
merged_df_top10K.head(2)

Unnamed: 0,tconst,titleType,originalTitle,startYear,genres,directors,averageRating,numVotes,original_language,overview,director_names
59071,tt0111161,movie,The Shawshank Redemption,1994,Drama,nm0001104,9.3,2880568,en,Framed in the 1940s for the double murder of h...,Frank Darabont
122640,tt0468569,movie,The Dark Knight,2008,"Action,Crime,Drama",nm0634240,9.0,2863134,en,Batman raises the stakes in his war on crime. ...,Christopher Nolan


In [10]:
# Dropping the 'Directors Id' column:
merged_df_top10K= merged_df_top10K.drop(columns= ['directors'])

# Replacing coma with space in 'genres' column
merged_df_top10K['genres'] = merged_df_top10K['genres'].str.replace(',', ' ')

merged_df_top10K.head()

Unnamed: 0,tconst,titleType,originalTitle,startYear,genres,averageRating,numVotes,original_language,overview,director_names
59071,tt0111161,movie,The Shawshank Redemption,1994,Drama,9.3,2880568,en,Framed in the 1940s for the double murder of h...,Frank Darabont
122640,tt0468569,movie,The Dark Knight,2008,Action Crime Drama,9.0,2863134,en,Batman raises the stakes in his war on crime. ...,Christopher Nolan
163144,tt1375666,movie,Inception,2010,Action Adventure Sci-Fi,8.8,2543775,en,"Cobb, a skilled thief who commits corporate es...",Christopher Nolan
67906,tt0137523,movie,Fight Club,1999,Drama,8.8,2316350,en,A ticking-time-bomb insomniac and a slippery s...,David Fincher
58301,tt0109830,movie,Forrest Gump,1994,Drama Romance,8.8,2250065,en,A man with a low IQ has accomplished great thi...,Robert Zemeckis


In [11]:
merged_df_top10K['original_language'].unique()

array(['en', 'ko', 'fr', 'ja', 'it', 'pt', 'es', 'hi', 'de', 'da', 'zh',
       'fa', 'cn', 'sv', 'ta', 'id', 'te', 'kn', 'ru', 'tr', 'el', 'no',
       'ar', 'pl', 'sr', 'nl', 'nb', 'th', 'fi', 'ro', 'he', 'hu', 'et',
       'bs', 'ml', 'kk', 'bn', 'az', 'is', 'sh', 'tn', 'ur', 'ga', 'ku',
       'uk', 'mk', 'gl', 'cs', 'xx', 'tl', 'mr', 'ka', 'or', 'hy', 'eu'],
      dtype=object)

In [12]:
language_names = {
    'en': 'English',
    'ko': 'Korean',
    'fr': 'French',
    'ja': 'Japanese',
    'it': 'Italian',
    'pt': 'Portuguese',
    'es': 'Spanish',
    'hi': 'Hindi',
    'de': 'German',
    'da': 'Danish',
    'zh': 'Chinese',
    'fa': 'Farsi',
    'cn': 'Chinese',
    'sv': 'Swedish',
    'ta': 'Tamil',
    'id': 'Indonesian',
    'te': 'Telugu',
    'kn': 'Kannada',
    'ru': 'Russian',
    'tr': 'Turkish',
    'el': 'Greek',
    'no': 'Norwegian',
    'ar': 'Arabic',
    'pl': 'Polish',
    'sr': 'Serbian',
    'nl': 'Dutch',
    'nb': 'Norwegian',
    'th': 'Thai',
    'fi': 'Finnish',
    'ro': 'Romanian',
    'he': 'Hebrew',
    'hu': 'Hungarian',
    'et': 'Estonian',
    'bs': 'Bosnian',
    'ml': 'Malayalam',
    'kk': 'Kazakh',
    'bn': 'Bengali',
    'az': 'Azerbaijani',
    'is': 'Icelandic',
    'sh': 'Serbo-Croatian',
    'tn': 'Tswana',
    'ur': 'Urdu',
    'ga': 'Irish',
    'ku': 'Kurdish',
    'uk': 'Ukrainian',
    'mk': 'Macedonian',
    'gl': 'Galician',
    'cs': 'Czech',
    'xx': 'Unknown',
    'tl': 'Tagalog',
    'mr': 'Marathi',
    'ka': 'Georgian',
    'or': 'Oriya',
    'hy': 'Armenian',
    'eu': 'Basque'
}

merged_df_top10K['original_language'] = merged_df_top10K['original_language'].map(language_names)

In [13]:
merged_df_top10K['soup'] = merged_df_top10K[['originalTitle', 'genres', 'original_language', 'overview', 'director_names']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)


In [14]:
merged_df_top10K.head()

Unnamed: 0,tconst,titleType,originalTitle,startYear,genres,averageRating,numVotes,original_language,overview,director_names,soup
59071,tt0111161,movie,The Shawshank Redemption,1994,Drama,9.3,2880568,English,Framed in the 1940s for the double murder of h...,Frank Darabont,The Shawshank Redemption Drama English Framed ...
122640,tt0468569,movie,The Dark Knight,2008,Action Crime Drama,9.0,2863134,English,Batman raises the stakes in his war on crime. ...,Christopher Nolan,The Dark Knight Action Crime Drama English Bat...
163144,tt1375666,movie,Inception,2010,Action Adventure Sci-Fi,8.8,2543775,English,"Cobb, a skilled thief who commits corporate es...",Christopher Nolan,Inception Action Adventure Sci-Fi English Cobb...
67906,tt0137523,movie,Fight Club,1999,Drama,8.8,2316350,English,A ticking-time-bomb insomniac and a slippery s...,David Fincher,Fight Club Drama English A ticking-time-bomb i...
58301,tt0109830,movie,Forrest Gump,1994,Drama Romance,8.8,2250065,English,A man with a low IQ has accomplished great thi...,Robert Zemeckis,Forrest Gump Drama Romance English A man with ...


In [40]:
merged_df_top10K.query('originalTitle.str.contains("", case=False)')

Unnamed: 0,index,tconst,titleType,originalTitle,startYear,genres,averageRating,numVotes,original_language,overview,director_names,soup
83,44764,tt0086250,movie,Scarface,1983,Crime Drama,8.3,914039,English,After getting a green card in exchange for ass...,Brian De Palma,Scarface Crime Drama English After getting a g...
5826,5243,tt0023427,movie,Scarface,1932,Action Crime Drama,7.7,30236,English,"In 1920s Chicago, Italian immigrant and notori...",Howard Hawks Richard Rosson,Scarface Action Crime Drama English In 1920s C...


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(merged_df_top10K['soup'])
tfidf_matrix.shape

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

merged_df_top10K = merged_df_top10K.reset_index()
titles = merged_df_top10K['originalTitle']
indices = pd.Series(merged_df_top10K.index, index=merged_df_top10K['originalTitle'])


In [16]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [17]:
# get_recommendations('snatch').head(10)

In [24]:
from fuzzywuzzy import process

def get_recommendations(title):
    # Use fuzzy matching to find the closest matching titles
    closest_match, score = process.extractOne(title, indices.keys())

    # If the similarity score is 100, directly proceed with the recommendation
    if score == 100:
        chosen_title = closest_match
        idx = indices[chosen_title]
    else:
        # Display the closest match to the user
        print("Did you mean:")
        print(f"{closest_match} (Similarity score: {score})")

        # Ask the user to confirm the chosen title
        confirm = input("Is this the title you were looking for? (yes/no): ")

        if confirm.lower() == "yes":
            chosen_title = closest_match
        else:
            # Display the five closest matching titles
            print("\nClosest matching titles:")
            closest_matches = process.extract(title, indices.keys(), limit=10)
            for i, match in enumerate(closest_matches, start=1):
                print(f"{i}. {match[0]} (Similarity score: {match[1]})")

            # Ask the user to input the index of the chosen title
            index = int(input("\nEnter the index of the title you were looking for: "))
            chosen_title = closest_matches[index - 1][0]

        # Get the index of the chosen title
        idx = indices[chosen_title]

    # Get similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get top recommendations
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]

    return indices.iloc[movie_indices]



In [43]:
get_recommendations('batman').head(10)

originalTitle
Batman Returns                              618
Son of Batman                              5711
Batman Forever                              815
The Batman                                  134
Batman Beyond: Return of the Joker         5894
Batman Begins                                18
Batman: Gotham by Gaslight                 7789
Batman: The Dark Knight Returns, Part 2    3818
Batman & Robin                              808
Batman: The Killing Joke                   3590
dtype: int64