In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from filmsyl.netflix.netflix import clean_titles
from filmsyl.data.data import get_imdb, get_netflix_example
from filmsyl.settings import *

In [2]:
# Load the IMDb table through the project's data helper. The bare name on the
# last line makes the notebook render the frame as this cell's output.
imdb_df = get_imdb()
imdb_df

✅ imdb csv loaded
✅ data cleaned


Unnamed: 0,primaryTitle,startYear,runtimeMinutes,genres,titleId,title,averageRating,numVotes,Director,plot
0,Miss Jerry,1894.0,45,Romance,tt0000009,Miss Jerry,5.3,208.0,Alexander Black,The adventures of a female reporter in the 1890s.
1,The Corbett-Fitzsimmons Fight,1897.0,100,"Documentary,News,Sport",tt0000147,The Corbett-Fitzsimmons Fight,5.2,502.0,Enoch J. Rector,"This legendary fight was filmed on March 17, 1..."
2,The Story of the Kelly Gang,1906.0,70,"Action,Adventure,Biography",tt0000574,The Story of the Kelly Gang,6.0,871.0,Charles Tait,Just as Galeen and Wegener's Der Golem (1915) ...
3,The Prodigal Son,1907.0,90,Drama,tt0000591,The Prodigal Son,5.0,22.0,Michel Carré,The first feature-length motion picture produc...
11,The Life of Moses,1909.0,50,"Biography,Drama,Family",tt0001285,Forty Years in the Land of the Midian,5.4,60.0,J. Stuart Blackton,Released in five parts (The Persecution of the...
...,...,...,...,...,...,...,...,...,...,...
309924,9/11: Escape from the Towers,2018,120,Documentary,tt9914644,9/11: Escape from the Towers,8.3,155.0,Grace Chapman,Each World Trade Center tower consisted of 110...
309930,Life Without Sara Amat,2019,74,Drama,tt9914942,Life Without Sara Amat,6.7,206.0,Laura Jou,"Pep, a 13-year-old boy, is in love with a girl..."
309942,The Last White Witch,2019,97,"Comedy,Drama,Fantasy",tt9915872,The Last White Witch,6.4,9.0,Hideki Kiyota,"One day, a charming and mystical girl, Fuka, a..."
309945,Safeguard,2020,95,"Action,Adventure,Thriller",tt9916190,Safeguard,3.6,254.0,Fraser Precious,When a Japanese restaurant is extorted by the ...


In [3]:
# Load the example Netflix table through the project's data helper and render
# it as this cell's output.
netflix_df = get_netflix_example()
netflix_df

✅ data cleaned
✅ netflix csv loaded


Unnamed: 0,startYear,runtimeMinutes,genres,titleId,title,averageRating,numVotes,Director
5,1922.0,60.0,"Adventure,Comedy,Drama",tt0013071,The Dictator,2.5,22.0,James Cruze
9,1932.0,107.0,Drama,tt0023679,Dope,7.2,32.0,Kurt Gerron
10,1933.0,77.0,"Comedy,Romance",tt0024298,The Man from Toronto,6.4,146.0,Sinclair Hill
11,1940.0,125.0,"Comedy,Drama,War",tt0032553,The Dictator,8.4,236108.0,Charles Chaplin
12,1939.0,90.0,"Drama,Romance",tt0032884,The Outsider,5.5,49.0,Paul L. Stein
...,...,...,...,...,...,...,...,...
259,2019.0,118.0,"Biography,Comedy,Drama",tt8526872,Dolemite Is My Name,7.2,64685.0,Craig Brewer
260,2022.0,134.0,"Biography,Drama,Sport",tt8745676,The Swimmers,7.4,35500.0,Sally El Hosaini
261,2020.0,116.0,"Action,Thriller",tt8936646,Extraction,6.8,260424.0,Sam Hargrave
264,2020.0,109.0,"Biography,Drama,Music",tt9694312,Stardust,4.4,2279.0,Gabriel Range


In [4]:
# Hand-written sample of candidate films, one (title, year, director, genre)
# record per line so each film can be read and edited as a unit.
_MOVIE_COLUMNS = ('Title', 'Year', 'Director', 'Genre')
_MOVIE_ROWS = [
    ('The Matrix', 1999, 'Lana Wachowski', 'Action, Sci-Fi'),
    ('Inception', 2010, 'Christopher Nolan', 'Action, Adventure, Sci-Fi'),
    ('Pulp Fiction', 1994, 'Quentin Tarantino', 'Crime, Drama'),
    ('The Shawshank Redemption', 1994, 'Frank Darabont', 'Drama'),
    ('The Godfather', 1972, 'Francis Ford Coppola', 'Crime, Drama'),
    ('Forrest Gump', 1994, 'Robert Zemeckis', 'Drama, Romance'),
    ('The Dark Knight', 2008, 'Christopher Nolan', 'Action, Crime, Drama'),
    ('Fight Club', 1999, 'David Fincher', 'Drama'),
    ('The Lord of the Rings: The Fellowship of the Ring', 2001, 'Peter Jackson', 'Action, Adventure, Drama'),
    ('Titanic', 1997, 'James Cameron', 'Drama, Romance'),
]

# Column-oriented view of the same records: column name -> list of values.
movies_data = {
    column: [row[position] for row in _MOVIE_ROWS]
    for position, column in enumerate(_MOVIE_COLUMNS)
}

# Create DataFrame
new_movies = pd.DataFrame(movies_data)

In [5]:
# Render the sample candidate films as this cell's output.
new_movies

Unnamed: 0,Title,Year,Director,Genre
0,The Matrix,1999,Lana Wachowski,"Action, Sci-Fi"
1,Inception,2010,Christopher Nolan,"Action, Adventure, Sci-Fi"
2,Pulp Fiction,1994,Quentin Tarantino,"Crime, Drama"
3,The Shawshank Redemption,1994,Frank Darabont,Drama
4,The Godfather,1972,Francis Ford Coppola,"Crime, Drama"
5,Forrest Gump,1994,Robert Zemeckis,"Drama, Romance"
6,The Dark Knight,2008,Christopher Nolan,"Action, Crime, Drama"
7,Fight Club,1999,David Fincher,Drama
8,The Lord of the Rings: The Fellowship of the Ring,2001,Peter Jackson,"Action, Adventure, Drama"
9,Titanic,1997,James Cameron,"Drama, Romance"


In [6]:
# Build one free-text field per film from genres, director and plot.
# Each part is filled with '' BEFORE concatenation: string + NaN is NaN, so a
# single missing part would otherwise blank the whole field and throw away the
# parts that are present.
imdb_df['text_features'] = (
    imdb_df['genres'].fillna('') + ' '
    + imdb_df['Director'].fillna('') + ' '
    + imdb_df['plot'].fillna('')
)

In [10]:
# Build the free-text field for the Netflix titles from genres and director.
# Each part is filled with '' BEFORE concatenation: string + NaN is NaN, so a
# missing director would otherwise discard the genres as well (and vice versa).
netflix_df['text_features'] = (
    netflix_df['genres'].fillna('') + ' ' + netflix_df['Director'].fillna('')
)

In [13]:
# Define a TF-IDF vectorizer; English stop words such as 'the' and 'a' are removed.
tfidf = TfidfVectorizer(stop_words='english')
# The vectorizer rejects NaN documents, so replace missing text with an empty
# string in BOTH frames (previously only the IMDb column was filled).
imdb_df['text_features'] = imdb_df['text_features'].fillna('')
netflix_df['text_features'] = netflix_df['text_features'].fillna('')
# Learn vocabulary and IDF weights from the IMDb texts, then project the
# Netflix texts into that same feature space.
tfidf_matrix = tfidf.fit_transform(imdb_df['text_features'])
netflix_tfidf = tfidf.transform(netflix_df['text_features'])
# One row per Netflix title, one column per IMDb title. TF-IDF rows are
# L2-normalised by default, so this dot product is the cosine similarity.
cosine_sim_netflix = linear_kernel(netflix_tfidf, tfidf_matrix)
# Average over the Netflix titles: one score per IMDb film.
mean_similarity = np.mean(cosine_sim_netflix, axis=0)
imdb_df['mean_similarity'] = mean_similarity
sorted_df = imdb_df.sort_values(by='mean_similarity', ascending=False)
# Keep only the sample candidates (matched on title alone, so several IMDb
# entries sharing a title can all appear) and list them best match first.
new_= imdb_df[imdb_df['primaryTitle'].isin(new_movies['Title'])]
new_.sort_values(by='mean_similarity', ascending=False)['primaryTitle']


139309                                            Inception
94911                                       The Dark Knight
34439                                         The Godfather
54170     The Lord of the Rings: The Fellowship of the Ring
50500                                          Pulp Fiction
156964                                           Fight Club
58310                                            Fight Club
50104                                          Forrest Gump
24397                                               Titanic
53954                                               Titanic
50607                              The Shawshank Redemption
301329                                              Titanic
Name: primaryTitle, dtype: object

In [None]:
import pandas as pd
import numpy as np
from filmsyl.settings import *
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from filmsyl.netflix.netflix import clean_titles
from filmsyl.data.data import get_imdb, get_netflix_example

def preprocess_data(imdb_df, netflix_df):
    """
    Builds the 'text_features' column used for TF-IDF on both DataFrames.

    IMDb rows combine 'genres', 'Director' and 'plot'; Netflix rows combine
    'genres' and 'Director'. Missing parts are replaced by an empty string
    *before* joining, so a film with (say) no plot still keeps its genres and
    director. (Concatenating first and filling afterwards turns the whole
    field into '' as soon as one part is NaN.)

    The inputs are not modified; new DataFrames carrying the extra column
    are returned.

    Parameters:
        imdb_df (pd.DataFrame): IMDb data with 'genres', 'Director', 'plot'.
        netflix_df (pd.DataFrame): Netflix data with 'genres', 'Director'.

    Returns:
        pd.DataFrame, pd.DataFrame: Copies of the IMDb and Netflix DataFrames
        with a NaN-free, string-valued 'text_features' column.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    imdb_text = _combine_text_columns(imdb_df, ['genres', 'Director', 'plot'])
    netflix_text = _combine_text_columns(netflix_df, ['genres', 'Director'])

    # .assign returns a new frame, leaving the caller's DataFrames untouched.
    return (
        imdb_df.assign(text_features=imdb_text),
        netflix_df.assign(text_features=netflix_text),
    )


def _combine_text_columns(df, columns):
    """Join the given columns into one space-separated string per row, treating missing values as ''."""
    parts = [df[name].fillna('').astype(str) for name in columns]
    combined = parts[0]
    for part in parts[1:]:
        combined = combined + ' ' + part
    # Strip so that a row with every part missing ends up as '' rather than
    # a run of separators.
    return combined.str.strip()

def calculate_similarity(imdb_df, netflix_df):
    """
    Scores every IMDb film by its mean TF-IDF cosine similarity to the Netflix titles.

    The vectorizer is fitted on the IMDb 'text_features' and the Netflix
    'text_features' are projected into the same space. The dot product is
    linear, so the mean over all Netflix rows of (netflix_row . imdb_row)
    equals (mean Netflix row) . imdb_row. Averaging the Netflix vectors first
    gives the same scores (up to floating-point rounding) without building
    the dense (n_netflix x n_imdb) similarity matrix.

    The input DataFrames are not modified.

    Parameters:
        imdb_df (pd.DataFrame): Preprocessed IMDb DataFrame ('text_features' column).
        netflix_df (pd.DataFrame): Preprocessed Netflix DataFrame ('text_features' column).

    Returns:
        pd.DataFrame: Copy of the IMDb DataFrame with a 'mean_similarity'
        column added. If netflix_df has no rows, every score is 0.0.
    """
    # Nothing to compare against: report "no similarity" instead of averaging
    # over zero rows.
    if len(netflix_df) == 0:
        return imdb_df.assign(mean_similarity=0.0)

    # The vectorizer rejects NaN documents; fill defensively without touching
    # the caller's columns.
    imdb_text = imdb_df['text_features'].fillna('')
    netflix_text = netflix_df['text_features'].fillna('')

    # Define TF-IDF vectorizer (rows are L2-normalised by default, so a dot
    # product between two rows is their cosine similarity).
    tfidf = TfidfVectorizer(stop_words='english')

    # Fit on IMDb text, then project the Netflix text into the same space
    tfidf_matrix = tfidf.fit_transform(imdb_text)
    netflix_tfidf = tfidf.transform(netflix_text)

    # Average Netflix vector, forced to a plain (1, n_features) ndarray
    # whatever container type the sparse mean returns.
    netflix_profile = np.asarray(netflix_tfidf.mean(axis=0)).reshape(1, -1)

    # (1, n_imdb) similarities of the average profile -> one score per IMDb film
    mean_similarity = linear_kernel(netflix_profile, tfidf_matrix).ravel()

    return imdb_df.assign(mean_similarity=mean_similarity)

def get_movie_recommendation(amount: int, imdb_df, netflix_df, new_movies=None):
    """
    Get movie recommendations based on IMDb and Netflix data.

    IMDb films are scored by their mean similarity to the Netflix titles and
    returned best match first. When new_movies is given and non-empty, only
    IMDb films whose 'primaryTitle' appears in new_movies['Title'] are
    considered. Matching is by title only, so distinct IMDb entries that
    share a title are all kept.

    Parameters:
        amount (int): Maximum number of recommendations to return (>= 0).
        imdb_df (pd.DataFrame): DataFrame containing IMDb data.
        netflix_df (pd.DataFrame): DataFrame containing Netflix data.
        new_movies (pd.DataFrame, optional): Candidate films with a 'Title'
            column. None or an empty frame means "rank the whole IMDb table".

    Returns:
        pd.Series: Series containing recommended movie titles.

    Raises:
        ValueError: If amount is negative (Series.head(-n) would silently
            return "everything except the last n" rather than n titles).
    """
    if amount < 0:
        raise ValueError(f"amount must be non-negative, got {amount}")

    # Preprocess IMDb and Netflix DataFrames
    imdb_df, netflix_df = preprocess_data(imdb_df, netflix_df)

    # Calculate similarity between IMDb and Netflix movies
    imdb_df = calculate_similarity(imdb_df, netflix_df)

    # Restrict to the candidate titles only when some were actually supplied
    candidates = imdb_df
    if new_movies is not None and not new_movies.empty:
        candidates = imdb_df[imdb_df['primaryTitle'].isin(new_movies['Title'])]

    ranked = candidates.sort_values(by='mean_similarity', ascending=False)
    return ranked['primaryTitle'].head(amount)

if __name__ == "__main__":
    # Example usage
    imdb_df = get_imdb()
    netflix_df = get_netflix_example()
    # An empty candidate frame makes the recommender rank the whole IMDb
    # table; put titles in the 'Title' column to restrict it. The earlier
    # `...` placeholder was the Ellipsis object, which has no `.empty`
    # attribute, so the call below raised AttributeError.
    new_movies = pd.DataFrame(columns=['Title'])
    amount = int(input("Enter the number of movies you want to be recommended: "))
    recommendations = get_movie_recommendation(amount, imdb_df, netflix_df, new_movies)
    print(recommendations)


In [None]:
# Example call: get_movie_recommendation(5, imdb_df, netflix_df, new_movies)