In [2]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

In [3]:
#import IMDb movie csv
csv = os.path.join("/home/lambert/filmsyl/filmsyl/raw_data", "imdb_movies.csv")
imdb_df = pd.read_csv(csv)
imdb_df.replace({'\\N': np.nan, '': np.nan}, inplace=True)
imdb_df

Unnamed: 0,startYear,runtimeMinutes,genres,titleId,title,averageRating,numVotes,Director
0,1894.0,45.0,Romance,tt0000009,Miss Jerry,5.3,208.0,Alexander Black
1,1897.0,100.0,"Documentary,News,Sport",tt0000147,The Corbett-Fitzsimmons Fight,5.2,502.0,Enoch J. Rector
2,1906.0,70.0,"Action,Adventure,Biography",tt0000574,The Story of the Kelly Gang,6.0,871.0,Charles Tait
3,1907.0,90.0,Drama,tt0000591,The Prodigal Son,5.0,22.0,Michel Carré
4,1908.0,,Drama,tt0000630,Hamlet,2.9,27.0,Mario Caserini
...,...,...,...,...,...,...,...,...
309802,,,Thriller,tt9916188,Minotaur,,,Dean Israelite
309803,2020.0,95.0,"Action,Adventure,Thriller",tt9916190,Safeguard,3.6,254.0,Fraser Precious
309804,2020.0,92.0,"Drama,History",tt9916362,Coven,6.4,5738.0,Pablo Agüero
309805,2019.0,,"Adventure,History,War",tt9916428,The Secret of China,3.4,18.0,Jixing Wang


In [4]:
#import Netflix movie csv
csv_ = os.path.join("/home/lambert/filmsyl/filmsyl/raw_data", "jakob_movies.csv")
netflix_df = pd.read_csv(csv_)
netflix_df.replace({'\\N': np.nan, '': np.nan}, inplace=True)
netflix_df

Unnamed: 0,startYear,runtimeMinutes,genres,titleId,title,averageRating,numVotes,Director
0,1914.0,,Drama,tt0003859,Dope,,,Herman Lieb
1,1915.0,,"Adventure,Romance",tt0005196,The Dictator,7.7,48.0,Oscar Eagle
2,1915.0,50.0,Western,tt0005200,The Disciple,6.2,47.0,
3,1920.0,60.0,Drama,tt0011467,The Outsider,,,Edmund Mortimer
4,1921.0,60.0,Western,tt0012350,The Killer,6.9,23.0,
...,...,...,...,...,...,...,...,...
261,2020.0,116.0,"Action,Thriller",tt8936646,Extraction,6.8,260424.0,Sam Hargrave
262,,,"Horror,Thriller",tt9076574,Lift,,,Jeff Kapp
263,2021.0,90.0,Horror,tt9280562,The Killer,,,
264,2020.0,109.0,"Biography,Drama,Music",tt9694312,Stardust,4.4,2279.0,Gabriel Range


In [5]:
netflix_df.dropna(inplace=True)
imdb_df.dropna(inplace=True)

In [6]:
#combine imdb text features. 
imdb_df['text_features']= imdb_df['genres'] + ' ' + imdb_df['Director']
imdb_df['text_features']

0                           Romance Alexander Black
1            Documentary,News,Sport Enoch J. Rector
2           Action,Adventure,Biography Charles Tait
3                                Drama Michel Carré
11        Biography,Drama,Family J. Stuart Blackton
                            ...                    
309788                              Drama Laura Jou
309800           Comedy,Drama,Fantasy Hideki Kiyota
309801                        Drama Tamar Guimaraes
309803    Action,Adventure,Thriller Fraser Precious
309804                   Drama,History Pablo Agüero
Name: text_features, Length: 145598, dtype: object

In [7]:
imdb_dfnew=imdb_df.drop(columns=['genres','Director','averageRating','titleId','startYear','numVotes','runtimeMinutes'])
imdb_dfnew

Unnamed: 0,title,text_features
0,Miss Jerry,Romance Alexander Black
1,The Corbett-Fitzsimmons Fight,"Documentary,News,Sport Enoch J. Rector"
2,The Story of the Kelly Gang,"Action,Adventure,Biography Charles Tait"
3,The Prodigal Son,Drama Michel Carré
11,Forty Years in the Land of the Midian,"Biography,Drama,Family J. Stuart Blackton"
...,...,...
309788,Life Without Sara Amat,Drama Laura Jou
309800,The Last White Witch,"Comedy,Drama,Fantasy Hideki Kiyota"
309801,The Rehearsal,Drama Tamar Guimaraes
309803,Safeguard,"Action,Adventure,Thriller Fraser Precious"


In [8]:
netflix_df['text_features'] = netflix_df['title'] + ' ' + netflix_df['genres'] + ' ' + netflix_df['Director']

In [9]:
netflix_dfnew=netflix_df.drop(columns=['genres','Director','averageRating','titleId','startYear','numVotes','runtimeMinutes'])

In [10]:
netflix_dfnew

Unnamed: 0,title,text_features
5,The Dictator,"The Dictator Adventure,Comedy,Drama James Cruze"
9,Dope,Dope Drama Kurt Gerron
10,The Man from Toronto,"The Man from Toronto Comedy,Romance Sinclair Hill"
11,The Dictator,"The Dictator Comedy,Drama,War Charles Chaplin"
12,The Outsider,"The Outsider Drama,Romance Paul L. Stein"
...,...,...
259,Dolemite Is My Name,"Dolemite Is My Name Biography,Comedy,Drama Cra..."
260,The Swimmers,"The Swimmers Biography,Drama,Sport Sally El Ho..."
261,Extraction,"Extraction Action,Thriller Sam Hargrave"
264,Stardust,"Stardust Biography,Drama,Music Gabriel Range"


In [11]:
# Initialize CountVectorizer
vectorizer = CountVectorizer(stop_words='english')

In [12]:
# Fit and transform the text data for IMDb
imdb_text_matrix = vectorizer.fit_transform(imdb_df['text_features'])
# Fit and transform the text data for Netflix
netflix_text_matrix = vectorizer.transform(netflix_df['text_features'])

In [25]:
knn_model = NearestNeighbors(n_neighbors=5, metric='cosine')
knn_model.fit(imdb_text_matrix)

In [68]:
# Preprocess the text features of the new movie
new_movie = "Drugs, and family James Wan "  # Example text features of the new movie
new_movie_text_matrix = vectorizer.transform([new_movie])

# Nearest neighbors for the new movie
distances, indices = knn_model.kneighbors(new_movie_text_matrix)

# Get the indices of the nearest neighbors in the IMDb dataset
nearest_neighbor_indices = indices[0]

# Get the title of the suggested movie
suggested_movie_title = imdb_df.iloc[nearest_neighbor_indices]['title'].values[0]
print("IMDb Suggestion:", suggested_movie_title)


IMDb Suggestion: Peril for the Guy
