In [9]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from fuzzywuzzy import process
import ast

# Load the movie and credits data
movies = pd.read_csv('datasets/tmdb_5000_movies.csv')
credits = pd.read_csv('datasets/tmdb_5000_credits.csv')

# Rename 'movie_id' to 'id' in the credits dataframe to match the movies dataframe
credits = credits.rename(columns={'movie_id': 'id'})

# Merge the movies dataframe with the credits dataframe on 'id'.
# The merge leaves two title columns, 'title_x' and 'title_y'; keep the
# first under the plain name 'title' and discard the second.
movies = movies.merge(credits, on='id')
movies = movies.drop(columns=['title_y']).rename(columns={'title_x': 'title'})


def _extract_names(raw):
    """Return the 'name' of every entry in a stringified list of dicts.

    Missing values (NaN / non-strings) and blank strings yield an empty list.
    The previous approach filled NaN with '' and then called
    ast.literal_eval(''), which raises SyntaxError instead of producing [].
    """
    if not isinstance(raw, str) or not raw.strip():
        return []
    return [entry['name'] for entry in ast.literal_eval(raw)]


# Convert genres from string to list of genre names
movies['genres'] = movies['genres'].apply(_extract_names)

# Convert cast from string to list of actor names (the full cast is kept)
movies['cast'] = movies['cast'].apply(_extract_names)

# Combine genres and cast into a single space-separated string (for each movie).
# NOTE(review): names are joined with spaces, so the vectorizer's default
# tokenizer treats each word of a multi-word name as a separate term —
# confirm this is the intended feature granularity.
movies['combined_features'] = (
    movies['genres'].apply(' '.join) + ' ' + movies['cast'].apply(' '.join)
)

# Initialize the TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit the vectorizer on the 'combined_features' column
tfidf_matrix = tfidf.fit_transform(movies['combined_features'])

# Use KNN to find the most similar movies based on TF-IDF features
knn = NearestNeighbors(n_neighbors=6, metric='cosine')
knn.fit(tfidf_matrix)

# Function to get movie recommendations using KNN
def recommend_movie_knn(movie_title, knn, movies_df, tfidf_matrix, top_n=5):
    """Recommend movies similar to the one best matching ``movie_title``.

    Args:
        movie_title: Free-text title; it is fuzzy-matched against
            ``movies_df['title']``.
        knn: Fitted nearest-neighbours model exposing ``kneighbors``.
        movies_df: DataFrame with a ``'title'`` column. Its rows are assumed
            to be in the same order as the rows of ``tfidf_matrix``.
        tfidf_matrix: Feature matrix with one row per movie. It is assumed to
            be the matrix ``knn`` was fitted on.
        top_n: Maximum number of recommendations to return.

    Returns:
        A Series of up to ``top_n`` recommended titles, or ``None`` (after
        printing a message) when no title matches with a score of at least 70.
    """
    titles = movies_df['title'].values

    # Use fuzzy matching to find the best match for the movie title.
    # extractOne yields None when there are no candidates to choose from.
    match = process.extractOne(movie_title, titles) if len(titles) else None

    if match is None or match[1] < 70:  # Threshold to avoid incorrect matches
        print(f"Movie '{movie_title}' not found or doesn't match closely enough.")
        return None  # Return None if no good match is found
    best_match = match[0]

    # Use the row *position* of the first matching title rather than its
    # index label: tfidf_matrix and .iloc are positional, so a label is only
    # correct when movies_df happens to carry a default RangeIndex.
    movie_pos = int((movies_df['title'] == best_match).to_numpy().argmax())

    # Ask for one extra neighbour to leave room for the query movie itself,
    # but never more than the number of available rows (kneighbors raises
    # ValueError otherwise).
    n_query = min(top_n + 1, tfidf_matrix.shape[0])
    _, indices = knn.kneighbors(tfidf_matrix[movie_pos], n_neighbors=n_query)

    # Drop the query movie by position instead of assuming it is ranked first;
    # with tied distances (identical feature vectors) it may appear elsewhere.
    neighbour_rows = [row for row in indices[0] if row != movie_pos][:top_n]

    return movies_df['title'].iloc[neighbour_rows]

# Example: recommend 5 movies similar to the title that best matches 'spiderman' using KNN
recommended_movies = recommend_movie_knn('spiderman', knn, movies, tfidf_matrix, top_n=5)
# recommend_movie_knn prints its own message and returns None when nothing
# matches, so only print an actual result (avoids a stray "None" line).
if recommended_movies is not None:
    print(recommended_movies)


30            Spider-Man 2
5             Spider-Man 3
2944      Army of Darkness
1868    Cradle 2 the Grave
382             Seabiscuit
Name: title, dtype: object


<bound method NDFrame.head of          budget                                         genres  \
0     237000000  [Action, Adventure, Fantasy, Science Fiction]   
1     300000000                   [Adventure, Fantasy, Action]   
2     245000000                     [Action, Adventure, Crime]   
3     250000000               [Action, Crime, Drama, Thriller]   
4     260000000           [Action, Adventure, Science Fiction]   
...         ...                                            ...   
4798     220000                      [Action, Crime, Thriller]   
4799       9000                              [Comedy, Romance]   
4800          0             [Comedy, Drama, Romance, TV Movie]   
4801          0                                             []   
4802          0                                  [Documentary]   

                                               homepage      id  \
0                           http://www.avatarmovie.com/   19995   
1          http://disney.go.com/disneypictu