In [1]:
import pandas as pd
import numpy as np

In [2]:
movies_df = pd.read_csv("Movie.csv",sep=';')
movies_df.head(11)

Unnamed: 0,movie_id,genres
0,1,"Comedy,Drama"
1,2,"Drama,Sci-fi"
2,3,"Adventure,Drama"
3,4,"Action,War"
4,5,"Action,War"
5,6,"Action,Drama"
6,7,"Adventure,Sci-fi"
7,8,"Adventure,Sci-fi"
8,9,"Action,Sci-fi"
9,10,"Action,Adventure"


In [3]:
user_df = pd.DataFrame({"movie_id":[13,3,4,1,5,2,2,2,5,2]})
user_df

Unnamed: 0,movie_id
0,13
1,3
2,4
3,1
4,5
5,2
6,2
7,2
8,5
9,2


In [4]:
movies_df_merge = movies_df.merge(user_df, on='movie_id')
movies_df_merge.head(10)

Unnamed: 0,movie_id,genres
0,1,"Comedy,Drama"
1,2,"Drama,Sci-fi"
2,2,"Drama,Sci-fi"
3,2,"Drama,Sci-fi"
4,2,"Drama,Sci-fi"
5,3,"Adventure,Drama"
6,4,"Action,War"
7,5,"Action,War"
8,5,"Action,War"
9,13,"Action,Sci-fi"


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3),
            stop_words = 'english')

In [12]:
movies_df_merge.drop_duplicates(subset = 'movie_id',inplace=True)
movies_df_merge.reset_index(drop=True, inplace=True)
movies_df_merge.head(10)

Unnamed: 0,movie_id,genres
0,1,"Comedy,Drama"
1,2,"Drama,Sci-fi"
2,3,"Adventure,Drama"
3,4,"Action,War"
4,5,"Action,War"
5,13,"Action,Sci-fi"


In [7]:
# Fitting the TF-IDF on the 'overview' text
tfv_matrix = tfv.fit_transform(movies_df_merge['genres'])

In [8]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Compute the sigmoid kernel
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)

In [9]:
# Reverse mapping of indices and movie titles
indices = pd.Series(movies_df_merge.index, index=movies_df_merge['movie_id'])
indices

movie_id
1     0
2     1
2     2
2     3
2     4
3     5
4     6
5     7
5     8
13    9
dtype: int64

In [10]:
def give_rec(movie_id, sig=sig):
    # Get the index corresponding to original_title
    idx = indices[movie_id]

    # Get the pairwsie similarity scores 
    sig_scores = list(enumerate(sig[idx]))

    # Sort the movies 
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)

    # Scores of the 10 most similar movies
    sig_scores = sig_scores[1:]

    # Movie indices
    movie_indices = [i[0] for i in sig_scores]

    # Top 10 most similar movies
    return movies_df_merge['movie_id'].iloc[movie_indices]

In [11]:
give_rec(3)

5     3
1     2
2     2
3     2
4     2
6     4
7     5
8     5
9    13
Name: movie_id, dtype: int64