In [40]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import sys
# Put the parent directory on the module search path.
# NOTE(review): presumably for project-local modules, but nothing in this cell
# imports one — confirm this is still needed.
sys.path.append('../')

# Function that gets movie recommendations based on the cosine similarity score of movie genres
def top_sims(cosine_sim, df, filter_col, movie, n):
    """Return the row positions of the ``n`` items most similar to ``movie``.

    Parameters
    ----------
    cosine_sim : 2-D array-like
        Pairwise similarity matrix. Row/column ``i`` must describe the
        ``i``-th row of ``df`` (positional, not label based).
    df : pandas.DataFrame
        Frame holding the items, in the same row order as ``cosine_sim``.
    filter_col : str
        Name of the column of ``df`` that holds the item identifiers.
    movie : scalar
        Identifier to look up in ``df[filter_col]``.
    n : int
        Maximum number of neighbours to return.

    Returns
    -------
    list of int
        Row positions into ``df`` / ``cosine_sim``, ordered from most to least
        similar. Ties keep their original row order. The queried row itself
        is never part of the result.

    Raises
    ------
    KeyError
        If ``movie`` does not occur in ``df[filter_col]``.
    """
    # Look the movie up in the frame that was passed in (not in a notebook
    # global) and work with row positions, because that is how cosine_sim is
    # laid out. If the identifier is duplicated, the first occurrence wins.
    matches = np.flatnonzero((df[filter_col] == movie).to_numpy())
    if matches.size == 0:
        raise KeyError(movie)
    idx = int(matches[0])

    # Rank every row by descending similarity; a stable sort keeps tied rows
    # in their original order.
    scores = np.asarray(cosine_sim[idx], dtype=float)
    ranked = np.argsort(-scores, kind="stable")

    # Remove the query row explicitly. Dropping "whatever sorted first" is
    # wrong as soon as an earlier row has an identical score, which is common
    # when many movies share exactly the same genres.
    neighbours = ranked[ranked != idx]
    return neighbours[:max(n, 0)].tolist()

# Function which finds the most similar movies for every movie in dataframe
def find_similarities(train_file, similarity_file, filter_col, sim_column, n):
    """Compute, for every known movie, its ``n`` most similar movies.

    Similarity is the cosine similarity between TF-IDF vectors (word uni- and
    bi-grams, English stop words removed) built from ``sim_column``.

    Parameters
    ----------
    train_file : str or path-like
        CSV file with a ``movie`` column holding the ids of interest.
    similarity_file : str or path-like
        CSV file with the descriptive data; must contain ``filter_col`` and
        ``sim_column``.
    filter_col : str
        Column of ``similarity_file`` holding the ids that are matched against
        the ``movie`` column of ``train_file``.
    sim_column : str
        Text column of ``similarity_file`` used to compute similarities.
    n : int
        Number of neighbours to keep per movie.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``filter_col`` and ``'similar'``. Each ``'similar'`` entry
        is the list returned by ``top_sims``, i.e. row positions into this
        returned frame.

    Raises
    ------
    ValueError
        If no row of ``similarity_file`` matches an id from ``train_file``.
    """
    # Training/DF we were given
    movies = pd.read_csv(train_file)

    # Make ids strings so both files are compared on the same representation
    wanted_ids = set(movies["movie"].astype(str).unique())

    # Secondary dataframe with more detail which is able to compute similarities.
    # low_memory=False infers each column's dtype from the whole file, so the id
    # column cannot end up with a mix of ints and strings from chunked parsing.
    meta = pd.read_csv(similarity_file, low_memory=False)

    # Compare on the string form of the ids, otherwise integer-typed ids would
    # never match the string ids built above. drop=True avoids a stray
    # 'index' column; positions then line up with the similarity matrix rows.
    id_matches = meta[filter_col].astype(str).isin(wanted_ids)
    meta = meta[id_matches].reset_index(drop=True)
    if meta.empty:
        raise ValueError(
            "no rows of %r match the movie ids found in %r" % (similarity_file, train_file)
        )

    # Missing descriptions become empty documents (the vectorizer rejects NaN)
    documents = meta[sim_column].fillna("").astype(str)

    # Find tf_idf for items in secondary df.
    # min_df=1 keeps every term; an integer 0 is not a valid value in current
    # scikit-learn releases and 1 is the equivalent setting.
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
    tf_idf = tf.fit_transform(documents)

    # Compute cosine similarity on items in secondary df (TF-IDF rows are
    # L2-normalised by default, so the linear kernel equals cosine similarity)
    cosine_sim = linear_kernel(tf_idf, tf_idf)

    # Find the top n most similar items for every movie that is in both our
    # training dataframe and our secondary (metadata) dataframe
    meta['similar'] = [
        top_sims(cosine_sim, meta, filter_col, movie, n)
        for movie in meta[filter_col].tolist()
    ]
    return meta[[filter_col, 'similar']]

    
    
# Top-20 neighbours per training movie, using the 'genres' column as the
# similarity text and 'id' as the key; the returned frame is the cell's output.
find_similarities('../data/training.csv', '../the-movies-dataset/movies_metadata.csv', 'id', 'genres', 20)
    

Unnamed: 0,id,similar
0,862,"[179, 525, 743, 1411, 1446, 107, 871, 1260, 13..."
1,949,"[75, 82, 630, 681, 730, 947, 957, 1145, 1278, ..."
2,710,"[100, 293, 299, 314, 416, 505, 506, 507, 594, ..."
3,1408,"[356, 1265, 1696, 591, 106, 1627, 1658, 190, 2..."
4,524,"[51, 77, 109, 116, 203, 209, 214, 386, 702, 74..."
5,5,"[626, 240, 472, 278, 833, 163, 237, 277, 430, ..."
6,451,"[18, 24, 31, 76, 79, 81, 128, 153, 155, 160, 2..."
7,902,"[1216, 1499, 1726, 814, 1544, 685, 1649, 447, ..."
8,63,"[390, 456, 1161, 804, 328, 1208, 797, 306, 17,..."
9,687,"[16, 22, 42, 63, 101, 104, 133, 136, 144, 173,..."


  interactivity=interactivity, compiler=compiler, result=result)


(1736,)

<1736x105 sparse matrix of type '<class 'numpy.float64'>'
	with 22366 stored elements in Compressed Sparse Row format>

(1736, 1736)

0        [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
5        [{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...
9        [{'id': 12, 'name': 'Adventure'}, {'id': 28, '...
14       [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...
15       [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...
17       [{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...
24       [{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...
28       [{'id': 14, 'name': 'Fantasy'}, {'id': 878, 'n...
31       [{'id': 878, 'name': 'Science Fiction'}, {'id'...
35                           [{'id': 18, 'name': 'Drama'}]
44       [{'id': 14, 'name': 'Fantasy'}, {'id': 18, 'na...
46       [{'id': 80, 'name': 'Crime'}, {'id': 9648, 'na...
61       [{'id': 10402, 'name': 'Music'}, {'id': 18, 'n...
69       [{'id': 27, 'name': 'Horror'}, {'id': 28, 'nam...
81       [{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...
88       [{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...
95                           [{'id': 18, 'name': 'Drama'