In [None]:
%matplotlib inline
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.corpus import wordnet


In [88]:

from get_movie_features import movie_feature
    
credits_ = r'the-movies-dataset/credits.csv'
keywords = r'the-movies-dataset/keywords.csv'
links = r'the-movies-dataset/links_small.csv'
movies_metadata = r'the-movies-dataset/movies_metadata.csv'
smd = movie_feature(movies_metadata, links, credits_, keywords, more_weight_on='director')
print(smd.columns)


Index(['id', 'movieId', 'title', 'genres', 'description', 'keywords', 'cast',
       'director', 'spoken_languages', 'production_companies',
       'production_countries', 'popularity', 'year', 'vote_average',
       'vote_count', 'wr'],
      dtype='object')


In [89]:
smd = smd[['id', 'movieId', 'title', 'description']]
smd = smd.drop_duplicates()

In [90]:
smd

Unnamed: 0,id,movieId,title,description
0,862,1,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,2,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,3,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,31357,4,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,11862,5,Father of the Bride Part II,Just when George Banks has recovered from his ...
...,...,...,...,...
9214,159550,161944,The Last Brickmaker in America,A man must cope with the loss of his wife and ...
9215,392572,162542,Rustom,"Rustom Pavri, an honourable officer of the Ind..."
9216,402672,162672,Mohenjo Daro,"Village lad Sarman is drawn to big, bad Mohenj..."
9217,315011,163056,Shin Godzilla,From the mind behind Evangelion comes a hit la...


### TF-IDF

In [91]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])
tfidf_matrix

<9082x268124 sparse matrix of type '<class 'numpy.float64'>'
	with 539409 stored elements in Compressed Sparse Row format>

In [92]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [93]:
cosine_sim.shape

(9082, 9082)

## Evaluate

In [97]:
smd = smd.reset_index()
smd = smd.drop(columns='index')
movie_ids = smd['movieId']
indices = pd.Series(smd.index, index=smd['movieId'])

In [101]:
train_df = pd.read_csv('train-set-from-ratings/train_set.csv')

In [26]:
train_df

Unnamed: 0,userId,movieId,rating,timestamp
0,431,2863,4.0,1165548515
1,571,7173,2.0,1334343358
2,77,223,4.5,1163004353
3,580,1032,4.0,1165291033
4,624,1221,5.0,1019124147
...,...,...,...,...
74998,547,5810,3.0,1415444349
74999,418,1835,4.0,1132180632
75000,5,33679,4.0,1163374517
75001,358,905,5.0,957479957


In [117]:
def get_recommendations(user_id, cosine_sim, n_rec = 10):
    movies_id = train_df[train_df.userId == user_id].movieId 
    recommended = []
    for movie in movies_id:
        try:
            idx = indices[movie]
        except:
            # In case movie is not available in cosine_sim (or not in metadata)
            print("Something went wrong at movieId " + str(movie))
        else:
            # top 10 most similar
            sim_scores = list(enumerate(cosine_sim[idx]))
            sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
            sim_scores = sim_scores[1:11]
        
            for movie_rec in sim_scores:
                # change index -> movieId
                movie_recommended = (movie_ids.iloc[movie_rec[0]], movie_rec[1])
                recommended.append(movie_recommended)
    # create a DataFrame to store all movies recommended            
    recommended = pd.DataFrame(recommended)  
    recommended.columns = ['movieId', 'sim']
    recommended = recommended.sort_values('sim')[::-1].head(n_rec)
    recommended['userId'] = user_id
    return recommended.drop(columns='sim')

In [124]:
def full_recommend(df, cosine_sim):
    rec_df = pd.DataFrame()
    for user in df.userId.value_counts().index:
        rec_df = pd.concat([rec_df, get_recommendations(user, cosine_sim)])
    return rec_df

In [125]:
recommended_df = full_recommend(train_df, cosine_sim)

Something went wrong at movieId 96075
Something went wrong at movieId 7669
Something went wrong at movieId 26649
Something went wrong at movieId 150856
Something went wrong at movieId 720
Something went wrong at movieId 2851
Something went wrong at movieId 4051
Something went wrong at movieId 720
Something went wrong at movieId 55207
Something went wrong at movieId 108583
Something went wrong at movieId 7502
Something went wrong at movieId 73759
Something went wrong at movieId 27611
Something went wrong at movieId 77359
Something went wrong at movieId 100450
Something went wrong at movieId 94466
Something went wrong at movieId 7502
Something went wrong at movieId 27611
Something went wrong at movieId 720
Something went wrong at movieId 2851
Something went wrong at movieId 27611
Something went wrong at movieId 106642
Something went wrong at movieId 31193
Something went wrong at movieId 108979
Something went wrong at movieId 4207
Something went wrong at movieId 7669
Something went wrong 

In [126]:
recommended_df

Unnamed: 0,movieId,userId
720,7566,547
15310,7566,547
14480,26712,547
15530,26712,547
5770,50742,547
...,...,...
30,40629,484
143,3174,484
80,2815,484
60,193,484


In [127]:
from evaluate_model import evaluate

In [128]:
testset = pd.read_csv(r'/kaggle/input/train-set-from-ratings/test_set.csv')

In [129]:
testset = testset[['userId', 'movieId']]
testset

Unnamed: 0,userId,movieId
0,302,593
1,191,110
2,457,5337
3,239,1042
4,292,5377
...,...,...
24996,353,34319
24997,428,743
24998,595,1097
24999,664,61248


In [130]:
evaluate(recommended_df, testset)

0.047690014903129664

### Count Vectorize

In [136]:
count_vec = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_vec_matrix = count_vec.fit_transform(smd['description'])
count_vec_matrix

<9082x268124 sparse matrix of type '<class 'numpy.int64'>'
	with 539409 stored elements in Compressed Sparse Row format>

In [137]:
count_cosine_sim = cosine_similarity(count_vec_matrix, count_vec_matrix)
count_cosine_sim

array([[1.        , 0.01206221, 0.        , ..., 0.        , 0.        ,
        0.01031366],
       [0.01206221, 1.        , 0.02509626, ..., 0.        , 0.01123879,
        0.01965608],
       [0.        , 0.02509626, 1.        , ..., 0.01451717, 0.01226925,
        0.        ],
       ...,
       [0.        , 0.        , 0.01451717, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.01123879, 0.01226925, ..., 0.        , 1.        ,
        0.00960961],
       [0.01031366, 0.01965608, 0.        , ..., 0.        , 0.00960961,
        1.        ]])

In [138]:
# compare with cosine_sim (tf-idf)
cosine_sim

array([[1.        , 0.00680287, 0.        , ..., 0.        , 0.        ,
        0.00477797],
       [0.00680287, 1.        , 0.01530676, ..., 0.        , 0.00175212,
        0.00368102],
       [0.        , 0.01530676, 1.        , ..., 0.00192698, 0.00221235,
        0.        ],
       ...,
       [0.        , 0.        , 0.00192698, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.00175212, 0.00221235, ..., 0.        , 1.        ,
        0.00146391],
       [0.00477797, 0.00368102, 0.        , ..., 0.        , 0.00146391,
        1.        ]])

In [139]:
count_vectorized_recommended_df = full_recommend(train_df, count_cosine_sim)

Something went wrong at movieId 96075
Something went wrong at movieId 7669
Something went wrong at movieId 26649
Something went wrong at movieId 150856
Something went wrong at movieId 720
Something went wrong at movieId 2851
Something went wrong at movieId 4051
Something went wrong at movieId 720
Something went wrong at movieId 55207
Something went wrong at movieId 108583
Something went wrong at movieId 7502
Something went wrong at movieId 73759
Something went wrong at movieId 27611
Something went wrong at movieId 77359
Something went wrong at movieId 100450
Something went wrong at movieId 94466
Something went wrong at movieId 7502
Something went wrong at movieId 27611
Something went wrong at movieId 720
Something went wrong at movieId 2851
Something went wrong at movieId 27611
Something went wrong at movieId 106642
Something went wrong at movieId 31193
Something went wrong at movieId 108979
Something went wrong at movieId 4207
Something went wrong at movieId 7669
Something went wrong 

In [140]:
evaluate(count_vectorized_recommended_df, testset)

0.027123695976154993