In [118]:
%matplotlib inline
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet


In [119]:
import numpy as np 
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from ast import literal_eval
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

import warnings; warnings.simplefilter('ignore')

def weighted_rating(x, m, C):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

def filter_keywords(x, s):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

cols = ['id', 'movieId', 'title', 'genres', 'description', 'keywords', \
        'cast', 'director', 'spoken_languages', 'production_companies',\
        'production_countries', 'popularity', 'year', 'vote_average',\
         'vote_count', 'wr']

def read_dataset(metadata_path, links_small_path, credits_path,keywords_path):
    meta = pd.read_csv(metadata_path)
    links_small = pd.read_csv(links_small_path)
    credits = pd.read_csv(credits_path)
    keywords = pd.read_csv(keywords_path)
    return meta, links_small, credits, keywords

def cal_weighted_rating(meta, percentile=0.95):
    vote_counts = meta[meta['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = meta[meta['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    meta = meta[(meta['vote_count'].notnull()) \
                     & (meta['vote_average'].notnull())]
    meta['vote_count'] = meta['vote_count'].astype('int')
    meta['vote_average'] = meta['vote_average'].astype('int')
    meta['wr'] = meta.apply(lambda x: weighted_rating(x, m, C), axis=1)

    return meta
    
def movie_feature(metadata_path, links_small_path, credits_path,keywords_path, \
                  percentile=0.95, more_weight_on = None, \
                    stemmer = SnowballStemmer('english'), cols=cols):
    # read dataset
    meta, links_small, credits, keywords = read_dataset(metadata_path, \
        links_small_path, credits_path,keywords_path)

    # change type + drop
    links_small = links_small[links_small['tmdbId'].notnull()]
    links_small['tmdbId'] = links_small['tmdbId'].astype('int')

    meta = meta.drop([19730, 29503, 35587])
    meta['popularity'] = meta[meta['popularity'].notnull()]['popularity'].astype('float')
    meta['id'] = meta['id'].astype('int')
    meta['year'] = pd.to_datetime(meta['release_date'], errors='coerce').apply(\
        lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
    meta = meta[meta.production_companies.notnull()]
    keywords['id'] = keywords['id'].astype('int')
    credits['id'] = credits['id'].astype('int')

    # calcualte weighted rating for movies
    meta = cal_weighted_rating(meta)

    # merge meta + link small => create a smaller dataset for recommend
    smd = meta[meta['id'].isin(links_small['tmdbId'])]

    # create description feature
    smd['tagline'] = smd['tagline'].fillna('')
    smd['description'] = smd['overview'] + smd['tagline']
    smd['description'] = smd['description'].fillna('')
    
    # merge credit + keywords + links_small
    smd = smd.merge(credits, on='id')
    smd = smd.merge(keywords, on='id')
    smd = smd.merge(links_small, left_on='id', right_on='tmdbId')

    # feature engineering
    literal_features = ['cast', 'spoken_languages', 'genres', 'keywords',\
                        'production_companies', 'production_countries']
    for fearture in literal_features:
        smd[fearture] = smd[fearture].apply(literal_eval)
        smd[fearture] = smd[fearture].apply(lambda x: [i['name'] \
                                            for i in x] if isinstance(x, list) else [])
    smd['crew'] = smd['crew'].apply(literal_eval)
    smd['director'] = smd['crew'].apply(get_director)

    # top 3 actors
    smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)
   
    # Strip Spaces and Convert to Lowercase 
    smd['cast'] = smd['cast'].apply(lambda x: [str.lower(\
        i.replace(" ", "")) for i in x])
    smd['director'] = smd['director'].astype('str').apply(\
        lambda x: str.lower(x.replace(" ", "")))
    if more_weight_on:
        smd[more_weight_on] = smd[more_weight_on].apply(lambda x: [x,x,x])
    
    # choose keywords appear more than once + stemming
    s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
    s = s.value_counts()
    s = s[s > 1]
    smd['keywords'] = smd['keywords'].apply(lambda x: filter_keywords(x, s))
    smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
    smd['keywords'] = smd['keywords'].apply(lambda x: \
                                            [str.lower(i.replace(" ", "")) for i in x])

    return smd[cols]
    


In [120]:
credits_ = r'/kaggle/input/the-movies-dataset/credits.csv'
keywords = r'/kaggle/input/the-movies-dataset/keywords.csv'
links = r'/kaggle/input/the-movies-dataset/links_small.csv'
movies_metadata = r'/kaggle/input/the-movies-dataset/movies_metadata.csv'
smd = movie_feature(movies_metadata, links, credits_, keywords)
# print(smd.columns)

In [122]:
smd["description"] = smd["description"] + " " + \
    smd["title"] + " " + \
    smd['keywords'].apply(lambda x: " ".join(map(str, x))) + " " + \
    smd['production_companies'].apply(lambda x: " ".join(map(str, x))) + " " + \
    smd['genres'].apply(lambda x: " ".join(map(str, x))) + " " + \
    smd['cast'].apply(lambda x: " ".join(map(str, x))) + " " + \
    smd['director'].apply(lambda x: " ".join(map(str, x))) + " " + \
    smd['spoken_languages'].apply(lambda x: " ".join(map(str, x))) + " " + \
    smd['production_countries'].apply(lambda x: " ".join(map(str, x)))

In [123]:
smd

Unnamed: 0,id,movieId,title,genres,description,keywords,cast,director,spoken_languages,production_companies,production_countries,popularity,year,vote_average,vote_count,wr
0,862,1,Toy Story,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...","[jealousi, toy, boy, friendship, friend, rival...","[tomhanks, timallen, donrickles]",johnlasseter,[English],[Pixar Animation Studios],[United States of America],21.946943,1995,7,5415,6.869770
1,8844,2,Jumanji,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,"[boardgam, disappear, basedonchildren'sbook, n...","[robinwilliams, jonathanhyde, kirstendunst]",joejohnston,"[English, Français]","[TriStar Pictures, Teitler Film, Interscope Co...",[United States of America],17.015539,1995,6,2413,5.884891
2,15602,3,Grumpier Old Men,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,"[fish, bestfriend, duringcreditssting]","[waltermatthau, jacklemmon, ann-margret]",howarddeutch,[English],"[Warner Bros., Lancaster Gate]",[United States of America],11.712900,1995,6,92,5.376968
3,31357,4,Waiting to Exhale,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...","[basedonnovel, interracialrelationship, single...","[whitneyhouston, angelabassett, lorettadevine]",forestwhitaker,[English],[Twentieth Century Fox Film Corporation],[United States of America],3.859495,1995,6,34,5.299755
4,11862,5,Father of the Bride Part II,[Comedy],Just when George Banks has recovered from his ...,"[babi, midlifecrisi, confid, age, daughter, mo...","[stevemartin, dianekeaton, martinshort]",charlesshyer,[English],"[Sandollar Productions, Touchstone Pictures]",[United States of America],8.387519,1995,5,173,5.175099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9214,159550,161944,The Last Brickmaker in America,[Drama],A man must cope with the loss of his wife and ...,[friendship],"[sidneypoitier, wendycrewson, jayo.sanders]",greggchampion,[],[Nasser Entertainment],[United States of America],0.038998,2001,7,1,5.248931
9215,392572,162542,Rustom,"[Thriller, Romance]","Rustom Pavri, an honourable officer of the Ind...",[bollywood],"[akshaykumar, ileanad'cruz, eshagupta]",tinusureshdesai,[हिन्दी],[KriArj Entertainment],[India],7.333139,2016,7,25,5.340490
9216,402672,162672,Mohenjo Daro,"[Adventure, Drama, History, Romance]","Village lad Sarman is drawn to big, bad Mohenj...",[bollywood],"[hrithikroshan, poojahegde, kabirbedi]",ashutoshgowariker,[हिन्दी],"[UTV Motion Pictures, Ashutosh Gowariker Produ...",[India],1.423358,2016,6,26,5.287576
9217,315011,163056,Shin Godzilla,"[Action, Adventure, Drama, Horror, Science Fic...",From the mind behind Evangelion comes a hit la...,"[monster, godzilla, giantmonst, destruct, kaiju]","[hirokihasegawa, yutakatakenouchi, satomiishih...",hideakianno,"[Italiano, Deutsch, English, 日本語]","[Cine Bazar, Toho Pictures]",[Japan],9.285519,2016,6,152,5.440760


In [124]:
smd["description"][0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences. Toy Story jealousi toy boy friendship friend rivalri boynextdoor newtoy toycomestolif Pixar Animation Studios Animation Comedy Family tomhanks timallen donrickles j o h n l a s s e t e r English United States of America"

In [125]:
smd = smd[['id', 'movieId', 'title', 'description']]
smd = smd.drop_duplicates()

In [126]:
smd

Unnamed: 0,id,movieId,title,description
0,862,1,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,2,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,3,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,31357,4,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,11862,5,Father of the Bride Part II,Just when George Banks has recovered from his ...
...,...,...,...,...
9214,159550,161944,The Last Brickmaker in America,A man must cope with the loss of his wife and ...
9215,392572,162542,Rustom,"Rustom Pavri, an honourable officer of the Ind..."
9216,402672,162672,Mohenjo Daro,"Village lad Sarman is drawn to big, bad Mohenj..."
9217,315011,163056,Shin Godzilla,From the mind behind Evangelion comes a hit la...


### TF-IDF

In [127]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])
tfidf_matrix

<9082x410059 sparse matrix of type '<class 'numpy.float64'>'
	with 955202 stored elements in Compressed Sparse Row format>

In [128]:
tfidf_matrix.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [129]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [130]:
cosine_sim.shape

(9082, 9082)

In [131]:
smd = smd.reset_index()
smd = smd.drop(columns='index')
movie_ids = smd['movieId']
indices = pd.Series(smd.index, index=smd['movieId'])

## Evaluate

In [132]:
train_df = pd.read_csv('/kaggle/input/train-set-from-ratings/train_set.csv')

In [133]:
train_df

Unnamed: 0,userId,movieId,rating,timestamp
0,431,2863,4.0,1165548515
1,571,7173,2.0,1334343358
2,77,223,4.5,1163004353
3,580,1032,4.0,1165291033
4,624,1221,5.0,1019124147
...,...,...,...,...
74998,547,5810,3.0,1415444349
74999,418,1835,4.0,1132180632
75000,5,33679,4.0,1163374517
75001,358,905,5.0,957479957


In [134]:
def get_top_n_similar(recommended, movie, cosine_sim, n_rec = 10):
    try:
        idx = indices[movie]
    except:
        # In case movie is not available in cosine_sim (or not in metadata)
        pass
    else:
        # top 10 most similar
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:n_rec+1]

        for movie_rec in sim_scores:
            # change index -> movieId
            movie_recommended = (movie_ids.iloc[movie_rec[0]], movie_rec[1])
            recommended.append(movie_recommended)
    return recommended

In [135]:
def get_recommendations(user_id, cosine_sim, n_rec = 10):
    movies_id = train_df[train_df.userId == user_id].movieId 
    recommended = []
    for movie in movies_id:
        recommended = get_top_n_similar(recommended, movie, cosine_sim, n_rec = 10)
            
    # create a DataFrame to store all movies recommended            
    recommended = pd.DataFrame(recommended)  
    recommended.columns = ['movieId', 'sim']
    
    recommended = recommended[~recommended['movieId'].isin(movies_id)].groupby('movieId').max().reset_index()
    recommended = recommended.sort_values('sim')[::-1].head(n_rec)
    recommended['userId'] = user_id
    return recommended.drop(columns='sim')

In [136]:
def full_recommend(df, cosine_sim):
    rec_df = pd.DataFrame()
    for user in df.userId.value_counts().index:
        rec_df = pd.concat([rec_df, get_recommendations(user, cosine_sim)])
    return rec_df

### Example

In [137]:
train_df

Unnamed: 0,userId,movieId,rating,timestamp
0,431,2863,4.0,1165548515
1,571,7173,2.0,1334343358
2,77,223,4.5,1163004353
3,580,1032,4.0,1165291033
4,624,1221,5.0,1019124147
...,...,...,...,...
74998,547,5810,3.0,1415444349
74999,418,1835,4.0,1132180632
75000,5,33679,4.0,1163374517
75001,358,905,5.0,957479957


In [138]:
smd[smd.title == 'The Dark Knight']

Unnamed: 0,id,movieId,title,description
6897,155,58559,The Dark Knight,Batman raises the stakes in his war on crime. ...


In [139]:
recommend_for_film = get_top_n_similar([], 58559, cosine_sim)
recommend_for_film

[(91529, 0.26710650034043243),
 (33794, 0.23100508678860393),
 (99813, 0.16888459847053983),
 (79274, 0.16348267245186485),
 (1377, 0.162766976281607),
 (153, 0.14269233115257812),
 (98124, 0.13763066230051446),
 (592, 0.1335024599374849),
 (3213, 0.13139099796677545),
 (136864, 0.1205529189193184)]

In [140]:
smd[smd["movieId"].isin(pd.DataFrame(get_top_n_similar([], 58559, cosine_sim))[0])]

Unnamed: 0,id,movieId,title,description
132,414,153,Batman Forever,The Dark Knight of Gotham City confronts a das...
524,268,592,Batman,The Dark Knight of Gotham City begins his war ...
1113,364,1377,Batman Returns,"Having defeated the Joker, Batman now faces th..."
2578,14919,3213,Batman: Mask of the Phantasm,An old flame of Bruce Wayne's strolls into tow...
6141,272,33794,Batman Begins,"Driven by tragedy, billionaire Bruce Wayne ded..."
7561,40662,79274,Batman: Under the Red Hood,Batman faces his ultimate challenge as the mys...
7926,49026,91529,The Dark Knight Rises,Following the death of District Attorney Harve...
8160,123025,98124,"Batman: The Dark Knight Returns, Part 1",Batman has not been seen for ten years. A new ...
8222,142061,99813,"Batman: The Dark Knight Returns, Part 2",Batman has stopped the reign of terror that Th...
8905,209112,136864,Batman v Superman: Dawn of Justice,Fearing the actions of a god-like Super Hero l...


In [141]:
train_df[train_df['movieId'] == 58559]

Unnamed: 0,userId,movieId,rating,timestamp
514,106,58559,5.0,1216958340
747,91,58559,5.0,1448798116
836,630,58559,3.0,1443808760
1596,62,58559,5.0,1475948734
1819,622,58559,5.0,1424224970
...,...,...,...,...
68393,355,58559,5.0,1231028576
68412,601,58559,4.5,1270254671
70937,379,58559,4.0,1378179986
71698,46,58559,5.0,1366390187


In [142]:
for user in train_df[train_df['movieId'] == 58559]['userId']:
    print(user, ': ', train_df[train_df['userId'] == user].shape[0])

106 :  34
91 :  113
630 :  17
62 :  40
622 :  23
473 :  57
136 :  37
3 :  38
457 :  535
273 :  69
478 :  56
362 :  89
244 :  55
546 :  49
176 :  192
547 :  1793
203 :  28
566 :  17
94 :  147
600 :  26
423 :  260
213 :  683
454 :  20
384 :  364
152 :  164
250 :  116
352 :  63
524 :  48
572 :  79
426 :  191
97 :  96
570 :  96
212 :  657
632 :  29
483 :  89
392 :  19
456 :  50
125 :  157
275 :  152
388 :  594
402 :  239
26 :  129
563 :  119
42 :  53
481 :  327
574 :  256
623 :  77
365 :  52
277 :  43
580 :  691
378 :  110
138 :  61
104 :  57
101 :  41
316 :  105
450 :  90
199 :  316
660 :  69
479 :  74
56 :  392
61 :  124
624 :  1301
542 :  47
380 :  797
503 :  33
255 :  109
298 :  56
583 :  15
515 :  25
287 :  190
314 :  25
72 :  143
615 :  289
297 :  104
226 :  27
324 :  113
13 :  40
270 :  172
149 :  173
31 :  52
186 :  32
521 :  37
355 :  231
601 :  35
379 :  53
46 :  29
147 :  28


Choose user 583, 630 to take example due to the small number of films they watched

In [143]:
get_recommendations(583, cosine_sim)

Unnamed: 0,movieId,userId
27,2115,583
81,33794,583
17,1291,583
93,59615,583
122,99813,583
19,1377,583
110,79274,583
6,153,583
36,3213,583
119,98124,583


In [157]:
smd[smd.movieId.isin(get_recommendations(583, cosine_sim).movieId)]

Unnamed: 0,id,movieId,title,description
132,414,153,Batman Forever,The Dark Knight of Gotham City confronts a das...
1041,89,1291,Indiana Jones and the Last Crusade,When Dr. Henry Jones Sr. suddenly goes missing...
1113,364,1377,Batman Returns,"Having defeated the Joker, Batman now faces th..."
1671,87,2115,Indiana Jones and the Temple of Doom,"After arriving in India, Indiana Jones is aske..."
2578,14919,3213,Batman: Mask of the Phantasm,An old flame of Bruce Wayne's strolls into tow...
6141,272,33794,Batman Begins,"Driven by tragedy, billionaire Bruce Wayne ded..."
6940,217,59615,Indiana Jones and the Kingdom of the Crystal S...,"Set during the Cold War, the Soviets – led by ..."
7561,40662,79274,Batman: Under the Red Hood,Batman faces his ultimate challenge as the mys...
8160,123025,98124,"Batman: The Dark Knight Returns, Part 1",Batman has not been seen for ten years. A new ...
8222,142061,99813,"Batman: The Dark Knight Returns, Part 2",Batman has stopped the reign of terror that Th...


In [158]:
smd[smd.movieId.isin(get_recommendations(630, cosine_sim).movieId)]

Unnamed: 0,id,movieId,title,description
1569,165,2011,Back to the Future Part II,Marty and Doc are at it again in this wacky se...
2501,863,3114,Toy Story 2,"Andy heads off to Cowboy Camp, leaving his toy..."
4595,604,6365,The Matrix Reloaded,Six months after the events depicted in The Ma...
4872,605,6934,The Matrix Revolutions,The human city of Zion defends itself against ...
6141,272,33794,Batman Begins,"Driven by tragedy, billionaire Bruce Wayne ded..."
7531,10193,78499,Toy Story 3,"Woody, Buzz, and the rest of Andy's toys haven..."
7561,40662,79274,Batman: Under the Red Hood,Batman faces his ultimate challenge as the mys...
7926,49026,91529,The Dark Knight Rises,Following the death of District Attorney Harve...
8222,142061,99813,"Batman: The Dark Knight Returns, Part 2",Batman has stopped the reign of terror that Th...
8407,213121,106022,Toy Story of Terror!,What starts out as a fun road trip for the Toy...


In [145]:
recommended_df = full_recommend(train_df, cosine_sim)

In [146]:
recommended_df

Unnamed: 0,movieId,userId
3716,50742,547
2444,5675,547
2687,6564,547
3059,8360,547
4196,74510,547
...,...,...
43,2457,484
19,852,484
32,1632,484
116,35836,484


In [147]:
import pandas as pd
from sklearn.model_selection import train_test_split

def split_data(rating_df):
    train_df, test_df = train_test_split(rating_df, \
                                         test_size=0.2, stratify=rating_df['userId'], \
                                        random_state=42)
    return train_df, test_df

def check_movieId(pred_df, val_df):
    result = pred_df['movieId'].isin(val_df[val_df['userId'] == \
                                   pred_df['userId'].iloc[0]]['movieId'])
    return result.reset_index(drop=True)

def evaluate(pred_df, val_df):
    """ Proportion of movies recommended that were actually watched by users

    Args:
    pred_df: dataframe, 2 columns: userId, movieId
    val_df: dataframe, 2 columns: userId, movieId

    Returns:
    
    """
    result = pred_df.groupby('userId').apply(lambda x: check_movieId(x, val_df))
    result = result.groupby('userId').sum() / result.groupby('userId').count()

    return result.mean().mean()


    

In [148]:
testset = pd.read_csv(r'/kaggle/input/train-set-from-ratings/test_set.csv')

In [149]:
testset = testset[['userId', 'movieId']]
testset

Unnamed: 0,userId,movieId
0,302,593
1,191,110
2,457,5337
3,239,1042
4,292,5377
...,...,...
24996,353,34319
24997,428,743
24998,595,1097
24999,664,61248


In [150]:
evaluate(recommended_df, testset)

0.09090909090909091

### Count Vectorize

In [151]:
count_vec = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_vec_matrix = count_vec.fit_transform(smd['description'])
count_vec_matrix

<9082x410059 sparse matrix of type '<class 'numpy.int64'>'
	with 955202 stored elements in Compressed Sparse Row format>

In [152]:
count_cosine_sim = cosine_similarity(count_vec_matrix, count_vec_matrix)
count_cosine_sim

array([[1.        , 0.05971646, 0.07332599, ..., 0.        , 0.00720619,
        0.0714361 ],
       [0.05971646, 1.        , 0.07170234, ..., 0.01553378, 0.02818649,
        0.0698543 ],
       [0.07332599, 0.07170234, 1.        , ..., 0.01695463, 0.01538234,
        0.06238128],
       ...,
       [0.        , 0.01553378, 0.01695463, ..., 1.        , 0.02999222,
        0.        ],
       [0.00720619, 0.02818649, 0.01538234, ..., 0.02999222, 1.        ,
        0.01226117],
       [0.0714361 , 0.0698543 , 0.06238128, ..., 0.        , 0.01226117,
        1.        ]])

In [153]:
smd[smd["movieId"].isin(pd.DataFrame(get_top_n_similar([], 58559, count_cosine_sim))[0])]

Unnamed: 0,id,movieId,title,description
132,414,153,Batman Forever,The Dark Knight of Gotham City confronts a das...
524,268,592,Batman,The Dark Knight of Gotham City begins his war ...
1113,364,1377,Batman Returns,"Having defeated the Joker, Batman now faces th..."
1239,415,1562,Batman & Robin,Along with crime-fighting partner Robin and ne...
2578,14919,3213,Batman: Mask of the Phantasm,An old flame of Bruce Wayne's strolls into tow...
6141,272,33794,Batman Begins,"Driven by tragedy, billionaire Bruce Wayne ded..."
7561,40662,79274,Batman: Under the Red Hood,Batman faces his ultimate challenge as the mys...
7926,49026,91529,The Dark Knight Rises,Following the death of District Attorney Harve...
8160,123025,98124,"Batman: The Dark Knight Returns, Part 1",Batman has not been seen for ten years. A new ...
8222,142061,99813,"Batman: The Dark Knight Returns, Part 2",Batman has stopped the reign of terror that Th...


In [154]:
# compare with cosine_sim (tf-idf)
cosine_sim

array([[1.00000000e+00, 6.48360418e-03, 3.37907959e-03, ...,
        0.00000000e+00, 1.62479128e-04, 6.40625307e-03],
       [6.48360418e-03, 1.00000000e+00, 1.27211815e-02, ...,
        1.47788912e-03, 2.84763270e-03, 5.54217334e-03],
       [3.37907959e-03, 1.27211815e-02, 1.00000000e+00, ...,
        1.96174884e-03, 1.69264945e-03, 1.84567702e-03],
       ...,
       [0.00000000e+00, 1.47788912e-03, 1.96174884e-03, ...,
        1.00000000e+00, 4.25637962e-03, 0.00000000e+00],
       [1.62479128e-04, 2.84763270e-03, 1.69264945e-03, ...,
        4.25637962e-03, 1.00000000e+00, 1.15004003e-03],
       [6.40625307e-03, 5.54217334e-03, 1.84567702e-03, ...,
        0.00000000e+00, 1.15004003e-03, 1.00000000e+00]])

In [155]:
count_vectorized_recommended_df = full_recommend(train_df, count_cosine_sim)

In [156]:
evaluate(count_vectorized_recommended_df, testset)

0.08867362146050671

#### By the way, find title for movie suggested by collaborative filtering

SVDpp

In [159]:
rec_583 = [926, 3462, 7502, 905, 904, 527, 1172, 1254, 969, 968]

smd[smd.movieId.isin(rec_583)]

Unnamed: 0,id,movieId,title,description
472,424,527,Schindler's List,The true story of how businessman Oskar Schind...
722,567,904,Rear Window,"Professional photographer L.B. ""Jeff"" Jeffries..."
723,3078,905,It Happened One Night,Ellie Andrews has just tied the knot with soci...
743,705,926,All About Eve,From the moment she glimpses her idol at the s...
782,10331,968,Night of the Living Dead,A group of people try to survive an attack of ...
783,488,969,The African Queen,"At the start of the first World War, in the mi..."
927,11216,1172,Cinema Paradiso,"A filmmaker recalls his childhood, when he fel..."
1004,3090,1254,The Treasure of the Sierra Madre,"Fred C. Dobbs and Bob Curtin, both down on the..."
2759,3082,3462,Modern Times,The Tramp struggles to live in modern industri...


In [160]:
rec_630=[3462, 318, 969, 926, 922, 905, 1148, 2019, 1197, 527]
smd[smd.movieId.isin(rec_630)]

Unnamed: 0,id,movieId,title,description
284,278,318,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
472,424,527,Schindler's List,The true story of how businessman Oskar Schind...
723,3078,905,It Happened One Night,Ellie Andrews has just tied the knot with soci...
740,599,922,Sunset Boulevard,A hack screenwriter writes a screenplay for a ...
743,705,926,All About Eve,From the moment she glimpses her idol at the s...
783,488,969,The African Queen,"At the start of the first World War, in the mi..."
912,531,1148,The Wrong Trousers,Gromit finds himself being pushed out of his r...
950,2493,1197,The Princess Bride,"In this enchantingly cracked fairy tale, the b..."
1577,346,2019,Seven Samurai,A samurai answers a village's request for prot...
2759,3082,3462,Modern Times,The Tramp struggles to live in modern industri...


BaselineOnly

In [161]:
rec_583_base = [858, 1221, 50, 912, 527, 1193, 926, 908, 3462, 904]
smd[smd.movieId.isin(rec_583_base)]

Unnamed: 0,id,movieId,title,description
48,629,50,The Usual Suspects,"Held in an L.A. interrogation room, Verbal Kin..."
472,424,527,Schindler's List,The true story of how businessman Oskar Schind...
692,238,858,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
722,567,904,Rear Window,"Professional photographer L.B. ""Jeff"" Jeffries..."
726,213,908,North by Northwest,Advertising man Roger Thornhill is mistaken fo...
730,289,912,Casablanca,"In Casablanca, Morocco in December 1941, a cyn..."
743,705,926,All About Eve,From the moment she glimpses her idol at the s...
947,510,1193,One Flew Over the Cuckoo's Nest,While serving time for insanity at a state men...
973,240,1221,The Godfather: Part II,In the continuing saga of the Corleone crime f...
2759,3082,3462,Modern Times,The Tramp struggles to live in modern industri...


In [162]:
rec_630_base = [318, 50, 527, 1193, 926, 3462, 608, 296, 913, 1228]
smd[smd.movieId.isin(rec_630_base)]

Unnamed: 0,id,movieId,title,description
48,629,50,The Usual Suspects,"Held in an L.A. interrogation room, Verbal Kin..."
266,680,296,Pulp Fiction,"A burger-loving hit man, his philosophical par..."
284,278,318,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
472,424,527,Schindler's List,The true story of how businessman Oskar Schind...
535,275,608,Fargo,"Jerry, a small-town Minnesota car salesman is ..."
731,963,913,The Maltese Falcon,A private detective takes on a case that invol...
743,705,926,All About Eve,From the moment she glimpses her idol at the s...
947,510,1193,One Flew Over the Cuckoo's Nest,While serving time for insanity at a state men...
980,1578,1228,Raging Bull,When Jake LaMotta steps into a boxing ring and...
2759,3082,3462,Modern Times,The Tramp struggles to live in modern industri...
