# Content-Based Filtering for Text Data

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
df = pd.read_csv('Dataset/netflix_titles.csv')
df['description'] = df['description'].fillna('')
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [3]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['description'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim, num_recommend = 10):
    indices = pd.Series(df.index, index=df['title']).drop_duplicates()
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_similar = sim_scores[1:num_recommend+1]
    movie_indices = [i[0] for i in top_similar]
    ret_df = pd.DataFrame(df['title'].iloc[movie_indices])
    ret_df['score'] = np.array(top_similar)[:,1]
    return ret_df

In [21]:
get_recommendations('Power Rangers Zeo', num_recommend = 20)

title
Dick Johnson Is Dead        0
Blood & Water               1
Ganglands                   2
Jailbirds New Orleans       3
Kota Factory                4
                         ... 
Zodiac                   8802
Zombie Dumb              8803
Zombieland               8804
Zoom                     8805
Zubaan                   8806
Length: 8807, dtype: int64


Unnamed: 0,title,score
7771,Power Rangers RPM,0.180172
7773,Power Rangers Samurai,0.160981
7763,Power Rangers Dino Thunder,0.139407
8183,The Adventures of Sharkboy and Lavagirl,0.129872
7765,Power Rangers Jungle Fury,0.127489
7781,Power Rangers Super Samurai: Trickster Treat,0.120123
719,Power Rangers Dino Fury,0.118934
3946,Possessed,0.11833
7764,Power Rangers in Space,0.117821
7780,Power Rangers Super Samurai: Stuck on Christmas,0.117008


# Books Data

Dataset URL: https://www.kaggle.com/datasets/arashnic/book-recommendation-dataset

In [14]:
DATASET_DIR = 'Dataset/books/'
books = pd.read_csv(os.path.join(DATASET_DIR, 'Books.csv'))
display(books.head())

ratings = pd.read_csv(os.path.join(DATASET_DIR, 'Ratings.csv'))
display(ratings.head())

users = pd.read_csv(os.path.join(DATASET_DIR, 'Users.csv'))
display(users.head())

  books = pd.read_csv(os.path.join(DATASET_DIR, 'Books.csv'))


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [15]:
books_with_ratings = ratings.merge(books, on='ISBN')
num_rating_df = books_with_ratings.groupby('Book-Title').count()['Book-Rating'].reset_index()
num_rating_df.rename(columns={'Book-Rating': 'num_ratings'}, inplace=True)
num_rating_df

Unnamed: 0,Book-Title,num_ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241066,Ã?Â?lpiraten.,2
241067,Ã?Â?rger mit Produkt X. Roman.,4
241068,Ã?Â?sterlich leben.,1
241069,Ã?Â?stlich der Berge.,3


In [51]:
avg_rating_df = books_with_ratings.groupby('Book-Title')['Book-Rating'].mean().reset_index()
avg_rating_df.rename(columns={'Book-Rating': 'avg_rating'}, inplace=True)
avg_rating_df

Unnamed: 0,Book-Title,avg_rating
0,A Light in the Storm: The Civil War Diary of ...,2.250000
1,Always Have Popsicles,0.000000
2,Apple Magic (The Collector's series),0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,0.000000
...,...,...
241066,Ã?Â?lpiraten.,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,5.250000
241068,Ã?Â?sterlich leben.,7.000000
241069,Ã?Â?stlich der Berge.,2.666667


## Content-Based Filtering

In [16]:
books_with_ratings['text'] = books_with_ratings['Book-Title'] + ', ' + books_with_ratings['Book-Author'] + ', ' + books_with_ratings['Publisher']
books_with_ratings.dropna(subset=['text'], inplace=True)
books_with_ratings['text'] = books_with_ratings['text'].str.lower()

In [17]:
x = books_with_ratings.copy()
x = x.sample(10000)
x = x.drop_duplicates(subset=['text']).reset_index(drop=True)
x['text'].duplicated().sum()

0

In [6]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(x['text']).toarray()
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [18]:
def get_recommendations(title, cosine_sim=cosine_sim, num_recommend = 10):
    indices = pd.Series(x.index, index=x['Book-Title']).drop_duplicates()
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_similar = sim_scores[1:num_recommend+1]
    movie_indices = [i[0] for i in top_similar]
    ret_df = pd.DataFrame(x['Book-Title'].iloc[movie_indices])
    ret_df['score'] = np.array(top_similar)[:,1]
    return ret_df

In [14]:
x['Book-Title']

0                          The Ladies of Missalonghi
1       White Oleander : A Novel (Oprah's Book Club)
2                           The Pillars of the Earth
3                                      Thale's Folly
4                   Jimmy Spoon and the Pony Express
                            ...                     
8610                          No Telephone to Heaven
8611              I Never Promised You a Rose Garden
8612                        Enchantment (Heartspell)
8613                    The School of Night: A Novel
8614                   See Jane Score (Avon Romance)
Name: Book-Title, Length: 8615, dtype: object

In [19]:
get_recommendations('No Telephone to Heaven')

Unnamed: 0,Book-Title,score
4484,Heaven on Earth (Diamond),0.275034
8450,Cheet (Plume Books),0.265605
1601,Fire of Heaven (Book 3 of the Fire of Heaven T...,0.262774
1383,The Bookman's Promise : A Cliff Janeway Novel ...,0.251544
7348,The Blind Side of the Heart: A Novel,0.245819
4662,Standing in the Shadows,0.241086
4871,Jazz (Plume Contemporary Fiction),0.233592
8501,Halfway Heaven: Diary of a Harvard Murder,0.233117
934,Beloved (Plume Contemporary Fiction),0.225262
7260,And Heaven Too,0.196199


## Collaborative Filtering

In [56]:
x = books_with_ratings.groupby('User-ID').count()['Book-Rating'] > 200
active_users = x[x].index
filtered_rating = books_with_ratings[books_with_ratings['User-ID'].isin(active_users)]
y = filtered_rating.groupby('Book-Title').count()['Book-Rating'] >= 50
famous_books = y[y].index

final_ratings = filtered_rating[filtered_rating['Book-Title'].isin(famous_books)]

In [57]:
pt = final_ratings.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating')
pt.fillna(0, inplace=True)  # Replace missing values with 0
pt

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
similarity_scores = cosine_similarity(pt)
similarity_scores

array([[1.        , 0.10255025, 0.01220856, ..., 0.12110367, 0.07347567,
        0.04316046],
       [0.10255025, 1.        , 0.2364573 , ..., 0.07446129, 0.16773875,
        0.14263397],
       [0.01220856, 0.2364573 , 1.        , ..., 0.04558758, 0.04938579,
        0.10796119],
       ...,
       [0.12110367, 0.07446129, 0.04558758, ..., 1.        , 0.07085128,
        0.0196177 ],
       [0.07347567, 0.16773875, 0.04938579, ..., 0.07085128, 1.        ,
        0.10602962],
       [0.04316046, 0.14263397, 0.10796119, ..., 0.0196177 , 0.10602962,
        1.        ]])

In [60]:
similarity_scores.shape

(706, 706)

In [None]:
def recommend(book_name):
    # Find the index of the book
    index = np.where(pt.index == book_name)[0][0]
    # Get the most similar books
    similar_items = sorted(list(enumerate(similarity_scores[index])), key=lambda x: x[1], reverse=True)[1:10]
    
    data = []
    for i in similar_items:
        item = []
        temp_df = books[books['Book-Title'] == pt.index[i[0]]]
        item.extend(list(temp_df.drop_duplicates('Book-Title')['Book-Title'].values))
        item.extend(list(temp_df.drop_duplicates('Book-Title')['Book-Author'].values))
        item.extend(list(temp_df.drop_duplicates('Book-Title')['Image-URL-M'].values))
        
        data.append(item)
    return data

In [65]:
recommend('1984')

[['Animal Farm',
  'George Orwell',
  'http://images.amazon.com/images/P/0451526341.01.MZZZZZZZ.jpg'],
 ["The Handmaid's Tale",
  'Margaret Atwood',
  'http://images.amazon.com/images/P/0449212602.01.MZZZZZZZ.jpg'],
 ['Brave New World',
  'Aldous Huxley',
  'http://images.amazon.com/images/P/0060809833.01.MZZZZZZZ.jpg'],
 ['The Vampire Lestat (Vampire Chronicles, Book II)',
  'ANNE RICE',
  'http://images.amazon.com/images/P/0345313860.01.MZZZZZZZ.jpg'],
 ['The Hours : A Novel',
  'Michael Cunningham',
  'http://images.amazon.com/images/P/0312243022.01.MZZZZZZZ.jpg'],
 ['Fahrenheit 451',
  'Ray Bradbury',
  'http://images.amazon.com/images/P/3257208626.01.MZZZZZZZ.jpg'],
 ['The Catcher in the Rye',
  'J.D. Salinger',
  'http://images.amazon.com/images/P/0316769487.01.MZZZZZZZ.jpg'],
 ['Naked',
  'David Sedaris',
  'http://images.amazon.com/images/P/0316777730.01.MZZZZZZZ.jpg'],
 ['The Hundred Secret Senses',
  'Amy Tan',
  'http://images.amazon.com/images/P/0399141146.01.MZZZZZZZ.jpg']

# Movie Data
Dataset URL: https://www.kaggle.com/datasets/akshaypawar7/millions-of-movies/code
## Using numerical data only

In [96]:
from datetime import datetime as dt
from sklearn.neighbors import NearestNeighbors

In [3]:
df = pd.read_csv('./Dataset/movies.csv')
df = df[(df['original_language']=='en')].reset_index(drop=True)
df = df.drop(columns=['revenue','poster_path','backdrop_path','original_language'])
df.head()

Unnamed: 0,id,title,genres,overview,popularity,production_companies,release_date,budget,runtime,status,tagline,vote_average,vote_count,credits,keywords,recommendations
0,615656,Meg 2: The Trench,Action-Science Fiction-Horror,An exploratory dive into the deepest depths of...,8763.998,Apelles Entertainment-Warner Bros. Pictures-di...,2023-08-02,129000000.0,116.0,Released,Back for seconds.,7.079,1365.0,Jason Statham-Wu Jing-Shuya Sophia Cai-Sergio ...,based on novel or book-sequel-kaiju,1006462-298618-569094-1061181-346698-1076487-6...
1,758323,The Pope's Exorcist,Horror-Mystery-Thriller,Father Gabriele Amorth Chief Exorcist of the V...,5953.227,Screen Gems-2.0 Entertainment-Jesus & Mary-Wor...,2023-04-05,18000000.0,103.0,Released,Inspired by the actual files of Father Gabriel...,7.433,545.0,Russell Crowe-Daniel Zovatto-Alex Essoe-Franco...,spain-rome italy-vatican-pope-pig-possession-c...,713704-296271-502356-1076605-1084225-1008005-9...
2,533535,Deadpool & Wolverine,Action-Comedy-Science Fiction,A listless Wade Wilson toils away in civilian ...,5410.496,Marvel Studios-Maximum Effort-21 Laps Entertai...,2024-07-24,200000000.0,128.0,Released,Come together.,7.765,3749.0,Ryan Reynolds-Hugh Jackman-Emma Corrin-Matthew...,hero-superhero-anti hero-mutant-breaking the f...,573435-519182-957452-1022789-945961-718821-103...
3,667538,Transformers: Rise of the Beasts,Action-Adventure-Science Fiction,When a new threat capable of destroying the en...,5409.104,Skydance-Paramount-di Bonaventura Pictures-Bay...,2023-06-06,200000000.0,127.0,Released,Unite or fall.,7.34,1007.0,Anthony Ramos-Dominique Fishback-Luna Lauren V...,peru-alien-end of the world-based on cartoon-b...,496450-569094-298618-385687-877100-598331-4628...
4,693134,Dune: Part Two,Science Fiction-Adventure,Follow the mythic journey of Paul Atreides as ...,4742.163,Legendary Pictures,2024-02-27,190000000.0,167.0,Released,Long live the fighters.,8.3,2770.0,Timothée Chalamet-Zendaya-Rebecca Ferguson-Jav...,epic-based on novel or book-fight-sandstorm-sa...,438631-763215-792307-1011985-467244-634492-359...


In [62]:
cols_in = ['id','title','genres','popularity','release_date','runtime','vote_average','credits']
x = df[cols_in].copy()
x = x[~x['genres'].isna()]
x = x.sample(10000).reset_index(drop=True)
x.head()

Unnamed: 0,id,title,genres,popularity,release_date,runtime,vote_average,credits
0,526073,The Pegasus Project,Thriller,1.101,2015-08-16,0.0,0.0,Flynn Falcone-Aaron Wolf
1,318952,Mm.. Food Drive Tour,Documentary-Music,0.6,2007-06-01,64.0,6.0,MF DOOM
2,715909,"Metallica: Live in Melbourne, Australia - Marc...",Music,1.459,2020-06-15,138.0,0.0,James Hetfield-Lars Ulrich-Kirk Hammett-Robert...
3,357002,West of El Dorado,Western,1.377,1949-06-04,58.0,0.0,Johnny Mack Brown-Max Terhune-Reno Browne-Tedd...
4,579498,All You Need,Drama,1.121,2001-03-04,105.0,2.0,Kayren Butler-Amy Raymond-Kellie Martin-Janet ...


In [63]:
x['months_released'] = (dt.now() - pd.to_datetime(x['release_date'])).dt.days//365
x.describe()

Unnamed: 0,id,popularity,runtime,vote_average,months_released
count,10000.0,10000.0,9901.0,10000.0,9646.0
mean,473513.1998,3.231925,59.855166,3.175018,27.858594
std,269833.932995,26.617956,49.581532,3.280838,29.900667
min,167.0,0.001,0.0,0.0,-3.0
25%,259943.25,0.6,13.0,0.0,7.0
50%,465577.0,0.871,66.0,3.0,14.0
75%,692523.25,1.90425,90.0,6.0,38.0
max,967945.0,1811.36,999.0,10.0,137.0


In [64]:
x.loc[~x['genres'].isna(),'action'] = df.loc[~df['genres'].isna(),'genres'].apply(lambda x: 1 if 'action' in str(x).lower() else 0)
x.loc[~x['genres'].isna(),'scifi'] = df.loc[~df['genres'].isna(),'genres'].apply(lambda x: 1 if 'science fiction' in str(x).lower() else 0)
x.loc[~x['genres'].isna(),'horror'] = df.loc[~df['genres'].isna(),'genres'].apply(lambda x: 1 if 'horror' in str(x).lower() else 0)
x.loc[~x['genres'].isna(),'comedy'] = df.loc[~df['genres'].isna(),'genres'].apply(lambda x: 1 if 'comedy' in str(x).lower() else 0)
x.loc[~x['genres'].isna(),'drama'] = df.loc[~df['genres'].isna(),'genres'].apply(lambda x: 1 if 'drama' in str(x).lower() else 0)
x.loc[~x['genres'].isna(),'thriller'] = df.loc[~df['genres'].isna(),'genres'].apply(lambda x: 1 if 'thriller' in str(x).lower() else 0)
x.loc[~x['genres'].isna(),'animation'] = df.loc[~df['genres'].isna(),'genres'].apply(lambda x: 1 if 'animation' in str(x).lower() else 0)
x.loc[~x['genres'].isna(),'history'] = df.loc[~df['genres'].isna(),'genres'].apply(lambda x: 1 if 'history' in str(x).lower() else 0)
x.loc[~x['genres'].isna(),'crime'] = df.loc[~df['genres'].isna(),'genres'].apply(lambda x: 1 if 'crime' in str(x).lower() else 0)

x = x.drop(columns=['genres','release_date','credits'])

x.head()

Unnamed: 0,id,title,popularity,runtime,vote_average,months_released,action,scifi,horror,comedy,drama,thriller,animation,history,crime
0,526073,The Pegasus Project,1.101,0.0,0.0,9.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,318952,Mm.. Food Drive Tour,0.6,64.0,6.0,17.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,715909,"Metallica: Live in Melbourne, Australia - Marc...",1.459,138.0,0.0,4.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,357002,West of El Dorado,1.377,58.0,0.0,75.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,579498,All You Need,1.121,105.0,2.0,23.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
x = x.dropna().reset_index(drop=True)

### Content-based

In [66]:
ftrs_in = np.array(x[x.columns[2:]].copy())
ftrs_in.shape

(9499, 13)

In [80]:
cosine_sim = cosine_similarity(ftrs_in, np.array([ftrs_in[1231]]))
cosine_sim

array([[0.22780267],
       [0.99650798],
       [0.97678267],
       ...,
       [0.98019636],
       [0.494821  ],
       [0.98836594]])

In [81]:
x_ret = x.copy()
x_ret['similarity'] = cosine_sim.ravel()

In [82]:
x_ret.loc[x_ret['similarity'].sort_values(ascending=False).head(10).index]

Unnamed: 0,id,title,popularity,runtime,vote_average,months_released,action,scifi,horror,comedy,drama,thriller,animation,history,crime,similarity
1231,16077,My Little Eye,7.214,95.0,5.4,22.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1777,2577,Code 46,7.439,93.0,6.2,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.999879
7232,451555,The Sex Substitute,6.957,95.0,5.6,23.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.999841
778,10449,When a Man Loves a Woman,10.322,126.0,6.5,30.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99984
7602,9776,Head of State,7.806,95.0,5.5,21.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.999826
7287,28660,The Mangler Reborn,5.406,84.0,4.8,19.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.999806
6124,16992,The Boxer,8.494,114.0,6.7,26.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.999795
7877,14242,American Movie,8.632,107.0,7.685,25.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.99979
461,10534,White Squall,9.31,129.0,6.293,28.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.999761
9386,29135,No Such Thing,6.736,102.0,5.9,22.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.999744


In [84]:
x.loc[1231]

id                         16077
title              My Little Eye
popularity                 7.214
runtime                     95.0
vote_average                 5.4
months_released             22.0
action                       0.0
scifi                        0.0
horror                       0.0
comedy                       0.0
drama                        0.0
thriller                     0.0
animation                    1.0
history                      0.0
crime                        0.0
Name: 1231, dtype: object

In [92]:
df.loc[df['id'] == 16077,'recommendations'].values[0].split('-')

['10065',
 '13788',
 '9378',
 '7978',
 '10145',
 '11096',
 '4970',
 '28355',
 '924',
 '9358',
 '565',
 '170',
 '747',
 '176']

In [90]:
x_ret.loc[x_ret['similarity'].sort_values(ascending=False).head(10).index,'id'].values

array([ 16077,   2577, 451555,  10449,   9776,  28660,  16992,  14242,
        10534,  29135], dtype=int64)

### Using KNN

In [93]:
ftrs_in = np.array(x[x.columns[2:]].copy())
ftrs_in.shape

(9499, 13)

In [97]:
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(ftrs_in)

In [100]:
target_user_index = 1231
distances, indices = knn.kneighbors(ftrs_in[target_user_index].reshape(1,-1),n_neighbors=10)

In [108]:
x.loc[indices.ravel()]

Unnamed: 0,id,title,popularity,runtime,vote_average,months_released,action,scifi,horror,comedy,drama,thriller,animation,history,crime
1231,16077,My Little Eye,7.214,95.0,5.4,22.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1777,2577,Code 46,7.439,93.0,6.2,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7232,451555,The Sex Substitute,6.957,95.0,5.6,23.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
778,10449,When a Man Loves a Woman,10.322,126.0,6.5,30.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7602,9776,Head of State,7.806,95.0,5.5,21.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7287,28660,The Mangler Reborn,5.406,84.0,4.8,19.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6124,16992,The Boxer,8.494,114.0,6.7,26.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
7877,14242,American Movie,8.632,107.0,7.685,25.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
461,10534,White Squall,9.31,129.0,6.293,28.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9386,29135,No Such Thing,6.736,102.0,5.9,22.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
