In [1]:
import pandas as pd
import numpy as np
from functools import reduce
import json
import psycopg2
import re
import sqlalchemy
import io
#import msgpack
#import msgpack_numpy as m

In [2]:
# load the data
df = pd.read_csv('./data/raw-data/netflix_list.csv')

In [3]:
df['type'].value_counts()

movie           2923
tvSeries        2199
tvEpisode        785
tvSpecial        391
tvMiniSeries     318
tvMovie          161
short            112
video            110
tvShort            6
videoGame          1
Name: type, dtype: int64

In [4]:
df = df[df['type'].isin(['movie', 'tvSeries', 'tvSpecial', 'tvMiniSeries', 'tvMovie'])]
df = df[(df['plot'] != '-') & (df['summary'] != '-') & (df['cast'] != '-')]
searchable_titles = df[(df['rating'] >= 5) & (df['numVotes'] > df['numVotes'].quantile(0.75))]['title']

df = df[(df['rating'] >= 5) & (df['numVotes'] > df['numVotes'].quantile(0.25))]

# convert columns "director, listed_in, cast and country" in columns that contain a real list
# the strip function is applied on the elements
# if the value is NaN, the new column contains a empty list []
#df['categories'] = df['genres'].apply(lambda x: x.split(","))
#df['actors'] = df['cast'].apply(lambda x: x.split(","))
#df['countries'] = df['orign_country'].apply(lambda l: [] if pd.isna(l) else [i.strip() for i in l.split(",")])
df['summary'].fillna(value="i", inplace=True)
df['plot'].fillna(value="i", inplace=True)
df['genres'].fillna(value="i", inplace=True)
df['cast'].fillna(value="i", inplace=True)

df['genres'] = df['genres'].replace(0, 'i')
df['cast'] = df['cast'].replace(0, 'i')

df = df.reset_index()
df.head()

Unnamed: 0,index,imdb_id,title,popular_rank,certificate,startYear,endYear,episodes,runtime,type,orign_country,language,plot,summary,rating,numVotes,genres,isAdult,cast,image_url
0,0,tt4052886,Lucifer,1,15,2016.0,,93.0,42,tvSeries,United States,English,Lucifer Morningstar has decided he's had enoug...,"Lucifer Morningstar, bored from his sulking li...",8.1,250884.0,"Crime,Drama,Fantasy",0,"['Tom Ellis', 'Lauren German', 'Lesley-Ann Bra...",https://m.media-amazon.com/images/M/MV5BNzY1Yj...
1,1,tt0993840,Army of the Dead,2,18,2021.0,,,148,movie,United States,English,"Following a zombie outbreak in Las Vegas, a gr...","With the abandoned, walled city of Las Vegas o...",5.8,110780.0,"Action,Crime,Horror",0,"['Dave Bautista', 'Ella Purnell', 'Ana de la R...",https://m.media-amazon.com/images/M/MV5BNGY0Nz...
2,2,tt7255502,The Kominsky Method,3,18,2018.0,2021.0,22.0,30,tvSeries,United States,English,"An aging actor, who long ago enjoyed a brush w...",Michael Douglas plays an actor who made it big...,8.2,28795.0,"Comedy,Drama",0,"['Michael Douglas', 'Sarah Baker', 'Graham Rog...",https://m.media-amazon.com/images/M/MV5BMzA0YT...
3,3,tt0108778,Friends,4,13+,1994.0,2004.0,235.0,22,tvSeries,United States,English,Follows the personal and professional lives of...,"Ross Geller, Rachel Green, Monica Geller, Joey...",8.9,861843.0,"Comedy,Romance",0,"['Jennifer Aniston', 'Courteney Cox', 'Lisa Ku...",https://m.media-amazon.com/images/M/MV5BNDVkYj...
4,4,tt9251798,Ragnarok,5,18,2020.0,,12.0,45,tvSeries,Norway,Norwegian,A small Norwegian town experiencing warm winte...,In the small fictional town of Edda coming of ...,7.5,26606.0,"Action,Drama,Fantasy",0,"['David Stakston', 'Jonas Strand Gravli', 'Her...",https://m.media-amazon.com/images/M/MV5BODM3NT...


## Uding TFIDF to find similar movies based on description

- Filter out keywords that occur only once.
- Convert every word to its stem so that words such as Dogs and Dog are considered the same.

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import nltk

stemmer = SnowballStemmer('english')
wordnet_lemmatizer = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['netflix', 'academy', 'award', 'wiki', 'asianwiki', 'yahoo.com', 'mydramalist', 'vogel', 'jlvogel', 'comcast.net', 'u.washington.edu'])

def tokenize(text):
    
    # Tokenize by sentence, then by word
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]

    tokens = [t for t in tokens if t.lower() not in stopwords]
    
    # Filter out raw tokens to remove noise
    filtered_tokens = [token for token in tokens if re.search('[a-zA-Z]', token)]
    filtered_tokens = [token for token in tokens if ((token[0] != '—') & (token[0] != '.'))]

    return filtered_tokens

# Define a function to perform both stemming and tokenization
def tokenize_and_stem(text):
    
    filtered_tokens = tokenize(text)

    # Stem the filtered_tokens
    stems = [wordnet_lemmatizer.lemmatize(t) for t in filtered_tokens]
    #stems = [stemmer.stem(t) for t in filtered_tokens]
    
    stems = [s for s in stems if len(s) > 2]

    return stems

In [6]:
# Build the tfidf matrix with the descriptions
vector = TfidfVectorizer(max_df=0.8, # drop words that occur in more than X percent of documents
                         max_features=200000,
                         min_df=3, # only use words that appear at least X times
                         #stop_words='english', # remove stop words
                         lowercase=True, # Convert everything to lower case                          
                         tokenizer=tokenize_and_stem,
                         ngram_range=(1,3)
                        )

tfidf_plot = vector.fit_transform(df['plot'])
tfidf_summary = vector.fit_transform(df['summary'])

In [7]:
cosine_similarities_plot = cosine_similarity(tfidf_plot,tfidf_plot)
cosine_similarities_summary = cosine_similarity(tfidf_summary,tfidf_summary)

In [8]:
# get list of stems and list of original tokens
# stems_list = []
# tokens_list = []
# for d in df['plot'][0:10]:
#     stems = tokenize_and_stem(d)
#     stems_list.extend(stems)

#     filtered_tokens = tokenize(d)
#     tokens_list.extend(filtered_tokens)

# # get a mapping
# stem_to_word_dict = {stems_list[x] : tokens_list[x] for x in range(len(tokens_list))}

In [9]:
titles = df['title']
indices= pd.Series(df.index, index=df['title'])

def get_recommendations(title, cosine_sim, cosine_sim2, mode='prod'):
    idx = indices[title]

    if(cosine_sim2 != ""):
        sim_scores = list(cosine_sim[idx]*2)
        sim_scores2 = list(cosine_sim2[idx])
        sim_scores = [[k,value1 + value2] for k,(value1, value2) in enumerate(zip(sim_scores,sim_scores2))]
    else:
        sim_scores = enumerate(list(cosine_sim[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    #sim_scores = sim_scores[1:6]
    movie_indices = [i[0] for i in sim_scores]
    movie_scores = [i[1] for i in sim_scores]
    movies = df.iloc[movie_indices]
    movies['scores'] = movie_scores
    movies['search_title'] = title

    if(mode == 'dev'):
        return movies[['imdb_id', 'search_title', 'title', 'scores', 'cast', 'genres', 'plot']]
    else:
        return movies[['imdb_id', 'search_title', 'title', 'scores']]


In [10]:
df_words = pd.DataFrame(vector.transform(df['plot']).todense(),
                   columns=vector.get_feature_names(), index=df.index)

In [11]:
df_summary = pd.DataFrame(vector.transform(df['summary']).todense(),
                   columns=vector.get_feature_names(), index=df.index)

### House of Cards

In [12]:
get_recommendations('House of Cards', cosine_similarities_plot, cosine_similarities_summary).head(10)

  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,imdb_id,search_title,title,scores
146,tt1856010,House of Cards,House of Cards,3.0
958,tt1534360,House of Cards,Ezel,0.827538
246,tt1837642,House of Cards,Revenge,0.795818
1262,tt6063050,House of Cards,Acrimony,0.483013
2811,tt11827806,House of Cards,Corazón loco,0.424224
2740,tt0049405,House of Cards,The Killer Is Loose,0.378277
199,tt2820852,House of Cards,Fast & Furious 7,0.364562
1855,tt7530986,House of Cards,Mademoiselle de Joncquières,0.353489
2905,tt0173554,House of Cards,H,0.347388
918,tt8361028,House of Cards,Cam,0.344871


In [13]:
print(df[df['title'] == 'Revenge']['plot'].values[0])
print(df_words.transpose().sort_values(246, ascending=False).head(5).transpose().loc[246, :])
print(df[df['title'] == 'Revenge'].summary.values[0])
print(df_summary.transpose().sort_values(246, ascending=False).head(5).transpose().loc[246, :])

An emotionally troubled young woman sets out to exact revenge against the people who wronged her father.
wronged        0.424208
emotionally    0.424208
exact          0.413937
troubled       0.319636
young woman    0.294495
Name: 246, dtype: float64
As a summer to remember begins in the Hamptons, new arrival Emily Thorne dazzles the members of high society by making herself known in the exclusive social circle of Grayson Global CEO Conrad Grayson and his socialite wife Victoria. But it soon becomes clear that the beguiling young philanthropist has a dark past. Emily was once known as Amanda Clarke, a young eight-year-old (in the summer of 1993) whose life was torn apart when her father - Grayson Global hedge fund manager David Clarke - was falsely accused of channeling money to a terrorist organization responsible for the downing of a commercial airliner. Now living under an assumed identity, she is determined to seek vengeance on the people who destroyed her father's life - the two m

In [14]:
print(df[df['title'] == 'Ezel']['plot'].values[0])
print(df_words.transpose().sort_values(958, ascending=False).head(5).transpose().loc[958, :])
print(df[df['title'] == 'Ezel'].summary.values[0])
print(df_summary.transpose().sort_values(958, ascending=False).head(5).transpose().loc[958, :])

Betrayed by his trusted friends and the woman he loved, Ömer Uçar returns as Ezel to exact his vengeance.
exact        0.455584
trusted      0.423912
betrayed     0.412608
vengeance    0.398773
loved        0.369632
Name: 958, dtype: float64
Omer is a young man, full of beauty and good spirit. He has just returned from military service and is about to marry the girl of his dreams, the stunning Eysan. His two best friends, Cengiz and Ali are like brothers to him, and he trusts them with all his heart. Everything seems perfect, as his whole life lies in front of him. The drama starts when the police storm into Omer's bedroom when he is still sleeping. He gets arrested for a murder and robbery at the casino the night before. Omer gets imprisoned for the crime - his life is shattered, friends and fiancé disappear. Instead, one single question is on his mind for the next ten years: Why? It is obvious that Omer has been set up, but could such a terrible crime really have been planned by his 

In [15]:
print(df[df['title'] == 'House of Cards']['plot'].values[0])
print(df_words.transpose().sort_values(146, ascending=False).head(5).transpose().loc[146, :])
print(df[df['title'] == 'House of Cards'].summary.values[0])
print(df_summary.transpose().sort_values(146, ascending=False).head(5).transpose().loc[146, :])

A Congressman works with his equally conniving wife to exact revenge on the people who betrayed him.
exact       0.505867
betrayed    0.458147
equally     0.458147
revenge     0.346209
wife        0.280688
Name: 146, dtype: float64
Majority House Whip Francis Underwood takes you on a long journey as he exacts his vengeance on those he feels wronged him - that is, his own cabinet members including the President of the United States himself. Dashing, cunning, methodical and vicious, Frank Underwood along with his equally manipulative yet ambiguous wife, Claire, take Washington by storm through climbing the hierarchical ladder to power in this Americanized recreation of the BBC series of the same name. —Jacob Oberfrank
ladder     0.208547
bbc        0.208547
dashing    0.202545
whip       0.202545
wronged    0.202545
Name: 146, dtype: float64


### Narcos

In [16]:
get_recommendations('Narcos', cosine_similarities_plot, cosine_similarities_summary)

  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,imdb_id,search_title,title,scores
110,tt2707408,Narcos,Narcos,3.000000
772,tt1355631,Narcos,The Infiltrator,1.124664
699,tt6692188,Narcos,El Chapo,0.671440
2359,tt7935522,Narcos,Drug Lords,0.643604
2985,tt3845960,Narcos,Cocaine Cowboys: Reloaded,0.527503
...,...,...,...,...
3422,tt4079776,Narcos,Uganda Be Kidding Me Live,0.000000
3423,tt1202160,Narcos,Chevolution,0.000000
3424,tt6680270,Narcos,Tracy Morgan: Staying Alive,0.000000
3425,tt8467788,Narcos,W. Kamau Bell: Private School Negro,0.000000


### Okja

In [17]:
get_recommendations('Okja', cosine_similarities_plot, cosine_similarities_summary).head(10)

  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,imdb_id,search_title,title,scores
602,tt3967856,Okja,Okja,3.0
2277,tt1870529,Okja,Won't Back Down,0.506181
202,tt5727208,Okja,Uncut Gems,0.484567
2577,tt3070014,Okja,Sarah & Duck,0.477981
1485,tt3061050,Okja,Clarence,0.472268
2470,tt4466894,Okja,Sahara,0.437409
665,tt4547056,Okja,The Girl with All the Gifts,0.436329
2338,tt12079212,Okja,The Victims' Game,0.434553
75,tt6474378,Okja,Good Girls,0.427294
2894,tt3263996,Okja,The Overnighters,0.426294


In [18]:
print(df[df['title'] == "Okja"]['plot'].values[0])
print(df_words.transpose().sort_values(602, ascending=False).head(5).transpose().loc[602, :])
print(df[df['title'] == "Okja"].summary.values[0])
print(df_summary.transpose().sort_values(602, ascending=False).head(5).transpose().loc[602, :])

A young girl risks everything to prevent a powerful, multinational company from kidnapping her best friend - a fascinating beast named Okja.
multinational      0.348097
risk everything    0.329894
fascinating        0.306960
kidnapping         0.298774
beast              0.295180
Name: 602, dtype: float64
For 10 idyllic years, young Mija (An Seo Hyun) has been caretaker and constant companion to Okja-a massive animal and an even bigger friend-at her home in the mountains of South Korea. But that changes when a family-owned multinational conglomerate Mirando Corporation takes Okja for themselves and transports her to New York, where image obsessed and self-promoting CEO Lucy Mirando (Tilda Swinton) has big plans for Mija's dearest friend. With no particular plan but single-minded in intent, Mija sets out on a rescue mission, but her already daunting journey quickly becomes more complicated when she crosses paths with disparate groups of capitalists, demonstrators and consumers, each bat

In [19]:
print(df[df['title'] == "Won't Back Down"]['plot'].values[0])
print(df_words.transpose().sort_values(2277, ascending=False).head(5).transpose().loc[2277, :])
print(df[df['title'] == "Won't Back Down"].summary.values[0])
print(df_summary.transpose().sort_values(2277, ascending=False).head(5).transpose().loc[2277, :])

Two determined mothers­, one a teacher, look to transform their children's failing inner city school. Facing a powerful and entrenched bureaucracy, they risk everything to make a difference in the education and future of... Read all
risk everything    0.281301
bureaucracy        0.281301
inner city         0.281301
failing            0.258097
education          0.254766
Name: 2277, dtype: float64
Two determined mothers with children who are failing in an inner city school in Pittsburgh join forces to take back the school, and turn it into a place of learning. But before they can change the school for the better, they must first battle the parents, the school board, and the teachers union. Because this is for their children, they won't back down from this enormous challenge. —Douglas Young (the-movie-guy)
school                 0.410958
child                  0.226558
back school            0.220642
young the-movie-guy    0.214292
the-movie-guy          0.214292
Name: 2277, dtype: float

In [20]:
print(df[df['title'] == "Uncut Gems"]['plot'].values[0])
print(df_words.transpose().sort_values(202, ascending=False).head(5).transpose().loc[202, :])
print(df[df['title'] == "Uncut Gems"].summary.values[0])
print(df_summary.transpose().sort_values(202, ascending=False).head(5).transpose().loc[202, :])

With his debts mounting and angry collectors closing in, a fast-talking New York City jeweler risks everything in hope of staying afloat and alive.
closing            0.307788
collector          0.307788
risk everything    0.307788
angry              0.301331
mounting           0.295738
Name: 202, dtype: float64
Howard Ratner (Adam Sandler) is a once successful New York gems dealer whose gambling addiction has left his family and career in shambles, and him hundreds of thousands in debt. Always looking for the next big bet, Howard thinks he finally hit it big when he discovers a rare uncut rock of Ethiopian gems, with a very interested high-profile buyer. But the closer Howard gets to finally winning big, the more he is forced to realize he can't keep running from the consequences of his actions.
howard     0.449337
gem        0.328967
big        0.290531
finally    0.202396
bet        0.164484
Name: 202, dtype: float64


In [21]:
print(df[df['title'] == "Clarence"]['plot'].values[0])
print(df_words.transpose().sort_values(1485, ascending=False).head(5).transpose().loc[1485, :])
print(df[df['title'] == "Clarence"].summary.values[0])
print(df_summary.transpose().sort_values(1485, ascending=False).head(5).transpose().loc[1485, :])

The adventures of a 4th grader named Clarence along with his two best friends, Sumo and Jeff.
clarence           0.378039
along two          0.378039
4th                0.378039
two best friend    0.324474
two best           0.316958
Name: 1485, dtype: float64
The adventures of a 4th grader named Clarence along with his two best friends, Sumo and Jeff.
clarence           0.378039
along two          0.378039
4th                0.378039
two best friend    0.324474
two best           0.316958
Name: 1485, dtype: float64


In [22]:
get_recommendations('Sweet Tooth', cosine_similarities_plot, cosine_similarities_summary).head(10)

  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,imdb_id,search_title,title,scores
7,tt12809988,Sweet Tooth,Sweet Tooth,3.0
1984,tt11829316,Sweet Tooth,Vampires,1.113196
1099,tt10482560,Sweet Tooth,Kipo and the Age of Wonderbeasts,0.861105
2109,tt2459156,Sweet Tooth,Copenhagen,0.67796
2072,tt5228026,Sweet Tooth,The Characters,0.622848
1429,tt0416044,Sweet Tooth,Mongol,0.537138
41,tt4786824,Sweet Tooth,The Crown,0.515246
1476,tt0791205,Sweet Tooth,Ergo Proxy,0.491397
2304,tt1920885,Sweet Tooth,Da yu hai tang,0.486789
2034,tt7299298,Sweet Tooth,Svaha: The Sixth Finger,0.440582


In [23]:
print(df[df['title'] == "Sweet Tooth"]['plot'].values[0])
print(df_words.transpose().sort_values(7, ascending=False).head(5).transpose().loc[7, :])
print(df[df['title'] == "Sweet Tooth"].summary.values[0])
print(df_summary.transpose().sort_values(7, ascending=False).head(5).transpose().loc[7, :])

A boy who is half human and half deer survives in a post-apocalyptic world with other hybrids.
half                      0.591350
post-apocalyptic world    0.360051
deer                      0.341223
hybrid                    0.341223
post-apocalyptic          0.317501
Name: 7, dtype: float64
A boy who is half human and half deer survives in a post-apocalyptic world with other hybrids.
half                      0.591350
post-apocalyptic world    0.360051
deer                      0.341223
hybrid                    0.341223
post-apocalyptic          0.317501
Name: 7, dtype: float64


In [24]:
print(df[df['title'] == "Kipo and the Age of Wonderbeasts"]['plot'].values[0])
print(df_words.transpose().sort_values(1099, ascending=False).head(5).transpose().loc[1099, :])
print(df[df['title'] == "Kipo and the Age of Wonderbeasts"].summary.values[0])
print(df_summary.transpose().sort_values(1099, ascending=False).head(5).transpose().loc[1099, :])

A girl explores the possibilities in a post-apocalyptic world.
post-apocalyptic world    0.554201
post-apocalyptic          0.488707
possibility               0.469953
explores                  0.358291
girl                      0.252336
Name: 1099, dtype: float64
A girl explores the possibilities in a post-apocalyptic world.
post-apocalyptic world    0.554201
post-apocalyptic          0.488707
possibility               0.469953
explores                  0.358291
girl                      0.252336
Name: 1099, dtype: float64


In [25]:
print(df[df['title'] == "Copenhagen"]['plot'].values[0])
print(df_words.transpose().sort_values(2109, ascending=False).head(5).transpose().loc[2109, :])
print(df[df['title'] == "Copenhagen"].summary.values[0])
print(df_summary.transpose().sort_values(2109, ascending=False).head(5).transpose().loc[2109, :])

When the girl of your dreams is half your age, it's time to grow up.
girl dream    0.517582
half          0.458104
grow          0.438539
age           0.347917
dream         0.300519
Name: 2109, dtype: float64
After weeks of traveling through Europe the immature William finds himself at a crossroads in Copenhagen. Not just another European city, Copenhagen is also the birthplace of his father. When the youthful Effy befriends the older William they set off on an adventure to find William's grandfather. Effy's mix of youthful exuberance and wisdom challenges William unlike any woman ever has. As the attraction builds and William truly connects with someone for the first time in his life, he must come to grips with destabilizing elements of his family's sordid past. —Fidelio Films
william            0.626772
copenhagen         0.271014
come grip          0.146048
immature           0.146048
first time life    0.141845
Name: 2109, dtype: float64


### Using Count Vectorizer to find similar movies based on list of actors, directors, genres combined

In [26]:
df['actors'] = df['cast'].apply(lambda l: [] if pd.isna(l) else [re.sub('[^A-Za-z0-9]+', '', i) for i in l.split(",")])
#give high rating to top 3 actors
df['actors'] = df['actors'].apply(lambda x: [val + ',' + val if i<=2 else val for i,val in enumerate(x)])
df['actors'] = df['actors'].apply(lambda x: [val.split(',') for val in x])
df['actors'] = df['actors'].apply(lambda x: [item for sublist in x for item in sublist])
df['actors'] = df['actors'].apply(lambda x: ' '.join(x))

df['categories'] = df['genres'].apply(lambda l: [] if pd.isna(l) else [re.sub('[^A-Za-z0-9]+', '', i) for i in l.split(",")])
df['categories'] = df['categories'].apply(lambda x: ' '.join(x))

In [27]:
def countVectorizer(col):
    count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
    count_matrix = count.fit_transform(df[col])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    return cosine_sim

In [28]:
cosine_sim_actors = countVectorizer('actors')
cosine_sim_categories = countVectorizer('categories')

### Combine each description, actors, genres similarity score to derive a total score

In [29]:
def improved_recommendations(title, mode):
    df_actors = get_recommendations(title, cosine_sim_actors, "", 'actors').rename(columns={'scores': 'actor_score'})
    df_categories = get_recommendations(title, cosine_sim_categories, "", 'categories').rename(columns={'scores': 'category_score'})
    df_description = get_recommendations(title, cosine_similarities_plot, cosine_similarities_summary, mode).rename(columns={'scores': 'description_score'})
    data_frames = [df_actors, df_categories, df_description]
    df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['search_title', 'title', 'imdb_id'], how='outer'), data_frames)
    df_merged['scores'] = df_merged['actor_score'] + df_merged['category_score']*0.25 + df_merged['description_score']
    df_merged = df_merged.sort_values('scores', ascending=False).reset_index(drop=True).head(10)
    
    # extract image url
    #for i,row in df_merged.iterrows():
        #image = df_photosAll[(df_photosAll['show_id'] == row.show_id)]['url']
        #df_merged.loc[i, 'image'] = image
        
    return df_merged


In [30]:
test1 = improved_recommendations('House of Cards', 'dev')
test1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,imdb_id,search_title,title,actor_score,category_score,description_score,cast,genres,plot,scores
0,tt1856010,House of Cards,House of Cards,1.0,1.0,3.0,"['Kevin Spacey', 'Michel Gill', 'Robin Wright'...",Drama,A Congressman works with his equally conniving...,4.25
1,tt1534360,House of Cards,Ezel,0.0,0.447214,0.827538,"['Kenan Imirzalioglu', 'Cansu Dere', 'Yigit Öz...","Crime,Drama,Thriller",Betrayed by his trusted friends and the woman ...,0.939342
2,tt1837642,House of Cards,Revenge,0.0,0.447214,0.795818,"['Madeleine Stowe', 'Emily VanCamp', 'Gabriel ...","Drama,Mystery,Thriller",An emotionally troubled young woman sets out t...,0.907621
3,tt7530986,House of Cards,Mademoiselle de Joncquières,0.0,0.57735,0.353489,"['Cécile de France', 'Edouard Baer', 'Alice Is...","Drama,Romance","Fooled by a notorious libertine, a widow plans...",0.497826
4,tt2199618,House of Cards,The Politician's Husband,0.0,1.0,0.247035,"['David Tennant', 'Emily Watson', 'Jack Shephe...",Drama,Political drama series about a marriage betwee...,0.497035
5,tt11506054,House of Cards,Uma Maheswara Ugra Roopasya,0.0,1.0,0.245113,"['Satyadev Kancharana', 'V.K. Naresh', 'Suhas'...",Drama,After getting beaten up and insulted by the to...,0.495113
6,tt0049405,House of Cards,The Killer Is Loose,0.0,0.447214,0.378277,"['Joseph Cotten', 'Rhonda Fleming', 'Wendell C...","Crime,Drama,Film-Noir","An unhinged, deceptively mild-mannered bank ro...",0.490081
7,tt6063050,House of Cards,Acrimony,0.0,0.0,0.483013,"['Taraji P. Henson', 'Lyriq Bent', 'Crystle St...",Thriller,"A faithful wife, tired of standing by her devi...",0.483013
8,tt8361028,House of Cards,Cam,0.0,0.447214,0.344871,"['Madeline Brewer', 'Patch Darragh', 'Melora W...","Drama,Horror,Mystery","Alice, an ambitious camgirl, wakes up one day ...",0.456675
9,tt0266697,House of Cards,Kill Bill: Vol. 1,0.0,0.447214,0.340671,"['Uma Thurman', 'David Carradine', 'Daryl Hann...","Action,Crime,Drama","After awakening from a four-year coma, a forme...",0.452474


In [31]:
test2 = improved_recommendations('Okja', 'dev')
test2.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,imdb_id,search_title,title,actor_score,category_score,description_score,cast,genres,plot,scores
0,tt3967856,Okja,Okja,1.0,1.0,3.0,"['Tilda Swinton', 'Paul Dano', 'Seo-hyun Ahn',...","Action,Adventure,Drama",A young girl risks everything to prevent a pow...,4.25
1,tt4547056,Okja,The Girl with All the Gifts,0.0,1.0,0.436329,"['Sennia Nanua', 'Fisayo Akinade', 'Dominique ...","Action,Adventure,Drama",A scientist and a teacher living in a dystopia...,0.686329
2,tt1870529,Okja,Won't Back Down,0.0,0.447214,0.506181,"['Viola Davis', 'Maggie Gyllenhaal', 'Holly Hu...",Drama,"Two determined mothers­, one a teacher, look t...",0.617984
3,tt5727208,Okja,Uncut Gems,0.0,0.2,0.484567,"['Adam Sandler', 'Julia Fox', 'Idina Menzel', ...","Crime,Drama,Thriller",With his debts mounting and angry collectors c...,0.534567
4,tt3061050,Okja,Clarence,0.0,0.2,0.472268,"['Spencer Rothbell', 'Katie Crown', 'Tom Kenny...","Adventure,Animation,Comedy",The adventures of a 4th grader named Clarence ...,0.522268
5,tt12079212,Okja,The Victims' Game,0.0,0.258199,0.434553,"['Hsiao-chuan Chang', 'Tsai-Hsing Chang', 'Wei...","Drama,Thriller",After discovering his estranged daughter's lin...,0.499103
6,tt3263996,Okja,The Overnighters,0.0,0.258199,0.426294,"['Jay Reinke', 'Andrea Reinke', 'Alan Mezo', '...","Documentary,Drama","Broken, desperate men chase their dreams and r...",0.490844
7,tt4466894,Okja,Sahara,0.0,0.2,0.437409,"['Omar Sy', 'Louane Emera', 'Franck Gastambide...","Adventure,Animation,Comedy",A young cobra and his scorpion best friend go ...,0.487409
8,tt1657507,Okja,Colombiana,0.0,0.4,0.379117,"['Zoe Saldana', 'Michael Vartan', 'Callum Blue...","Action,Drama,Thriller",A young girl in Bogotá witnesses her parents' ...,0.479117
9,tt3070014,Okja,Sarah & Duck,0.0,0.0,0.477981,"['Tasha Lawrence', 'Roger Allam', 'Andy Nyman'...","Animation,Family",Sarah is a young British girl whose best frien...,0.477981


In [32]:
test3 = improved_recommendations('Lucifer', 'dev')
test3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,imdb_id,search_title,title,actor_score,category_score,description_score,cast,genres,plot,scores
0,tt4052886,Lucifer,Lucifer,1.0,1.0,3.0,"['Tom Ellis', 'Lauren German', 'Lesley-Ann Bra...","Crime,Drama,Fantasy",Lucifer Morningstar has decided he's had enoug...,4.25
1,tt7913450,Lucifer,Soundtrack,0.0,0.258199,0.65996,"['Paul James', 'Callie Hernandez', 'Marianne J...","Drama,Musical",Music connects the lives of random people livi...,0.72451
2,tt1712192,Lucifer,Message from the King,0.0,0.6,0.443175,"['Chadwick Boseman', 'Luke Evans', 'Alfred Mol...","Action,Crime,Drama","A mysterious outsider from South Africa, named...",0.593175
3,tt2249007,Lucifer,Ray Donovan,0.0,0.774597,0.366539,"['Liev Schreiber', 'Eddie Marsan', 'Dash Mihok...","Crime,Drama","Ray Donovan, a professional ""fixer"" for the ri...",0.560188
4,tt3331028,Lucifer,Imperial Dreams,0.0,0.447214,0.434084,"['John Boyega', 'Rotimi', 'Glenn Plummer', ""De...",Drama,A 21-year-old reformed gangster's devotion to ...,0.545888
5,tt5884792,Lucifer,Disjointed,0.0,0.0,0.544587,"['Kathy Bates', 'Aaron Moten', 'Elizabeth Alde...",Comedy,Cannabis legend Ruth Whitefeather Feldman empl...,0.544587
6,tt5324116,Lucifer,Swedish Dicks,0.0,0.258199,0.452616,"['Peter Stormare', 'Johan Glans', 'Vivian Bang...","Comedy,Crime",Two unlicensed Swedish private investigators t...,0.517166
7,tt2507238,Lucifer,Me Him Her,0.0,0.0,0.496491,"['Dustin Milligan', 'Luke Bracey', 'Emily Mead...","Comedy,Romance",Vicenarian drifter Cory arrives in Los Angeles...,0.496491
8,tt1690967,Lucifer,Come Sunday,0.0,0.258199,0.430874,"['Chiwetel Ejiofor', 'Gerard Catus', 'Allie Mc...","Biography,Drama",Evangelist Carlton Pearson is ostracized by hi...,0.495423
9,tt8727582,Lucifer,Circus of Books,0.0,0.0,0.494567,"['Karen Mason', 'Barry Mason', 'Micah Mason', ...","Biography,Documentary,History","In 1976, Karen and Barry Mason had fallen on h...",0.494567


In [33]:
test4 = improved_recommendations('The Kominsky Method', 'dev')
test4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,imdb_id,search_title,title,actor_score,category_score,description_score,cast,genres,plot,scores
0,tt7255502,The Kominsky Method,The Kominsky Method,1.0,1.0,3.0,"['Michael Douglas', 'Sarah Baker', 'Graham Rog...","Comedy,Drama","An aging actor, who long ago enjoyed a brush w...",4.25
1,tt4685750,The Kominsky Method,Much Loved,0.0,0.57735,0.545605,"['Loubna Abidar', 'Asmaa Lazrak', 'Halima Kara...",Drama,A group of women in Morocco make a living as p...,0.689943
2,tt5324116,The Kominsky Method,Swedish Dicks,0.0,0.333333,0.471406,"['Peter Stormare', 'Johan Glans', 'Vivian Bang...","Comedy,Crime",Two unlicensed Swedish private investigators t...,0.554739
3,tt6708116,The Kominsky Method,Mauvaises herbes,0.0,0.57735,0.392392,"['Kheiron', 'Catherine Deneuve', 'André Dussol...",Comedy,"Wael (Kheiron) a former street child, makes a ...",0.536729
4,tt2562232,The Kominsky Method,Birdman or (The Unexpected Virtue of Ignorance),0.0,1.0,0.258212,"['Michael Keaton', 'Zach Galifianakis', 'Edwar...","Comedy,Drama",A washed-up superhero actor attempts to revive...,0.508212
5,tt10045434,The Kominsky Method,Kapi,0.0,1.0,0.254418,"['Kadir Inanir', 'Vahide Perçin', 'Timur Acar'...","Comedy,Drama",Yakup and Semsa are Syriac families living in ...,0.504418
6,tt4635276,The Kominsky Method,Master of None,0.0,1.0,0.248697,"['Aziz Ansari', 'Eric Wareheim', 'Lena Waithe'...","Comedy,Drama","The personal and professional life of Dev, a 3...",0.498697
7,tt10516390,The Kominsky Method,"Frankenstein's Monster's Monster, Frankenstein",0.0,0.57735,0.338177,"['David Harbour', 'Kate Berlant', 'Alex Ozerov...",Comedy,David Harbour delves into the enigmatic histor...,0.482515
8,tt11804034,The Kominsky Method,"Hi Bye, Mama!",0.0,0.774597,0.287139,"['Kim Tae-hee', 'Kyoo-hyung Lee', 'Ko Bo-Gyeol...","Comedy,Drama,Fantasy",It's the story of a mother who died and begins...,0.480788
9,tt12369754,The Kominsky Method,Staged,0.0,1.0,0.228857,"['Michael Sheen', 'David Tennant', 'Georgia Te...","Comedy,Drama",David Tennant and Michael Sheen star as two ac...,0.478857


In [34]:
test6 = improved_recommendations("Sweet Tooth", 'dev')
test6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,imdb_id,search_title,title,actor_score,category_score,description_score,cast,genres,plot,scores
0,tt12809988,Sweet Tooth,Sweet Tooth,1.0,1.0,3.0,"['Nonso Anozie', 'Christian Convery', 'Stefani...","Action,Adventure,Drama",A boy who is half human and half deer survives...,4.25
1,tt11829316,Sweet Tooth,Vampires,0.0,0.2,1.113196,"['Kate Moran', 'Mounir Amamra', 'Juliette Card...","Drama,Fantasy,Horror","A Parisian teenager who is half human, half va...",1.163196
2,tt10482560,Sweet Tooth,Kipo and the Age of Wonderbeasts,0.0,0.6,0.861105,"['Karen Fukuhara', 'Sydney Mikayla', 'Dee Brad...","Action,Adventure,Animation",A girl explores the possibilities in a post-ap...,1.011105
3,tt2459156,Sweet Tooth,Copenhagen,0.0,0.6,0.67796,"['Gethin Anthony', 'Frederikke Dahl Hansen', '...","Adventure,Drama,Romance","When the girl of your dreams is half your age,...",0.82796
4,tt0791205,Sweet Tooth,Ergo Proxy,0.0,0.6,0.491397,"['Kôji Yusa', 'Akiko Yajima', 'Rachel Hirschfe...","Action,Adventure,Animation",In a post-apocalyptic future humans live in pe...,0.641397
5,tt0416044,Sweet Tooth,Mongol,0.0,0.4,0.537138,"['Tadanobu Asano', 'Amadu Mamadakov', 'Khulan ...","Action,Biography,Drama",The story recounts the early life of Genghis K...,0.637138
6,tt5228026,Sweet Tooth,The Characters,0.0,0.0,0.622848,"['John T. Reynolds', 'Kate Berlant', 'John Ear...",Comedy,No rules. No expectations. A half hour to make...,0.622848
7,tt1920885,Sweet Tooth,Da yu hai tang,0.0,0.4,0.486789,"['Guanlin Ji', 'Shangqing Su', 'Timmy Xu', 'Sh...","Adventure,Animation,Drama",A 16-year-old girl travels to the human world ...,0.586789
8,tt4786824,Sweet Tooth,The Crown,0.0,0.2,0.515246,"['Claire Foy', 'Olivia Colman', 'Imelda Staunt...","Biography,Drama,History",Follows the political rivalries and romance of...,0.565246
9,tt1734135,Sweet Tooth,Trollhunters,0.0,0.6,0.399311,"['Charlie Saxton', 'Lexi Medrano', 'Anton Yelc...","Action,Adventure,Animation",An ordinary boy finds a magic amulet that choo...,0.549311


In [48]:
result = pd.DataFrame()
for i in searchable_titles:
    frames = improved_recommendations(i, 'prod')
    result = pd.concat([result, frames[1:6]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [49]:
result.shape

(6025, 7)

In [50]:
from dotenv import load_dotenv
load_dotenv()

import os

DATABASE_URL = 'postgresql+psycopg2://' + os.environ['DB_USER'] + ':' + os.environ['DB_PASSWORD']  + '@' + os.environ['DB_HOST'] + ':5432/' + os.environ['DB']
print(DATABASE_URL)

postgresql+psycopg2://gphliyrfcdlsmy:634a2ca99b57e6e5b2e771189219048ce85c9451eb5b6ad1b7c1164b444a2dba@ec2-35-153-114-74.compute-1.amazonaws.com:5432/dvff4i4r4j7ni


In [51]:
def write_to_db(df):
    """ insert a new df into the results table """
    try:
        engine = sqlalchemy.create_engine(DATABASE_URL)
        df.to_sql('results', engine, if_exists='replace',index=False) #drops old table and creates new empty table

    except (Exception, psycopg2.DatabaseError) as e:
        print(e)

def read_from_db():
    """ query data from the results table """
    try:
        engine = sqlalchemy.create_engine(DATABASE_URL)
        return pd.read_sql("SELECT * FROM results LIMIT 10", engine)
        
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)


def write_raw_to_db(df):
    """ insert a new df into the results table """
    try:
        engine = sqlalchemy.create_engine(DATABASE_URL)
        df.to_sql('titles', engine, if_exists='replace',index=False) #drops old table and creates new empty table

    except (Exception, psycopg2.DatabaseError) as e:
        print(e)


def read_from_raw_db():
    """ query data from the title table """
    try:
        engine = sqlalchemy.create_engine(DATABASE_URL)
        return pd.read_sql("SELECT * FROM titles WHERE title = 'Hard Breakers'", engine)
        
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)


In [52]:
write_to_db(result)

In [53]:
sql_table = read_from_db()
sql_table

Unnamed: 0,imdb_id,search_title,title,actor_score,category_score,description_score,scores
0,tt7913450,Lucifer,Soundtrack,0.0,0.258199,0.65996,0.72451
1,tt1712192,Lucifer,Message from the King,0.0,0.6,0.443175,0.593175
2,tt2249007,Lucifer,Ray Donovan,0.0,0.774597,0.366539,0.560188
3,tt3331028,Lucifer,Imperial Dreams,0.0,0.447214,0.434084,0.545888
4,tt5884792,Lucifer,Disjointed,0.0,0.0,0.544587,0.544587
5,tt3963816,Army of the Dead,Marauders,0.106383,0.6,0.284386,0.540769
6,tt6838918,Army of the Dead,Vault,0.042553,0.6,0.339995,0.532549
7,tt1712261,Army of the Dead,Triple 9,0.0,0.6,0.302364,0.452364
8,tt3110958,Army of the Dead,Now You See Me 2,0.0,0.2,0.398281,0.448281
9,tt1670345,Army of the Dead,Now You See Me,0.0,0.2,0.372027,0.422027


In [54]:
df_raw_sql = df.drop(['index', 'rating', 'numVotes',  'popular_rank', 'actors', 'categories'], axis=1)

In [55]:
write_raw_to_db(df_raw_sql)

In [56]:
sql_table = read_from_raw_db()
sql_table

Unnamed: 0,imdb_id,title,certificate,startYear,endYear,episodes,runtime,type,orign_country,language,plot,summary,genres,isAdult,cast,image_url
