In [11]:
import pandas as pd
from functools import reduce
import json
import psycopg2
import re
import sqlalchemy
import io
#import msgpack
#import msgpack_numpy as m

In [12]:
# load the data
df = pd.read_csv('./data/raw-data/netflix_list.csv')

In [13]:
df['type'].value_counts()

movie           2923
tvSeries        2199
tvEpisode        785
tvSpecial        391
tvMiniSeries     318
tvMovie          161
short            112
video            110
tvShort            6
videoGame          1
Name: type, dtype: int64

In [14]:
df = df[df['type'].isin(['movie', 'tvSeries', 'tvSpecial', 'tvMiniSeries', 'tvMovie '])]

# convert columns "director, listed_in, cast and country" in columns that contain a real list
# the strip function is applied on the elements
# if the value is NaN, the new column contains a empty list []
#df['categories'] = df['genres'].apply(lambda x: x.split(","))
#df['actors'] = df['cast'].apply(lambda x: x.split(","))
#df['countries'] = df['orign_country'].apply(lambda l: [] if pd.isna(l) else [i.strip() for i in l.split(",")])
df['summary'].fillna(value="i", inplace=True)
df['plot'].fillna(value="i", inplace=True)
df['genres'].fillna(value="i", inplace=True)
df['cast'].fillna(value="i", inplace=True)

df['genres'] = df['genres'].replace(0, 'i')
df['cast'] = df['cast'].replace(0, 'i')

df = df.reset_index()
df.head()

Unnamed: 0,index,imdb_id,title,popular_rank,certificate,startYear,endYear,episodes,runtime,type,orign_country,language,plot,summary,rating,numVotes,genres,isAdult,cast,image_url
0,0,tt4052886,Lucifer,1,15,2016.0,,93.0,42,tvSeries,United States,English,Lucifer Morningstar has decided he's had enoug...,"Lucifer Morningstar, bored from his sulking li...",8.1,250884.0,"Crime,Drama,Fantasy",0,"['Tom Ellis', 'Lauren German', 'Lesley-Ann Bra...",https://m.media-amazon.com/images/M/MV5BNzY1Yj...
1,1,tt0993840,Army of the Dead,2,18,2021.0,,,148,movie,United States,English,"Following a zombie outbreak in Las Vegas, a gr...","With the abandoned, walled city of Las Vegas o...",5.8,110780.0,"Action,Crime,Horror",0,"['Dave Bautista', 'Ella Purnell', 'Ana de la R...",https://m.media-amazon.com/images/M/MV5BNGY0Nz...
2,2,tt7255502,The Kominsky Method,3,18,2018.0,2021.0,22.0,30,tvSeries,United States,English,"An aging actor, who long ago enjoyed a brush w...",Michael Douglas plays an actor who made it big...,8.2,28795.0,"Comedy,Drama",0,"['Michael Douglas', 'Sarah Baker', 'Graham Rog...",https://m.media-amazon.com/images/M/MV5BMzA0YT...
3,3,tt0108778,Friends,4,13+,1994.0,2004.0,235.0,22,tvSeries,United States,English,Follows the personal and professional lives of...,"Ross Geller, Rachel Green, Monica Geller, Joey...",8.9,861843.0,"Comedy,Romance",0,"['Jennifer Aniston', 'Courteney Cox', 'Lisa Ku...",https://m.media-amazon.com/images/M/MV5BNDVkYj...
4,4,tt9251798,Ragnarok,5,18,2020.0,,12.0,45,tvSeries,Norway,Norwegian,A small Norwegian town experiencing warm winte...,In the small fictional town of Edda coming of ...,7.5,26606.0,"Action,Drama,Fantasy",0,"['David Stakston', 'Jonas Strand Gravli', 'Her...",https://m.media-amazon.com/images/M/MV5BODM3NT...


## Uding TFIDF to find similar movies based on description

- Filter out keywords that occur only once.
- Convert every word to its stem so that words such as Dogs and Dog are considered the same.

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import nltk

stemmer = SnowballStemmer('english')
    
# Define a function to perform both stemming and tokenization
def tokenize_and_stem(text):
    
    # Tokenize by sentence, then by word
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    
    # Filter out raw tokens to remove noise
    filtered_tokens = [token for token in tokens if re.search('[a-zA-Z]', token)]
    
    # Stem the filtered_tokens
    stems = [stemmer.stem(t) for t in filtered_tokens]
    
    return stems

# Build the tfidf matrix with the descriptions
text_content = df['plot']

vector = TfidfVectorizer(max_df=0.8, # drop words that occur in more than X percent of documents
                         max_features=200000,
                         min_df=3, # only use words that appear at least X times
                         stop_words='english', # remove stop words
                         lowercase=True, # Convert everything to lower case                          
                         tokenizer=tokenize_and_stem,
                         ngram_range=(1,3)
                        )

tfidf = vector.fit_transform(text_content)

  'stop_words.' % sorted(inconsistent))


In [16]:
cosine_similarities = cosine_similarity(tfidf,tfidf)

In [17]:
titles = df['title']
indices= pd.Series(df.index, index=df['title'])

def get_recommendations(title, cosine_sim, mode):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]
    movie_indices = [i[0] for i in sim_scores]
    movie_scores = [i[1] for i in sim_scores]
    movies = df.iloc[movie_indices]
    movies['scores'] = movie_scores
    movies['search_title'] = title

    if(mode == 'dev'):
        return movies[['imdb_id', 'search_title', 'title', 'scores', 'cast', 'genres', 'plot']]
    else:
        return movies[['imdb_id', 'search_title', 'title', 'scores']]


In [18]:
df_words = pd.DataFrame(vector.transform(text_content).todense(),
                   columns=vector.get_feature_names(), index=df.index)

In [19]:
df_words

Unnamed: 0,'d,'s,'s abil,'s anoth,'s belov,'s best,'s best friend,'s better,'s big,'s biggest,...,zack,zafer,zani,zealand,zhang,zhen,zombi,zombi apocalyps,zombi outbreak,zone
0,0.0,0.088773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.218266,0.0,0.276002,0.264483
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5826,0.0,0.139129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000
5827,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000
5828,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000
5829,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000


### House of Cards

In [20]:
get_recommendations('House of Cards', cosine_similarities, 'plot')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,imdb_id,search_title,title,scores
254,tt1837642,House of Cards,Revenge,0.467497
1626,tt9815714,House of Cards,The Hard Way,0.377925
39,tt11937816,House of Cards,¿Quién Mató a Sara?,0.334838
1035,tt1534360,House of Cards,Ezel,0.293435
1574,tt12042964,House of Cards,The World of the Married,0.235956


In [21]:
df_words.transpose().sort_values(254, ascending=False).head(10).transpose().loc[254, :]

troubl young    0.389958
exact reveng    0.379454
exact           0.351788
emot            0.285247
reveng          0.280415
wrong           0.279281
young woman     0.269228
troubl          0.268332
father          0.217518
peopl           0.209492
Name: 254, dtype: float64

In [22]:
df_words.transpose().sort_values(147, ascending=False).head(10).transpose().loc[147, :]

exact reveng    0.454553
exact           0.421411
equal           0.410096
betray          0.355437
reveng          0.335913
wife            0.292848
peopl           0.250953
work            0.244346
'd              0.000000
plagu           0.000000
Name: 147, dtype: float64

### Narcos

In [23]:
get_recommendations('Narcos', cosine_similarities, 'plot')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,imdb_id,search_title,title,scores
830,tt1355631,Narcos,The Infiltrator,0.473118
4326,tt14518692,Narcos,Suriname,0.255838
4920,tt1991034,Narcos,Dust Up,0.225698
928,tt1524137,Narcos,Contraband,0.222269
4783,tt5344086,Narcos,Chapo: el escape del siglo,0.221722


In [24]:
df_words.transpose().sort_values(830, ascending=False).head(10).transpose().loc[830, :]

escobar      0.319709
launder      0.319709
pablo        0.311098
custom       0.288415
drug lord    0.284349
colombian    0.284349
offici       0.266212
scheme       0.266212
lord         0.261667
u.s.         0.236006
Name: 830, dtype: float64

In [25]:
df_words.transpose().sort_values(4326, ascending=False).head(10).transpose().loc[4326, :]

base true stori    0.317990
man becom          0.310799
south america      0.304718
drug lord          0.290649
base true          0.286890
lord               0.267464
korean             0.265332
true stori         0.263308
south              0.221997
america            0.218681
Name: 4326, dtype: float64

### Okja

In [26]:
get_recommendations('Okja', cosine_similarities, 'plot')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,imdb_id,search_title,title,scores
76,tt6474378,Okja,Good Girls,0.268818
2934,tt4466894,Okja,Sahara,0.235927
1657,tt3061050,Okja,Clarence,0.224364
2661,tt1870529,Okja,Won't Back Down,0.221068
3679,tt3263996,Okja,The Overnighters,0.209044


In [27]:
df_words.transpose().sort_values(2120, ascending=False).head(10).transpose().loc[2120, :]

lee            0.454380
smuggler       0.293650
south korea    0.272239
prosecutor     0.256146
kim            0.251282
underworld     0.249073
1970s          0.245022
public         0.238063
empir          0.235025
korea          0.232222
Name: 2120, dtype: float64

In [28]:
df_words.transpose().sort_values(642, ascending=False).head(10).transpose().loc[642, :]

multin          0.347437
risk everyth    0.323968
fascin          0.305013
beast           0.301365
prevent         0.279942
risk            0.274081
young girl      0.261563
kidnap          0.242363
compani         0.241516
everyth         0.227406
Name: 642, dtype: float64

### Using Count Vectorizer to find similar movies based on list of actors, directors, genres combined

In [29]:
df['actors'] = df['cast'].apply(lambda l: [] if pd.isna(l) else [re.sub('[^A-Za-z0-9]+', '', i) for i in l.split(",")])
#give high rating to top 3 actors
df['actors'] = df['actors'].apply(lambda x: [val + ',' + val if i<=2 else val for i,val in enumerate(x)])
df['actors'] = df['actors'].apply(lambda x: [val.split(',') for val in x])
df['actors'] = df['actors'].apply(lambda x: [item for sublist in x for item in sublist])
df['actors'] = df['actors'].apply(lambda x: ' '.join(x))

df['categories'] = df['genres'].apply(lambda l: [] if pd.isna(l) else [re.sub('[^A-Za-z0-9]+', '', i) for i in l.split(",")])
df['categories'] = df['categories'].apply(lambda x: ' '.join(x))

In [30]:
df['soup'] = df['actors'] + df['categories']
df['soup'].head()

0    TomEllis TomEllis LaurenGerman LaurenGerman Le...
1    DaveBautista DaveBautista EllaPurnell EllaPurn...
2    MichaelDouglas MichaelDouglas SarahBaker Sarah...
3    JenniferAniston JenniferAniston CourteneyCox C...
4    DavidStakston DavidStakston JonasStrandGravli ...
Name: soup, dtype: object

In [31]:
def countVectorizer(col):
    count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
    count_matrix = count.fit_transform(df[col])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    return cosine_sim

In [32]:
cosine_sim = countVectorizer('soup')

In [33]:
get_recommendations('Okja', cosine_sim, 'soup')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,imdb_id,search_title,title,scores
2490,tt14301644,Okja,BRZRKR,0.187867
3086,tt13802576,Okja,Lockwood & Co,0.187867
3121,tt9103802,Okja,John Henry and the Statesmen,0.187867
3208,tt13016388,Okja,The Three-Body Problem,0.187867
3421,tt0384580,Okja,The Talisman,0.187867


In [34]:
get_recommendations('House of Cards', cosine_sim, 'soup')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,imdb_id,search_title,title,scores
1453,tt7221090,House of Cards,Gore,0.106383
172,tt1210166,House of Cards,Moneyball,0.102126
1745,tt14181874,House of Cards,The Mothership,0.06083
280,tt4975722,House of Cards,Moonlight,0.042553
5362,tt4384904,House of Cards,The 21st Annual Screen Actors Guild Awards,0.042553


### Combine each description, actors, genres similarity score to derive a total score

In [35]:
def improved_recommendations(title, mode):
    df_actor_category = get_recommendations(title, cosine_sim, "").rename(columns={'scores': 'actor_category_comb_score'})
    df_description = get_recommendations(title, cosine_similarities, mode).rename(columns={'scores': 'description_score'})
    data_frames = [df_actor_category, df_description]
    df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['search_title', 'title', 'imdb_id'], how='outer'), data_frames)
    df_merged = df_merged.fillna(0)
    df_merged['scores'] = df_merged['actor_category_comb_score'] + df_merged['description_score'] 
    df_merged = df_merged.sort_values('scores', ascending=False).reset_index(drop=True).head(5)
    
    # extract image url
    #for i,row in df_merged.iterrows():
        #image = df_photosAll[(df_photosAll['show_id'] == row.show_id)]['url']
        #df_merged.loc[i, 'image'] = image
        
    return df_merged


In [36]:
test1 = improved_recommendations('House of Cards', 'dev')
test1.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,imdb_id,search_title,title,actor_category_comb_score,description_score,cast,genres,plot,scores
0,tt1837642,House of Cards,Revenge,0.0,0.467497,"['Madeleine Stowe', 'Emily VanCamp', 'Gabriel ...","Drama,Mystery,Thriller",An emotionally troubled young woman sets out t...,0.467497
1,tt9815714,House of Cards,The Hard Way,0.0,0.377925,"['Michael Jai White', 'Luke Goss', 'Randy Cout...",Action,After learning his brother died on a mission i...,0.377925
2,tt11937816,House of Cards,¿Quién Mató a Sara?,0.0,0.334838,"['Manolo Cardona', 'Ginés García Millán', 'Car...","Crime,Drama,Mystery",Hell-bent on exacting revenge and proving he w...,0.334838
3,tt1534360,House of Cards,Ezel,0.0,0.293435,"['Kenan Imirzalioglu', 'Cansu Dere', 'Yigit Öz...","Crime,Drama,Thriller",Betrayed by his trusted friends and the woman ...,0.293435
4,tt12042964,House of Cards,The World of the Married,0.0,0.235956,"['Kim Hee-ae', 'Park Hae-Joon', 'So-hee Han', ...","Drama,Romance",A story about a married couple whose betrayal ...,0.235956


In [37]:
test2 = improved_recommendations('Okja', 'dev')
test2.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,imdb_id,search_title,title,actor_category_comb_score,description_score,cast,genres,plot,scores
0,tt6474378,Okja,Good Girls,0.0,0.268818,"['Christina Hendricks', 'Retta', 'Mae Whitman'...","Comedy,Crime,Drama",Three suburban mothers suddenly find themselve...,0.268818
1,tt4466894,Okja,Sahara,0.0,0.235927,"['Omar Sy', 'Louane Emera', 'Franck Gastambide...","Adventure,Animation,Comedy",A young cobra and his scorpion best friend go ...,0.235927
2,tt3061050,Okja,Clarence,0.0,0.224364,"['Spencer Rothbell', 'Katie Crown', 'Tom Kenny...","Adventure,Animation,Comedy",The adventures of a 4th grader named Clarence ...,0.224364
3,tt1870529,Okja,Won't Back Down,0.0,0.221068,"['Viola Davis', 'Maggie Gyllenhaal', 'Holly Hu...",Drama,"Two determined mothers­, one a teacher, look t...",0.221068
4,tt3263996,Okja,The Overnighters,0.0,0.209044,"['Jay Reinke', 'Andrea Reinke', 'Alan Mezo', '...","Documentary,Drama","Broken, desperate men chase their dreams and r...",0.209044


In [38]:
test3 = improved_recommendations('Lucifer', 'dev')
test3.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,imdb_id,search_title,title,actor_category_comb_score,description_score,cast,genres,plot,scores
0,tt1220627,Lucifer,Hard Breakers,0.0,0.286566,"['Cameron Richardson', 'Sophie Monk', 'Chris K...",Comedy,A comedy centered on two single girls working ...,0.286566
1,tt7913450,Lucifer,Soundtrack,0.0,0.262461,"['Paul James', 'Callie Hernandez', 'Marianne J...","Drama,Musical",Music connects the lives of random people livi...,0.262461
2,tt2249007,Lucifer,Ray Donovan,0.0,0.242244,"['Liev Schreiber', 'Eddie Marsan', 'Dash Mihok...","Crime,Drama","Ray Donovan, a professional ""fixer"" for the ri...",0.242244
3,tt2507238,Lucifer,Me Him Her,0.0,0.239014,"['Dustin Milligan', 'Luke Bracey', 'Emily Mead...","Comedy,Romance",Vicenarian drifter Cory arrives in Los Angeles...,0.239014
4,tt13357124,Lucifer,Gokushufudo,0.0,0.230935,"['Kenjirô Tsuda', 'Shizuka Itô', 'Kazuyuki Oki...","Action,Animation,Comedy",A retired gangster spends his time as a househ...,0.230935


In [39]:
test3 = improved_recommendations('The Kominsky Method', 'dev')
test3.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,imdb_id,search_title,title,actor_category_comb_score,description_score,cast,genres,plot,scores
0,tt5324116,The Kominsky Method,Swedish Dicks,0.0,0.208895,"['Peter Stormare', 'Johan Glans', 'Vivian Bang...","Comedy,Crime",Two unlicensed Swedish private investigators t...,0.208895
1,tt4685750,The Kominsky Method,Much Loved,0.0,0.198461,"['Loubna Abidar', 'Asmaa Lazrak', 'Halima Kara...",Drama,A group of women in Morocco make a living as p...,0.198461
2,tt8690728,The Kominsky Method,Goblin Slayer,0.0,0.173938,"['Yuichiro Umehara', 'Yui Ogura', 'Hayden Davi...","Action,Adventure,Animation","In a fantasy world, a lone hero makes his livi...",0.173938
3,tt6708116,The Kominsky Method,Mauvaises herbes,0.0,0.172227,"['Kheiron', 'Catherine Deneuve', 'André Dussol...",Comedy,"Wael (Kheiron) a former street child, makes a ...",0.172227
4,tt12930602,The Kominsky Method,The Playbook,0.0,0.162706,"['Jill Ellis', 'Patrick Mouratoglou', 'José Mo...","Documentary,Sport",The Playbook profiles legendary coaches as the...,0.162706


In [40]:
result = pd.DataFrame()
for i in df['title'].unique()[0: 3]:
    frames = improved_recommendations(i, 'prod')
    result = pd.concat([result, frames])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [56]:
from dotenv import load_dotenv
load_dotenv()

import os

DATABASE_URL = 'postgresql+psycopg2://' + os.environ['DB_USER'] + ':' + os.environ['DB_PASSWORD']  + '@' + os.environ['DB_HOST'] + ':5432/' + os.environ['DB']
print(DATABASE_URL)

postgresql+psycopg2://gphliyrfcdlsmy@634a2ca99b57e6e5b2e771189219048ce85c9451eb5b6ad1b7c1164b444a2dba@ec2-35-153-114-74.compute-1.amazonaws.com:5432/dvff4i4r4j7ni


In [57]:
def write_to_db(df):
    """ insert a new df into the results table """
    try:

        engine = sqlalchemy.create_engine(DATABASE_URL)

        df.to_sql('results', engine, if_exists='replace',index=False) #drops old table and creates new empty table

    except (Exception, psycopg2.DatabaseError) as e:
        print(e)

def read_from_db():
    """ query data from the results table """
    try:
        engine = sqlalchemy.create_engine(DATABASE_URL)
        return pd.read_sql("SELECT * FROM results LIMIT 10", engine)
        
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)


def write_raw_to_db(df):
    """ insert a new df into the results table """
    try:

        engine = sqlalchemy.create_engine(DATABASE_URL)

        df.to_sql('titles', engine, if_exists='replace',index=False) #drops old table and creates new empty table

    except (Exception, psycopg2.DatabaseError) as e:
        print(e)

In [58]:
write_to_db(result)

(psycopg2.OperationalError) fe_sendauth: no password supplied

(Background on this error at: https://sqlalche.me/e/14/e3q8)


In [55]:
sql_table = read_from_db()
sql_table

(psycopg2.OperationalError) fe_sendauth: no password supplied

(Background on this error at: https://sqlalche.me/e/14/e3q8)


In [45]:
df_raw_sql = df.drop(['index', 'rating', 'numVotes',  'popular_rank', 'actors', 'categories', 'soup'], axis=1)

In [46]:
write_raw_to_db(df_raw_sql)