In [2]:
import pandas as pd
import pickle
from functools import reduce
import json
import requests
from bs4 import BeautifulSoup
import pickle

In [2]:
# load the data
df = pd.read_csv('./data/netflix_titles.csv')
# convert to datetime
df["date_added"] = pd.to_datetime(df['date_added'])
df['year'] = df['date_added'].dt.year
df['month'] = df['date_added'].dt.month
df['day'] = df['date_added'].dt.day
# convert columns "director, listed_in, cast and country" in columns that contain a real list
# the strip function is applied on the elements
# if the value is NaN, the new column contains a empty list []
df['directors'] = df['director'].apply(lambda l: [] if pd.isna(l) else [i.strip() for i in l.split(",")])
df['categories'] = df['listed_in'].apply(lambda l: [] if pd.isna(l) else [i.strip() for i in l.split(",")])
df['actors'] = df['cast'].apply(lambda l: [] if pd.isna(l) else [i.strip() for i in l.split(",")])
df['countries'] = df['country'].apply(lambda l: [] if pd.isna(l) else [i.strip() for i in l.split(",")])

df = df.reset_index()
df.head()

Unnamed: 0,index,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year,month,day,directors,categories,actors,countries
0,0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China",2019-09-09,2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...,2019.0,9.0,9.0,"[Richard Finn, Tim Maltby]","[Children & Family Movies, Comedies]","[Alan Marriott, Andrew Toth, Brian Dobson, Col...","[United States, India, South Korea, China]"
1,1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,2016-09-09,2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...,2016.0,9.0,9.0,[],[Stand-Up Comedy],[Jandino Asporaat],[United Kingdom]
2,2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,2018-09-08,2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob...",2018.0,9.0,8.0,[],[Kids' TV],"[Peter Cullen, Sumalee Montano, Frank Welker, ...",[United States]
3,3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,2018-09-08,2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...,2018.0,9.0,8.0,[],[Kids' TV],"[Will Friedle, Darren Criss, Constance Zimmer,...",[United States]
4,4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,2017-09-08,2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...,2017.0,9.0,8.0,[Fernando Lebrija],[Comedies],"[Nesta Cooper, Kate Walsh, John Michael Higgin...",[United States]


#### Should i only filter the top 3 actors that appear in the credits list?

In [3]:
for i in df[df['title'] == 'House of Cards'].actors:
    print(i)

['Kevin Spacey', 'Robin Wright', 'Kate Mara', 'Corey Stoll', 'Sakina Jaffrey', 'Kristen Connolly', 'Constance Zimmer', 'Sebastian Arcelus', 'Nathan Darrow', 'Sandrine Holt', 'Michel Gill', 'Elizabeth Norment', 'Mahershala Ali', 'Reg E. Cathey', 'Molly Parker', 'Derek Cecil', 'Elizabeth Marvel', 'Kim Dickens', 'Lars Mikkelsen', 'Michael Kelly', 'Joel Kinnaman', 'Campbell Scott', 'Patricia Clarkson', 'Neve Campbell']


In [4]:
for i in df[df['title'] == 'Stranger'].actors:
    print(i)

['Seung-woo Cho', 'Doona Bae', 'Joon-hyuk Lee', 'Kyeong-yeong Lee', 'Jae-myung Yoo', 'Hye-sun Shin']


In [5]:
for i in df[df['title'] == 'Inception'].actors:
    print(i)

['Leonardo DiCaprio', 'Joseph Gordon-Levitt', 'Ellen Page', 'Tom Hardy', 'Ken Watanabe', 'Dileep Rao', 'Cillian Murphy', 'Tom Berenger', 'Marion Cotillard', 'Pete Postlethwaite', 'Michael Caine', 'Lukas Haas']


In [6]:
for i in df[df['title'] == 'Terrace House: Boys & Girls in the City'].actors:
    print(i)

['You', 'Reina Triendl', 'Ryota Yamasato', 'Yoshimi Tokui', 'Azusa Babazono', 'Ayumu Mochizuki', 'Kentaro']


## Uding TFIDF to find similar movies based on description

- Filter out keywords that occur only once.
- Convert every word to its stem so that words such as Dogs and Dog are considered the same.

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

stemmer = SnowballStemmer('english')
    
# Build the tfidf matrix with the descriptions
text_content = df['description']
vector = TfidfVectorizer(max_df=0.4,         # drop words that occur in more than X percent of documents
                             min_df=1,      # only use words that appear at least X times
                             stop_words='english', # remove stop words
                             lowercase=True, # Convert everything to lower case 
                             use_idf=True,   # Use idf
                             norm=u'l2',     # Normalization
                             smooth_idf=True # Prevents divide-by-zero errors
                            )
tfidf = vector.fit_transform(text_content)

In [8]:
cosine_similarities = cosine_similarity(tfidf,tfidf)

In [9]:
titles = df['title']
indices= pd.Series(df.index, index=df['title'])

def get_recommendations(title, cosine_sim, col):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    movie_scores = [i[1] for i in sim_scores]
    movies = df.iloc[movie_indices]
    movies['scores'] = movie_scores
    if(col == 'soup'):
        return movies[['index', 'show_id', 'title', 'actors', 'directors', 'categories', 'scores']]
    else:
        return movies[['index', 'show_id', 'title', col, 'scores']]

In [10]:
df_words = pd.DataFrame(vector.transform(text_content).todense(),
                   columns=vector.get_feature_names(), index=df.index)

### Stranger

In [11]:
get_recommendations('Stranger', cosine_similarities, 'description')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,index,show_id,title,description,scores
3959,3959,80144372,Asura: The City of Madness,Caught between a corrupt mayor and a prosecuto...,0.20311
2899,2899,80155863,City of Tiny Lights,"In moody London, a burned-out sleuth sinks int...",0.186154
2079,2079,80222788,Day and Night,A detective assists with an investigation into...,0.159067
1700,1700,80121973,Behzat Ç.,An Ankara homicide detective with a dark perso...,0.149908
4643,4643,81149202,Why Me?,A young prosecutor is assigned a career-making...,0.149633
1971,1971,80174617,Borderliner,"To protect his family, a police detective cove...",0.142091
6033,6033,80142550,The Break,A police detective mourning a painful loss mov...,0.141524
685,685,80992769,Dark Crimes,A detective on a cold murder case discovers th...,0.132677
4543,4543,80218783,Mantra,An Indian business owner struggles to keep his...,0.131198
1773,1773,81053958,Backdraft 2,"In this sequel to the 1991 film, a determined ...",0.128617


In [12]:
df_words.transpose().sort_values(1514, ascending=False).head(10).transpose().loc[1514, :]

gutsy         0.342910
empathy       0.322635
feel          0.299304
prosecutor    0.299304
ability       0.284709
tackles       0.281767
corruption    0.242134
lost          0.237227
amid          0.234551
female        0.233697
Name: 1514, dtype: float64

In [13]:
df.loc[1514, :].description

'With the help of a gutsy female detective, a prosecutor who has lost the ability to feel empathy tackles a murder case amid political corruption.'

In [14]:
df.loc[3959, :].description

"Caught between a corrupt mayor and a prosecutor intent on exposing political depravity, a police detective must decide whose side he's on."

### Narcos

In [15]:
get_recommendations('Narcos', cosine_similarities, 'description')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,index,show_id,title,description,scores
1583,1583,80100933,Miss Dynamite,"Wealthy, beautiful Valentina falls in love, on...",0.243176
4857,4857,80035689,El Cartel 2,Drug trafficker Pepe Cadena navigates the trea...,0.218326
1257,1257,80997085,Narcos: Mexico,Witness the birth of the Mexican drug war in t...,0.202732
5939,5939,80133042,El Chapo,This drama series chronicles the true story of...,0.189741
5773,5773,70236561,Top Boy,"In this gritty, stylish drama series, two Lond...",0.189012
5162,5162,80106150,Cocaine,Three films chronicle the cocaine trade's swee...,0.178188
732,732,81034012,Street Flow,"Three brothers – a gangster, a scholar and an ...",0.162791
1480,1480,70303431,Raja Natwarlal,A small-time con man assembles a team to help ...,0.155327
1833,1833,81021631,Two Graves,A doctor and a drug addict kidnap the son of a...,0.143659
3444,3444,80125593,Historia de un clan,This drama based on a true story follows the P...,0.141224


In [16]:
df_words.transpose().sort_values(5637, ascending=False).head(10).transpose().loc[5637, :]

infamously    0.385197
fuels         0.355677
cartels       0.346174
colombia      0.321142
gritty        0.308890
gangster      0.275962
violent       0.251663
powerful      0.239644
drama         0.217334
drug          0.217334
Name: 5637, dtype: float64

In [17]:
df.loc[5637, :].description

"The true story of Colombia's infamously violent and powerful drug cartels fuels this gritty gangster drama series."

In [19]:
df.loc[1583, :].description

"Wealthy, beautiful Valentina falls in love, only to realize that her man and her family are involved with one of Mexico's most powerful drug cartels."

In [21]:
df.loc[1257, :].description

'Witness the birth of the Mexican drug war in the 1980s as a gritty new "Narcos" saga chronicles the true story of the Guadalajara cartel\'s ascent.'

### House of Cards

In [23]:
get_recommendations('House of Cards', cosine_similarities, 'description')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,index,show_id,title,description,scores
5789,5789,70140358,Arrested Development,It's the Emmy-winning story of a wealthy famil...,0.226158
5607,5607,70210884,American Horror Story,This twisted Emmy-winning drama plays upon the...,0.185595
5265,5265,80154638,President,"An ambitious, talented politician embarks on a...",0.152871
3402,3402,80158543,Alibaba Aur 40 Chor,A simple village man is thrown into a web of p...,0.151945
4049,4049,80141928,Before the Flood,Leonardo DiCaprio crisscrosses the globe to in...,0.150906
5873,5873,80113647,Designated Survivor,America's fate rests in the hands of a low-lev...,0.148259
5942,5942,70242311,Orange Is the New Black,A privileged New Yorker ends up in a women's p...,0.148125
5501,5501,70060380,A Dangerous Woman,At the center of this engrossing melodrama is ...,0.142923
1123,1123,80126449,Cuba and the Cameraman,Emmy-winning filmmaker Jon Alpert chronicles t...,0.140332
211,211,70129452,Louis C.K.: Hilarious,Emmy-winning comedy writer Louis C.K. brings h...,0.139787


In [24]:
df.loc[5741, :].description

'A ruthless politician will stop at nothing to conquer Washington, D.C., in this Emmy and Golden Globe-winning political drama.'

In [26]:
df.loc[5789, :].description

"It's the Emmy-winning story of a wealthy family that lost everything, and the one son who had no choice but to keep them all together."

In [28]:
df.loc[4049, :].description

'Leonardo DiCaprio crisscrosses the globe to investigate the consequences of man-made globe warming and the measures being taken to reverse it.'

### Inception

In [30]:
get_recommendations('Inception', cosine_similarities, 'description')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,index,show_id,title,description,scores
4994,4994,81122206,Til Death Do Us Part,Based on the speculative short stories of Mirr...,0.185504
1924,1924,70176655,Apollo 18,"This sci-fi thriller suggests that Apollo 17, ...",0.181247
5875,5875,70264888,Black Mirror,This sci-fi anthology series explores a twiste...,0.149816
329,329,81154956,Transformers: Cyberverse,Optimus Prime and the AllSpark are missing – a...,0.146768
3939,3939,81067758,Candyflip,"On the incandescent shores of Goa, a young man...",0.146739
1879,1879,80176698,Abby Sen,"After losing his seventh job in a row, a TV pr...",0.144639
3891,3891,80097140,Altered Carbon,"After 250 years on ice, a prisoner returns to ...",0.142263
5892,5892,80100172,Dark,A missing child sets four families on a franti...,0.138113
150,150,80124522,Maniac,Two struggling strangers connect during a mind...,0.135614
563,563,70295169,Hunter's Prayer,A hired gun has second thoughts when he's cont...,0.127074


In [31]:
df_words.transpose().sort_values(3524, ascending=False).head(10).transpose().loc[3524, :]

subconscious    0.319833
entering        0.319833
mold            0.305496
espionage       0.275534
thoughts        0.270812
fi              0.266647
sci             0.266647
bending         0.262922
built           0.262922
targets         0.253644
Name: 3524, dtype: float64

In [32]:
df.loc[3524, :].description

'In this mind-bending sci-fi thriller, a man runs an espionage business built around entering the subconscious of his targets to mold their thoughts.'

In [34]:
df.loc[4994, :].description

'Based on the speculative short stories of MirrorFiction, this sci-fi thriller anthology plunges headlong into our deepest desires – and darkest fears.'

In [36]:
df.loc[5875, :].description

"This sci-fi anthology series explores a twisted, high-tech near-future where humanity's greatest innovations and darkest instincts collide."

### Using Count Vectorizer to find similar movies based on list of actors, directors, genres combined

In [38]:
df['actors'] = df['actors'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
#give high rating to top 3 actors
df['actors'] = df['actors'].apply(lambda x: [val + ',' + val if i<=2 else val for i,val in enumerate(x)])
df['actors'] = df['actors'].apply(lambda x: [val.split(',') for val in x])
df['actors'] = df['actors'].apply(lambda x: [item for sublist in x for item in sublist])
df['actors'] = df['actors'].apply(lambda x: ' '.join(x))

In [39]:
df['actors'][0]

'alanmarriott alanmarriott andrewtoth andrewtoth briandobson briandobson colehoward jennifercameron jonathanholmes leetockar lisadurupt mayakay michaeldobson'

In [40]:
df['directors'] = df['directors'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
df['directors'] = df['directors'].apply(lambda x: x * 3) #give higher rating to directors
df['directors'] = df['directors'].apply(lambda x: ' '.join(x))

In [41]:
df['directors'][0]

'richardfinn timmaltby richardfinn timmaltby richardfinn timmaltby'

In [42]:
df['categories'] = df['categories'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
df['categories'] = df['categories'].apply(lambda x: ' '.join(x))

In [43]:
df['soup'] = df['actors'] + df['directors'] + df['categories']
#df['soup'] = df['soup'].apply(lambda x: ' '.join(x))

In [44]:
def countVectorizer(col):
    count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
    count_matrix = count.fit_transform(df[col])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    return cosine_sim

In [45]:
cosine_sim = countVectorizer('soup')

In [46]:
get_recommendations('Okja', cosine_sim, 'soup')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,index,show_id,title,actors,directors,categories,scores
4905,4905,80211623,Otherhood,,,comedies,0.145865
1196,1196,70044686,Zodiac,markruffalo markruffalo jakegyllenhaal jakegyl...,davidfincher davidfincher davidfincher,cultmovies dramas thrillers,0.140143
1128,1128,70235384,End of Watch,jakegyllenhaal jakegyllenhaal michaelpeña mich...,davidayer davidayer davidayer,action&adventure,0.136682
2958,2958,70293661,Enemy,jakegyllenhaal jakegyllenhaal mélanielaurent m...,denisvilleneuve denisvilleneuve denisvilleneuve,independentmovies thrillers,0.135432
1828,1828,70270364,Snowpiercer,chrisevans chrisevans songkang-ho songkang-ho ...,bongjoonho bongjoonho bongjoonho,action&adventure cultmovies internationalmovies,0.135242
2784,2784,80176715,The Legacy of a Whitetail Deer Hunter,joshbrolin joshbrolin dannymcbride dannymcbrid...,jodyhill jodyhill jodyhill,action&adventure comedies dramas,0.130991
4042,4042,80199689,Velvet Buzzsaw,jakegyllenhaal jakegyllenhaal renerusso reneru...,dangilroy dangilroy dangilroy,dramas thrillers,0.126959
4069,4069,80017528,Happy New Year,shahrukhkhan shahrukhkhan deepikapadukone deep...,farahkhan farahkhan farahkhan,action&adventure comedies dramas,0.126959
4308,4308,80194671,The Climb,ahmedsylla ahmedsylla alicebelaïdi alicebelaïd...,ludovicbernard ludovicbernard ludovicbernard,action&adventure comedies dramas,0.126959
5188,5188,81091424,Petta (Telugu Version),rajnikanth rajnikanth vijaysethupathi vijayset...,karthiksubbaraj karthiksubbaraj karthiksubbaraj,action&adventure comedies dramas,0.126959


In [47]:
get_recommendations('The Matrix', cosine_sim, 'soup')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,index,show_id,title,actors,directors,categories,scores
1423,1423,60027695,The Matrix Reloaded,keanureeves keanureeves laurencefishburne laur...,lillywachowski lanawachowski lillywachowski la...,action&adventure sci-fi&fantasy,0.784987
1424,1424,60031303,The Matrix Revolutions,keanureeves keanureeves laurencefishburne laur...,lillywachowski lanawachowski lillywachowski la...,action&adventure sci-fi&fantasy,0.781345
5064,5064,70301367,Jupiter Ascending,milakunis milakunis channingtatum channingtatu...,lanawachowski lillywachowski lanawachowski lil...,action&adventure sci-fi&fantasy,0.306491
2556,2556,80231601,The Darkest Dawn,,,action&adventure internationalmovies sci-fi&fa...,0.227921
5131,5131,70248183,Cloud Atlas,tomhanks tomhanks halleberry halleberry jimbro...,lillywachowski lanawachowski tomtykwer lillywa...,action&adventure cultmovies dramas,0.209329
324,324,70043303,The Lake House,keanureeves keanureeves sandrabullock sandrabu...,alejandroagresti alejandroagresti alejandroagr...,dramas romanticmovies sci-fi&fantasy,0.19676
3507,3507,1179574,Event Horizon,laurencefishburne laurencefishburne samneill s...,paulw.s.anderson paulw.s.anderson paulw.s.ande...,horrormovies sci-fi&fantasy,0.176419
1500,1500,80205563,Golden Time,,takuyainaba takuyainaba takuyainaba,internationalmovies sci-fi&fantasy,0.174714
2855,2855,70278930,Man of Tai Chi,keanureeves keanureeves tigerchen tigerchen ka...,keanureeves keanureeves keanureeves,action&adventure dramas,0.161394
824,824,60001761,Supergirl,fayedunaway fayedunaway hartbochner hartbochne...,jeannotszwarc jeannotszwarc jeannotszwarc,action&adventure sci-fi&fantasy,0.158397


### Using Count Vectorizer to find similar movies based on each actors, directors, genres similarity score

#### Actors

In [48]:
cosine_sim_actors = countVectorizer('actors')

In [49]:
get_recommendations('The Matrix', cosine_sim_actors, 'cast')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,index,show_id,title,cast,scores
1423,1423,60027695,The Matrix Reloaded,"Keanu Reeves, Laurence Fishburne, Carrie-Anne ...",0.683741
1424,1424,60031303,The Matrix Revolutions,"Keanu Reeves, Laurence Fishburne, Carrie-Anne ...",0.675211
2566,2566,80128245,Brain on Fire,"Chloë Grace Moretz, Thomas Mann, Richard Armit...",0.1849
4584,4584,70243446,Silent Hill: Revelation,"Deborah Kara Unger, Adelaide Clemens, Sean Bea...",0.17841
5915,5915,80002311,Marvel's Jessica Jones,"Krysten Ritter, David Tennant, Rachael Taylor,...",0.172559
5491,5491,80103336,The Bye Bye Man,"Douglas Smith, Lucien Laviscount, Cressida Bon...",0.167248
140,140,80099204,The Bad Batch,"Suki Waterhouse, Jason Momoa, Keanu Reeves, Ji...",0.166945
2997,2997,70300666,The Signal,"Brenton Thwaites, Olivia Cooke, Laurence Fishb...",0.166945
4600,4600,80065754,Standoff,"Thomas Jane, Laurence Fishburne, Joanna Dougla...",0.160128
2855,2855,70278930,Man of Tai Chi,"Keanu Reeves, Tiger Chen, Karen Mok, Simon Yam...",0.154083


#### Directors

In [50]:
cosine_sim_director = countVectorizer('directors')

In [51]:
get_recommendations('The Matrix', cosine_sim_director, 'directors')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,index,show_id,title,directors,scores
1423,1423,60027695,The Matrix Reloaded,lillywachowski lanawachowski lillywachowski la...,1.0
1424,1424,60031303,The Matrix Revolutions,lillywachowski lanawachowski lillywachowski la...,1.0
5064,5064,70301367,Jupiter Ascending,lanawachowski lillywachowski lanawachowski lil...,0.967742
5131,5131,70248183,Cloud Atlas,lillywachowski lanawachowski tomtykwer lillywa...,0.692763
0,0,81145628,Norm of the North: King Sized Adventure,richardfinn timmaltby richardfinn timmaltby ri...,0.0
1,1,80117401,Jandino: Whatever it Takes,,0.0
2,2,70234439,Transformers Prime,,0.0
3,3,80058654,Transformers: Robots in Disguise,,0.0
4,4,80125979,#realityhigh,fernandolebrija fernandolebrija fernandolebrija,0.0
5,5,80163890,Apaches,,0.0


#### Categories

In [53]:
cosine_sim_categories = countVectorizer('categories')

In [54]:
get_recommendations('Stranger', cosine_sim_categories, 'categories')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,index,show_id,title,categories,scores
821,821,80987077,Signal,crimetvshows internationaltvshows koreantvshows,1.0
843,843,81167047,Tunnel,crimetvshows internationaltvshows koreantvshows,1.0
1121,1121,80198001,Strong Girl Bong-soon,crimetvshows internationaltvshows koreantvshows,1.0
1514,1514,80187302,Stranger,crimetvshows internationaltvshows koreantvshows,1.0
1567,1567,81087762,Abyss,crimetvshows internationaltvshows koreantvshows,1.0
1678,1678,80098047,Last,crimetvshows internationaltvshows koreantvshows,1.0
1952,1952,80214772,Bad Guys: Vile City,crimetvshows internationaltvshows koreantvshows,1.0
2243,2243,80094387,A Man Called God,crimetvshows internationaltvshows koreantvshows,1.0
2688,2688,80176866,Man to Man,crimetvshows internationaltvshows koreantvshows,1.0
2689,2689,80162114,My Little Baby,crimetvshows internationaltvshows koreantvshows,1.0


### Combine each description, actors, directors, genres similarity score to derive a total score

In [231]:
def improved_recommendations(title):
    df_actors = get_recommendations(title, cosine_sim_actors, 'actors').rename(columns={'scores': 'actor_score'})
    df_directors = get_recommendations(title, cosine_sim_director, 'directors').rename(columns={'scores': 'director_score'})
    df_categories = get_recommendations(title, cosine_sim_categories, 'categories').rename(columns={'scores': 'category_score'})
    df_description = get_recommendations(title, cosine_similarities, 'description').rename(columns={'scores': 'description_score'})
    data_frames = [df_actors, df_directors, df_categories, df_description]
    df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['index', 'title', 'show_id'], how='outer'), data_frames)
    df_merged = df_merged.fillna(0)
    df_merged['scores'] = df_merged['actor_score'] + df_merged['director_score'] + df_merged['category_score']*0.25 + df_merged['description_score'] 
    df_merged = df_merged.sort_values('scores', ascending=False).reset_index(drop=True).head(10)
    
    # extract image url
    for i,row in df_merged.iterrows():
        image = df_photosAll[(df_photosAll['show_id'] == row.show_id)]['url']
        df_merged.loc[i, 'image'] = image
        
    return df_merged


In [224]:
inception = improved_recommendations('Inception')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [225]:
inception.head(10)

Unnamed: 0,index,show_id,title,actors,actor_score,directors,director_score,categories,category_score,description,description_score,scores
0,335,70024088,Brick,josephgordon-levitt josephgordon-levitt noraze...,0.357244,0,0.0,0,0.0,0,0.0,0.357244
1,4049,80141928,Before the Flood,leonardodicaprio leonardodicaprio,0.340997,0,0.0,0,0.0,0,0.0,0.340997
2,2696,70202141,50/50,josephgordon-levitt josephgordon-levitt sethro...,0.309612,0,0.0,0,0.0,0,0.0,0.309612
3,3504,60029363,Dragonheart,seanconnery seanconnery dennisquaid dennisquai...,0.029348,0,0.0,action&adventure sci-fi&fantasy,0.904534,0,0.0,0.255482
4,3524,70131314,Inception,0,0.0,0,0.0,action&adventure sci-fi&fantasy thrillers,1.0,0,0.0,0.25
5,276,81023636,Time Trap,0,0.0,0,0.0,action&adventure sci-fi&fantasy,0.904534,0,0.0,0.226134
6,3084,70114342,The Book of Eli,0,0.0,0,0.0,action&adventure sci-fi&fantasy,0.904534,0,0.0,0.226134
7,2675,80176756,Dragonheart: Battle for the Heartfire,0,0.0,0,0.0,action&adventure sci-fi&fantasy,0.904534,0,0.0,0.226134
8,2302,70111265,Hulk Vs.,0,0.0,0,0.0,action&adventure sci-fi&fantasy,0.904534,0,0.0,0.226134
9,1997,80176592,Revolt,0,0.0,0,0.0,action&adventure sci-fi&fantasy,0.904534,0,0.0,0.226134


In [120]:
matrix = improved_recommendations('The Matrix')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [212]:
matrix.head(10)

Unnamed: 0,index,show_id,title,actors,actor_score,directors,director_score,categories,category_score,description,description_score,scores
0,1423,60027695,The Matrix Reloaded,keanureeves keanureeves laurencefishburne laur...,0.683741,lillywachowski lanawachowski lillywachowski la...,1.0,action&adventure sci-fi&fantasy,1.0,"The crew must protect Zion, the last outpost s...",0.108047,2.041787
1,1424,60031303,The Matrix Revolutions,keanureeves keanureeves laurencefishburne laur...,0.675211,lillywachowski lanawachowski lillywachowski la...,1.0,action&adventure sci-fi&fantasy,1.0,0,0.0,1.925211
2,5064,70301367,Jupiter Ascending,0,0.0,lanawachowski lillywachowski lanawachowski lil...,0.967742,0,0.0,0,0.0,0.967742
3,5131,70248183,Cloud Atlas,0,0.0,lillywachowski lanawachowski tomtykwer lillywa...,0.692763,0,0.0,0,0.0,0.692763
4,2675,80176756,Dragonheart: Battle for the Heartfire,0,0.0,0,0.0,action&adventure sci-fi&fantasy,1.0,0,0.0,0.25
5,3085,70021661,The Brothers Grimm,0,0.0,0,0.0,action&adventure sci-fi&fantasy,1.0,0,0.0,0.25
6,3084,70114342,The Book of Eli,0,0.0,0,0.0,action&adventure sci-fi&fantasy,1.0,0,0.0,0.25
7,3569,60029154,Terminator 3: Rise of the Machines,0,0.0,0,0.0,action&adventure sci-fi&fantasy,1.0,0,0.0,0.25
8,3519,11819467,Godzilla,0,0.0,0,0.0,action&adventure sci-fi&fantasy,1.0,0,0.0,0.25
9,3517,70044594,Ghost Rider,0,0.0,0,0.0,action&adventure sci-fi&fantasy,1.0,0,0.0,0.25


### Extract thumbnails of movie or TV show posters

In [240]:
def getImagesFromFlixable(show_id):

    URL = 'https://flixable.com/title/' + str(show_id) + '/'
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.findAll("img")
    
    return {'show_id':show_id, 'url': results[0]['data-src']}

In [245]:
#dict_photos = []
#for i in df[5000:6234].show_id:
    #dict_photos.append(getImagesFromFlixable(i))
#df_photosAll = pd.DataFrame(dict_photos)

In [246]:
#outfile = open('./data/photo_url_6000.pkl','wb')
#pickle.dump(df_photosAll, outfile)
#outfile.close()