In [10]:
import pandas as pd
import numpy as np

import spacy

from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter

In [11]:
nlp = spacy.load("en_core_web_lg")

In [12]:
#function extracts the 10 most common verbs and nouns from each collection of user_plots 
def extract(doc):
    key_words = []
    stopwords = list(STOP_WORDS)
    pos_tag = ['NOUN', 'VERB']

    for token in doc:
        if token.text in stopwords or token.text in punctuation:
            continue
        if token.pos_ in pos_tag:
            key_words.append(token.text)
            
    freq_word = Counter(key_words)
    freq_tups = freq_word.most_common(n=10)
    
    new_keywords = [word[0] for word in freq_tups]
    
    freq_string = ''
    for word in new_keywords:
        freq_string += word + ' ' 
    
    return freq_string

In [56]:
movies = pd.read_csv('../data/genre_groups/master_genre_df.csv')
movies.head(2)

Unnamed: 0,imdb_id,title,overviews,join_director,join_cast,vote_average,vote_count,genre,user_plots,synopsis,...,Fantasy,Mystery,Thriller,Western,Family,Sci-Fi,Romance,Film-Noir,Musical,Music
0,tt0111161,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,FrankDarabont,TimRobbins MorganFreeman BobGunton,8.7,18845,Drama,['Two imprisoned men bond over a number of yea...,"In 1947, Andy Dufresne (Tim Robbins), a banker...",...,0,0,0,0,0,0,0,0,0,0
1,tt0068646,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",FrancisFordCoppola,AlPacino MarlonBrando JamesCaan,8.7,14225,Crime Drama,"[""An organized crime dynasty's aging patriarch...","In late summer 1945, guests are gathered for t...",...,0,0,0,0,0,0,0,0,0,0


In [55]:
#list of original user_plots
user_plots = [plot for plot in movies['user_plots']]

#list of movie titles
titles = [movie for movie in movies['title']]

In [58]:
#creating list of extracted keywords for each movie
major_keywords = []
for plot in user_plots:
    plot_keywords = extract(nlp(plot))
    major_keywords.append(plot_keywords)

In [61]:
#creating dictionary for easy look-up of extracted keywords
plot_dict = {}

for i in range(len(titles)):
    title = titles[i]
    key_words = major_keywords[i]
    plot_dict[title] =  key_words

In [62]:
#example of extracted keywords for 'The Godfather'
plot_dict['The Godfather']

'family son head wedding business war life wants drugs ways '

In [63]:
#example of extracted keywords for 'Superbad'
plot_dict['Superbad']

'party alcohol school plan friends seniors girls college sex graduation '

In [67]:
#example of extracted keywords for 'The Shawshank Redemption'
plot_dict['The Shawshank Redemption']

'prison life murder wife imprisoned number years banker way befriends '

In [68]:
#creating column for extracted keywords
movies['user_plot_keywords'] = major_keywords

In [110]:
movies.to_csv('../data/genre_groups/x_final_df.csv')

In [109]:
#creating recommender function
#receives an 'entry' which will be a string of words entered by the user and the genre of movie the user wants returned
def recommend_me(entry, genre):
    df = movies.loc[movies[genre] == 1, :]
    keywords = [words for words in df['user_plot_keywords']]
    scores = [float(nlp(entry).similarity(nlp(word))) for word in keywords]
    
    df['similarity'] = scores
    
    return df[['title', 'similarity']].sort_values(by='similarity', ascending=False).head(10)

In [112]:
recommend_me('church murder', 'Horror')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['similarity'] = scores


Unnamed: 0,title,similarity
2903,The Exorcism of Emily Rose,0.769531
2682,From Hell,0.760454
4219,The Amityville Horror,0.760186
3349,Dead & Buried,0.755772
6127,Exorcist: The Beginning,0.746312
265,The Exorcist,0.742461
5671,3 from Hell,0.739614
2729,The Stepfather,0.737729
6112,Dominion: Prequel to the Exorcist,0.732944
4870,The Amityville Horror,0.730897


In [121]:
recommend_me('wedding holiday trouble', 'Comedy')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['similarity'] = scores


Unnamed: 0,title,similarity
2356,Wedding Crashers,0.82842
5977,Bachelorette,0.81745
5726,Bride Wars,0.812469
567,The Hangover,0.792323
4614,27 Dresses,0.789437
6165,The Romantics,0.789399
1374,Monsoon Wedding,0.785287
5383,Jumping the Broom,0.779602
2502,Ready or Not,0.777858
1055,Palm Springs,0.776335


In [122]:
recommend_me('spy poker', 'Action')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['similarity'] = scores


Unnamed: 0,title,similarity
231,Casino Royale,0.657612
1929,Maverick,0.651979
6513,Spy Kids 3-D: Game Over,0.594798
3525,The Good Thief,0.588413
6464,Home Alone 3,0.58797
3834,Rat Race,0.584542
4527,The Girl in the Spider's Web,0.573776
5541,Wild Card,0.570198
4968,Ride Along 2,0.567726
5695,Pixels,0.566791


In [129]:
recommend_me('killed family revenge', 'Action')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['similarity'] = scores


Unnamed: 0,title,similarity
4402,Shanghai Knights,0.888823
35,Léon: The Professional,0.872891
4042,Taken 2,0.872007
4352,War,0.860872
2477,Conan the Barbarian,0.852384
5950,The Rhythm Section,0.836987
3815,The Punisher,0.832479
6361,Double Team,0.830752
5036,Punisher: War Zone,0.830013
2428,Patriot Games,0.829424


In [36]:
df = pd.read_csv('../data/genre_groups/x_final_df.csv')

In [4]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,imdb_id,title,overviews,join_director,join_cast,vote_average,vote_count,genre,user_plots,...,Mystery,Thriller,Western,Family,Sci-Fi,Romance,Film-Noir,Musical,Music,user_plot_keywords
0,0,tt0111161,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,FrankDarabont,TimRobbins MorganFreeman BobGunton,8.7,18845,Drama,['Two imprisoned men bond over a number of yea...,...,0,0,0,0,0,0,0,0,0,prison life murder wife imprisoned number year...
1,1,tt0068646,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",FrancisFordCoppola,AlPacino MarlonBrando JamesCaan,8.7,14225,Crime Drama,"[""An organized crime dynasty's aging patriarch...",...,0,0,0,0,0,0,0,0,0,family son head wedding business war life want...
2,2,tt0468569,The Dark Knight,Batman raises the stakes in his war on crime. ...,ChristopherNolan,ChristianBale HeathLedger MichaelCaine,8.5,24993,Action Crime Drama,['When the menace known as the Joker wreaks ha...,...,0,0,0,0,0,0,0,0,0,known mob chaos crime people Begins mastermind...
3,3,tt0071562,The Godfather: Part II,In the continuing saga of the Corleone crime f...,FrancisFordCoppola,AlPacino RobertDeNiro RobertDuvall,8.6,8488,Crime Drama,['The early life and career of Vito Corleone i...,...,0,0,0,0,0,0,0,0,0,family business life story son crime saga atte...
4,4,tt0050083,12 Angry Men,The defense and the prosecution have rested an...,SidneyLumet,MartinBalsam JohnFiedler LeeJ.Cobb,8.5,5539,Crime Drama,['A jury holdout attempts to prevent a miscarr...,...,0,0,0,0,0,0,0,0,0,jury case man murder jurors verdict doubt boy ...
5,5,tt0167260,The Lord of the Rings: The Return of the King,Aragorn is revealed as the heir to the ancient...,PeterJackson,ElijahWood IanMcKellen LivTyler,8.5,17557,Action Adventure Drama,"[""Gandalf and Aragorn lead the World of Men ag...",...,0,0,0,0,0,0,0,0,0,Ring forces army earth men battle continue hel...
6,6,tt0110912,Pulp Fiction,"A burger-loving hit man, his philosophical par...",QuentinTarantino,JohnTravolta SamuelL.Jackson UmaThurman,8.5,21015,Crime Drama,"['The lives of two mob hitmen, a boxer, a gang...",...,0,0,0,0,0,0,0,0,0,boxer men boss time wife hit town mob fight end
7,7,tt0108052,Schindler's List,The true story of how businessman Oskar Schind...,StevenSpielberg,LiamNeeson BenKingsley RalphFiennes,8.6,11331,Biography Drama History,['In German-occupied Poland during World War I...,...,0,0,0,0,0,0,0,0,0,factory story businessman occupied save saved ...
8,8,tt1375666,Inception,"Cobb, a skilled thief who commits corporate es...",ChristopherNolan,LeonardoDiCaprio KenWatanabe JosephGordon-Levitt,8.3,28959,Action Adventure Sci-Fi,['A thief who steals corporate secrets through...,...,0,0,0,0,1,0,0,0,0,dreams mind team dream inception secrets idea ...
9,9,tt0137523,Fight Club,A ticking-time-bomb insomniac and a slippery s...,DavidFincher,EdwardNorton BradPitt HelenaBonhamCarter,8.4,21642,Drama,['An insomniac office worker and a devil-may-c...,...,0,0,0,0,0,0,0,0,0,insomniac life fight support group groups soap...


In [5]:
plots = [x for x in df['user_plot_keywords']]

In [9]:
plots[14]

'computer hacker truth life night stranger leads underworld discovers knows '

In [18]:
extract(nlp(df['synopsis'][1]))

'family meeting men business tells father killed wedding assassination daughter '

In [39]:
gs = 'Crime Drama Mystery'

genres = []

for g in gs.split(' '):
    genres.append(g)

genres

['Crime', 'Drama', 'Mystery']