In [1]:
import pandas as pd
import numpy as np
import nltk
import re

from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec, Phrases
from gensim.parsing.preprocessing import STOPWORDS as stop_words
from gensim.utils import simple_preprocess
from sklearn.feature_extraction import text
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer

In [2]:
pd.set_option('max_columns', None)

# import csv file 
df_moviedetails = pd.read_csv('export_moviedetails_full_v2.csv', sep=',', encoding='iso-8859-1', escapechar='\\')

# drop duplicate record based movie_url
df_moviedetails.drop_duplicates(subset ="movie_url", keep = False, inplace = True) 

# metascore and world wide gross have lot of missing data, while user rating is complete
# transform the user rating column into GOOD and BAD where GOOD: (>6), BAD: (<=6) 
df_moviedetails['rating-enc'] = np.where(df_moviedetails['user_rating'] >6 , "GOOD", "BAD")


In [3]:
# concatenate movie title and plot summary 
df_moviedetails["title_plot"] = df_moviedetails["movie_title"].astype(str).str.cat(df_moviedetails["plot_summary"].astype(str), sep=' ')

In [4]:
# create dataframe df with subset of columns for text analytics
df = df_moviedetails[["movie_url","movie_title","title_plot","rating-enc"]]

# instead of the default index, extract imdb movie ID from url and set it as index
df["movie_id"] = df["movie_url"].str.split("/").str[-2]
#df = df.set_index("movie_id")

# rename key columns' name
df = df.rename(columns={'title_plot': 'text','rating-enc': 'label'})

df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,movie_url,movie_title,text,label,movie_id
0,https://www.imdb.com/title/tt9016974/,Synchronic,Synchronic two new orleans paramedic life rip...,GOOD,tt9016974
1,https://www.imdb.com/title/tt4154796/,Avengers: Endgame,Avengers: Endgame after devastating event ofa...,GOOD,tt4154796
2,https://www.imdb.com/title/tt9608818/,The Friend,The Friend after receiving lifealtering news ...,GOOD,tt9608818
3,https://www.imdb.com/title/tt5363618/,Sound of Metal,Sound of Metal a heavymetal drummer life thro...,GOOD,tt5363618
4,https://www.imdb.com/title/tt8367814/,The Gentlemen,The Gentlemen an american expat try sell high...,GOOD,tt8367814
...,...,...,...,...,...
4995,https://www.imdb.com/title/tt4170186/,Beeba Boys,Beeba Boys with help recent recruit gang lead...,BAD,tt4170186
4996,https://www.imdb.com/title/tt4947084/,Anarkali,Anarkali the story naval officer fall love go...,GOOD,tt4947084
4997,https://www.imdb.com/title/tt5143700/,La guerre des tuques 3D,La guerre des tuques 3D when winter break arr...,GOOD,tt5143700
4998,https://www.imdb.com/title/tt4257950/,Russell Madness,Russell Madness when man inherits grandfather...,BAD,tt4257950


In [5]:
# check for null values in df
df.isnull().any()

movie_url      False
movie_title    False
text           False
label          False
movie_id       False
dtype: bool

In [6]:
"""
text preprocessing - convert to lower case, remove non-word characters, remove spaces from the start
tokenization, then remove stop words. Then save the processed words back to text field.  
"""

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize

stop_words = stopwords.words('english')
newStopWords = ['his', 'like', "she'll", 'than', 'also', 'only', "you're", 'through', 'about', 'themselves', "aren't", 'above', 'after', "that's", 'before', 'com', 'she', "she's", "who's", 'has', 'any', "didn't", "i'd", "we've", 'the', 'other', 'else', 'at', 'down', "here's", 'further', 'there', 'these', 'by', 'get', "they'll", 'no', "where's", "shouldn't", 'then', 'himself', 'hers', 'out', "we'll", 'an', 'should', 'under', "let's", 'what', 'if', "isn't", "he'll", 'or', "shan't", 'too', 'same', 'this', "hasn't", "haven't", 'me', 'had', "they've", 'could', 'all', 'some', 'into', 'he', 'until', 'again', 'http', 'k', "hadn't", "couldn't", "i'll", "we'd", 'between', 'ourselves', "when's", 'for', 'doing', 'nor', 'which', 'our', 'was', 'such', 'very', 'own', 'on', 'being', 'am', 'yours', 'would', 'my', 'once', "they'd", 'how', 'to', 'more', 'theirs', 'did', 'when', "can't", 'www', 'does', 'those', 'both', "mustn't", 'ought', "weren't", 'were', 'therefore', 'here', 'over', 'with', 'it', 'not', "you've", "i'm", 'hence', 'against', "she'd", 'her', 'their', "it's", 'can', 'having', 'of', 'they', 'have', 'in', 'itself', 'just', 'from', "how's", 'i', 'we', 'and', 'shall', 'few', 'since', 'whom', 'while', 'you', 'be', 'yourself', "what's", "you'll", 'but', 'yourselves', 'below', 'herself', "i've", 'why', 'during', "he'd", 'who', 'off', 'otherwise', 'been', 'that', "you'd", 'myself', 'because', 'up', "we're", 'as', "wasn't", 'your', "there's", 'him', 'a', 'ours', 'r', 'ever', 'where', "they're", 'are', 'is', "he's", "don't", "doesn't", 'cannot', 'each', 'its', 'them', 'however', 'so', "why's", 'most', "wouldn't", "won't", 'do']
stop_words.extend(newStopWords)
#lemmatizer = WordNetLemmatizer()
for index, row in df.iterrows():
    filter_sentence = ''
    sentence = str(row['text']).lower()
    # Cleaning the sentence with regex
    sentence = re.sub(r'[^\w\s]', '', sentence)
    # Tokenization
    words = nltk.word_tokenize(sentence)
    # Stopwords removal
    words = [w for w in words if not w in stop_words]
    # Lemmatization
    for words in words:
        filter_sentence = filter_sentence  + ' ' + str(words)
    
    df.loc[index, 'text'] = filter_sentence

df.head()

Unnamed: 0,movie_url,movie_title,text,label,movie_id
0,https://www.imdb.com/title/tt9016974/,Synchronic,synchronic two new orleans paramedic life rip...,GOOD,tt9016974
1,https://www.imdb.com/title/tt4154796/,Avengers: Endgame,avengers endgame devastating event ofavengers...,GOOD,tt4154796
2,https://www.imdb.com/title/tt9608818/,The Friend,friend receiving lifealtering news couple fin...,GOOD,tt9608818
3,https://www.imdb.com/title/tt5363618/,Sound of Metal,sound metal heavymetal drummer life thrown fr...,GOOD,tt5363618
4,https://www.imdb.com/title/tt8367814/,The Gentlemen,gentlemen american expat try sell highly prof...,GOOD,tt8367814


In [7]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec, Phrases


documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df['text'])]
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)


In [8]:
def movie_recommender(n):
    print('movies : ' + df.loc[[n]].movie_title.item())
    print('summary : ' + df.loc[[n]].text.item().strip())
    print('\n')
    print('searching similiar movies......') 
    print('\n')
    
    sims = model.docvecs.most_similar(n)
    
    for tagged in sims:
        print(str(tagged[0]) + '. ' + df.loc[[tagged[0]]].movie_title.item())
        print('summary : ' + df.loc[[tagged[0]]].text.item().strip())
        print('\n')
        

In [10]:
movie_recommender(2)

movies : The Friend
summary : friend receiving lifealtering news couple find unexpected support best friend put life hold move family home bringing impact much greater profound anyone imagined


searching similiar movies......


2042. The Death of Stalin
summary : death stalin moscow 1953 power nearly thirty year soviet dictator joseph vissarionovich stalin adrian mcloughlin take ill quickly dy member council ministers scramble power


3798. The Phenom
summary : phenom rookie pitcher undergoes psychotherapy overcome yip


2014. It
summary : summer 1989 group bullied kid band together destroy shapeshifting monster disguise clown prey child derry small maine town


2482. Unhinged
summary : unhinged four american best friend decide take back road travelling wedding england way deadly secret force girl stranded wood discover house occupied miss perkins


4828. Ich bin dann mal weg
summary : ich bin dann mal weg based book ich bin dann mal weg hape kerkeling author describes journey way st 

  
  This is separate from the ipykernel package so we can avoid doing imports until
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':
