In [76]:
import nltk
from nltk.tokenize import word_tokenize 
from nltk.chunk import *
from nltk.chunk.util import *
from nltk.chunk.regexp import *
import numpy as np
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import itertools

### Code

In [82]:
grammar = """
    NP:    {<DT><WP><VBP>*<RB>*<VBN><IN><NN>}
           {<NN|NNS|NNP|NNPS><IN>*<NN|NNS|NNP|NNPS>+}
           {<JJ>*<NN|NNS|NNP|NNPS><CC>*<NN|NNS|NNP|NNPS>+}
           {<JJ>*<NN|NNS|NNP|NNPS>+}
    """


def extract_titles(movie_soup):
    title = movie_soup.find('meta', {'name': 'title'})
    return title['content']


def best_worst(reviews):
    worst = reviews.index(min(reviews))

    best = reviews.index(max(reviews))

    return worst, best


def extract_reviews(movie_soup):
    user_review_ratings = [
        tag.previous_element
        for tag in movie_soup.find_all('span', attrs={'class': 'point-scale'})
    ]

    worst, best = best_worst(list(map(int, user_review_ratings)))

    user_review_list = movie_soup.find_all('a', attrs={'class': 'title'})

    w_review_tag = user_review_list[worst]
    b_review_tag = user_review_list[best]

    w_review_link = 'https://www.imdb.com' + w_review_tag['href']
    b_review_link = 'https://www.imdb.com' + b_review_tag['href']

    return w_review_link, b_review_link


def extract_review_text(review_soup):
    tag = review_soup.find('div', attrs={'class': 'text show-more__control'})
    return tag.getText()


def sentence_tokenize(text):
    sentences = nltk.sent_tokenize(text)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    sentences = [get_np_tags(sent) for sent in sentences]
    return sentences


def get_np_tags(sentence):
    nps = []

    cp = nltk.RegexpParser(grammar)
    tree = cp.parse(sentence)

    for subtree in tree.subtrees():
        if subtree.label() == 'NP':
            t = subtree
            t = ' '.join(word for word, tag in t.leaves())
            nps.append(t)

    return nps

### 1.	Compile a list of static links (permalinks) to individual user movie reviews from one particular website. This will be your working dataset for this assignment, as well as for assignments 7 and 8, which together will make up your semester project.   

a) it does not matter if you use a crawler or if you manually collect the links, but you will need at least 100 movie review links. Note that, as of this writing, the robots.txt file of IMDB.com allows the crawling of user reviews.

b) Each link should be to a web page that has only one user review of only one movie, e.g., the user review permalinks on the 
IMDB site.

c) Choose reviews of movies that are all in the same genre, e.g., sci-fi, mystery, romance, superhero, etc.  

d) Make sure your collection includes reviews of several movies in your chosen genre and that it includes a mix of negative and positive reviews. 



***Step 1: Get some Movies from IMBD***

In [5]:
url = '''https://www.imdb.com/search/title/?title_type=feature&user_rating=6.0,10.0
&num_votes=50000,&genres=thriller&view=simple&sort=user_rating,desc&count=250'''
response = requests.get(url, verify=False)
print(response)
soup = BeautifulSoup(response.text)



<Response [200]>


***Step 2: Get urls***

In [9]:
movie_titles = [
    tag.attrs['href'] for tag in soup.findAll('a', attrs={'class': None})
    if tag.attrs['href'].startswith('/title') & tag.attrs['href'].endswith('/')
]

***Step 3: Get review urls***

In [83]:
movie_urls = [
    'https://www.imdb.com' + x + 'reviews' for x in set(movie_titles)
]

In [84]:
assert len(movie_urls) >= 100

***Step 4: Convert to BS4***

In [13]:
movie_pages = [
    BeautifulSoup(requests.get(url, verify=False).text) for url in movie_urls 
]























***Step 5: Get Titles***

In [38]:
titles = [extract_titles(x) for x in movie_pages]
titles

['Joker (2019) - IMDb',
 'The Machinist (2004) - IMDb',
 'The Night of the Hunter (1955) - IMDb',
 'Mission: Impossible - Fallout (2018) - IMDb',
 'Run Lola Run (1998) - IMDb',
 'The Hunt for Red October (1990) - IMDb',
 'Oldboy (2003) - IMDb',
 'The Fugitive (1993) - IMDb',
 'Mississippi Burning (1988) - IMDb',
 'The Dark Knight (2008) - IMDb',
 'The Others (2001) - IMDb',
 'Dirty Harry (1971) - IMDb',
 'Drishyam (2015) - IMDb',
 'Gangs of Wasseypur (2012) - IMDb',
 'Night of the Living Dead (1968) - IMDb',
 'Jaws (1975) - IMDb',
 'Knives Out (2019) - IMDb',
 'Baby (2015) - IMDb',
 'Bridge of Spies (2015) - IMDb',
 'Eastern Promises (2007) - IMDb',
 'Gattaca (1997) - IMDb',
 'Sholay (1975) - IMDb',
 'Psycho (1960) - IMDb',
 'John Wick: Chapter 2 (2017) - IMDb',
 'Victoria (2015) - IMDb',
 'Paprika (2006) - IMDb',
 'The Manchurian Candidate (1962) - IMDb',
 'Elite Squad 2: The Enemy Within (2010) - IMDb',
 'The Girl with the Dragon Tattoo (2011) - IMDb',
 'Rebecca (1940) - IMDb',
 'Per

***Step 5: Get Review URLS***

In [43]:
reviews = [extract_reviews(x) for x in movie_pages]
reviews

[('https://www.imdb.com/review/rw5408799/',
  'https://www.imdb.com/review/rw5159304/'),
 ('https://www.imdb.com/review/rw1103303/',
  'https://www.imdb.com/review/rw1520492/'),
 ('https://www.imdb.com/review/rw0055730/',
  'https://www.imdb.com/review/rw0055732/'),
 ('https://www.imdb.com/review/rw4348377/',
  'https://www.imdb.com/review/rw5150733/'),
 ('https://www.imdb.com/review/rw0941618/',
  'https://www.imdb.com/review/rw0504859/'),
 ('https://www.imdb.com/review/rw0283693/',
  'https://www.imdb.com/review/rw1327896/'),
 ('https://www.imdb.com/review/rw2102274/',
  'https://www.imdb.com/review/rw1783091/'),
 ('https://www.imdb.com/review/rw1109390/',
  'https://www.imdb.com/review/rw0321769/'),
 ('https://www.imdb.com/review/rw0259572/',
  'https://www.imdb.com/review/rw0259639/'),
 ('https://www.imdb.com/review/rw2599771/',
  'https://www.imdb.com/review/rw5478826/'),
 ('https://www.imdb.com/review/rw1037586/',
  'https://www.imdb.com/review/rw0698894/'),
 ('https://www.imdb.c

***Step 5: Split up good and bad reviews***

In [47]:
bad_reviews = [
    extract_review_text(BeautifulSoup(requests.get(url[0], verify=False).text)) for url in reviews 
]























In [48]:
good_reviews = [
    extract_review_text(BeautifulSoup(requests.get(url[1], verify=False).text)) for url in reviews 
]























### 2: Extract noun phrase (NP) chunks from your reviews using the following procedure:

(a) In Python, use BeautifulSoup to grab the main review text from each link.  

(b) Next run each review text through a tokenizer, and then try to NP-chunk it with a shallow parser. 

(c) You probably will have too many unknown words, owing to proper names of characters, actors, and so on that are not in your working dictionary. Make sure the main names that are relevant to the movies in your collection of reviews are added to the working lexicon, and then run the NP chunker again.

(d)	Output all the chunks in a single list for each review, and submit that output for this assignment. Also submit a brief written summary of what you did (describe your selection of genre, your source of reviews, how many you collected, and by what means).


***Get Everything into a Dataframe***

In [60]:
all_data = list(zip(titles, movie_titles, good_reviews, bad_reviews, [url[1] for url in reviews], [url[0] for url in reviews]))

In [66]:
all_data = pd.DataFrame(all_data)

In [68]:
all_data.columns = pd.Index(['title', 'url', 'good_review', 'bad_review', 'good_url', 'bad_url'])

In [70]:
all_data.set_index(['title'], inplace=True)

In [71]:
all_data

Unnamed: 0_level_0,url,good_review,bad_review,good_url,bad_url
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Joker (2019) - IMDb,/title/tt0468569/,"Every once in a while a movie comes, that trul...",The recurring themes of so many of the 10 star...,https://www.imdb.com/review/rw5159304/,https://www.imdb.com/review/rw5408799/
The Machinist (2004) - IMDb,/title/tt0468569/,THE MACHINIST (Brad Anderson - Spain 2004).Chr...,I really wanted to like The Machinist. As this...,https://www.imdb.com/review/rw1520492/,https://www.imdb.com/review/rw1103303/
The Night of the Hunter (1955) - IMDb,/title/tt1375666/,I still hear the lullaby singing sweetly in my...,"This film is way ahead of its time, not only i...",https://www.imdb.com/review/rw0055732/,https://www.imdb.com/review/rw0055730/
Mission: Impossible - Fallout (2018) - IMDb,/title/tt1375666/,"The good: Wow. The music, the stunts, the acto...",If you scroll down through all these 10/10 rat...,https://www.imdb.com/review/rw5150733/,https://www.imdb.com/review/rw4348377/
Run Lola Run (1998) - IMDb,/title/tt6751668/,"Run Lola Run is a riveting, heart (and pavemen...",Lola Rennt is definitely in my top ten favouri...,https://www.imdb.com/review/rw0504859/,https://www.imdb.com/review/rw0941618/
...,...,...,...,...,...
Open Your Eyes (1997) - IMDb,/title/tt1535109/,"""Abre Los Ojos"" is one of the most astonishing...","Although the plot is confusing at times, a sec...",https://www.imdb.com/review/rw0494745/,https://www.imdb.com/review/rw0494744/
Charade (1963) - IMDb,/title/tt1527788/,Let's see: what we got here is one of the best...,Regina (Audrey Hepburn) comes Paris from a vac...,https://www.imdb.com/review/rw0080852/,https://www.imdb.com/review/rw2099872/
Andhadhun (2018) - IMDb,/title/tt1527788/,Many have reviewed Andhadhun and rated it high...,I had really enjoyed this movie. I can say thi...,https://www.imdb.com/review/rw4526401/,https://www.imdb.com/review/rw4379298/
The Wave (2008) - IMDb,/title/tt1216496/,This is a wonderful film with a superb screenp...,The Wave is like a bad soap opera at times wit...,https://www.imdb.com/review/rw1948400/,https://www.imdb.com/review/rw2432914/


In [75]:
all_data['good_review_chunks'] = all_data['good_review'].apply(sentence_tokenize)
all_data['bad_review_chunks'] = all_data['bad_review'].apply(sentence_tokenize)
all_data.head()

Unnamed: 0_level_0,url,good_review,bad_review,good_url,bad_url,good_review_chunks,bad_review_chunks
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Joker (2019) - IMDb,/title/tt0468569/,"Every once in a while a movie comes, that trul...",The recurring themes of so many of the 10 star...,https://www.imdb.com/review/rw5159304/,https://www.imdb.com/review/rw5408799/,"[[while, movie, impact], [Joaquin, performance...","[[themes, star reviews, lot, movie, audience],..."
The Machinist (2004) - IMDb,/title/tt0468569/,THE MACHINIST (Brad Anderson - Spain 2004).Chr...,I really wanted to like The Machinist. As this...,https://www.imdb.com/review/rw1520492/,https://www.imdb.com/review/rw1103303/,"[[MACHINIST, Brad Anderson, Spain, .Christian ...","[[Machinist], [film, previous effort, Session,..."
The Night of the Hunter (1955) - IMDb,/title/tt1375666/,I still hear the lullaby singing sweetly in my...,"This film is way ahead of its time, not only i...",https://www.imdb.com/review/rw0055732/,https://www.imdb.com/review/rw0055730/,"[[lullaby, head, hazy, dream, opening scene, b...","[[film, way, time, subject matter, cinematic s..."
Mission: Impossible - Fallout (2018) - IMDb,/title/tt1375666/,"The good: Wow. The music, the stunts, the acto...",If you scroll down through all these 10/10 rat...,https://www.imdb.com/review/rw5150733/,https://www.imdb.com/review/rw4348377/,"[[Wow], [music, stunts, actors, cities and lan...","[[ratings, movie, real reviews], [], [], [], [..."
Run Lola Run (1998) - IMDb,/title/tt6751668/,"Run Lola Run is a riveting, heart (and pavemen...",Lola Rennt is definitely in my top ten favouri...,https://www.imdb.com/review/rw0504859/,https://www.imdb.com/review/rw0941618/,"[[Run Lola Run, riveting, heart, pavement, epi...","[[Lola Rennt, ten favourites], [Anyone and eve..."


In [80]:
pd.concat([all_data.good_review_chunks, all_data.bad_review_chunks]).to_csv('np_chunks.csv')

### Results:

* Using IMDB as the source 250 movies were used to gather reviews via python (**requests and Beautiful Soup**).

* Search was limited to only titles in the **thriller** genre with review between **6 and 10** stars and **50k+** total votes.

* For each film, one positive and one negative review were selected. 

* An shallow parser (Regex) was then used to extract NP phrases from the body of each review.

* Further work on the regex parser/sentece tokenizer needs to be done as there are instances where punctuation is throwing off the tokenization.