In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import itertools

pd.options.display.max_colwidth=500

In [None]:
def getSoup(url):
  response = requests.get(url)
  soup = BeautifulSoup(response.text , 'html.parser')
  return soup

In [None]:
# API call to select:
## feature films
## which are rated atleast 4.0
## having atleast 50,000 votes
## in the Thriller genre
## sorted by user rating
## limit to 250 movies
url = 'https://www.imdb.com/search/title/?title_type=feature&user_rating=4.0,10.0&num_votes=50000,&genres=thriller&view=simple&sort=user_rating,desc&count=250'

# get the soup object for main api url
movies_soup = getSoup(url)

In [None]:
movie_tags = movies_soup.find_all('a' , attrs = {'class' : None})

movie_tags = [tags.attrs['href'] for tags in movie_tags if tags.attrs['href'].startswith('/title') & tags.attrs['href'].endswith('/')  ]

movie_tags = list(dict.fromkeys(movie_tags))

print("There are a total of " + str(len(movie_tags)) + " movie titles")
print("Displaying 10 titles")
movie_tags[:10]


There are a total of 250 movie titles
Displaying 10 titles


['/title/tt0468569/',
 '/title/tt1375666/',
 '/title/tt6751668/',
 '/title/tt0114369/',
 '/title/tt0102926/',
 '/title/tt0482571/',
 '/title/tt0407887/',
 '/title/tt0114814/',
 '/title/tt0110413/',
 '/title/tt0054215/']

In [None]:
base_url = 'https://www.imdb.com/'
movie_links = [base_url + tag + 'reviews' for tag in movie_tags]

print("There are a total of " + str(len(movie_links)) + " movies with user reviews")
print("Displaying 10 user reviews links")
movie_links[:10]

There are a total of 250 movies with user reviews
Displaying 10 user reviews links


['https://www.imdb.com//title/tt0468569/reviews',
 'https://www.imdb.com//title/tt1375666/reviews',
 'https://www.imdb.com//title/tt6751668/reviews',
 'https://www.imdb.com//title/tt0114369/reviews',
 'https://www.imdb.com//title/tt0102926/reviews',
 'https://www.imdb.com//title/tt0482571/reviews',
 'https://www.imdb.com//title/tt0407887/reviews',
 'https://www.imdb.com//title/tt0114814/reviews',
 'https://www.imdb.com//title/tt0110413/reviews',
 'https://www.imdb.com//title/tt0054215/reviews']

In [None]:
movie_soups = [getSoup(link) for link in movie_links]


In [None]:
def getReviews(soup):
    '''Function returns a negative and positive review for each movie.'''
    
    
    # get the review tags
    user_review_list = soup.find_all('a', attrs={'class':'title'})

    

    user_review_links = ["https://www.imdb.com" + user_review_tag['href'] for user_review_tag in user_review_list]

    
    return user_review_links


In [None]:
movie_review_links  = [getReviews(movie_soup) for movie_soup in movie_soups]


In [None]:
def getReviewText(review_url):
    '''Returns the user review text given the review url.'''
    
    # get the review_url's soup
    soup = getSoup(review_url)
    
    # find div tags with class text show-more__control
    tag = soup.find('div', attrs={'class': 'text show-more__control'})
    
    return tag.getText()

def getReviewRating(review_url):
    '''Returns the user review rating given the review url.'''

    # get the review_url's soup
    soup = getSoup(review_url)
    
    # find div tags with class text show-more__control
    rating = soup.find_all('span', attrs={'class': None})
    
    return rating[1].get_text()




def getMovieTitle(review_url):
    '''Returns the movie title from the review url.'''
    
    # get the review_url's soup
    soup = getSoup(review_url)
    
    # find h1 tag
    tag = soup.find('h1')
    
    return list(tag.children)[1].getText()


In [None]:
count = 0
review_texts = []
movie_titles  = []
reviewer_ratings = []


for user_review_link in user_review_links:
    if '\n' not in getReviewRating(user_review_link):
      review_texts.append(getReviewText(user_review_link))
      reviewer_ratings.append(getReviewRating(user_review_link))
      movie_titles.append(getMovieTitle(user_review_link))
      count += 1
      if count % 100 == 0:
        print(str(count))
        


0


In [347]:
df = pd.DataFrame({'movie': movie_titles, 'user_review': review_texts, 'user_rating': reviewer_ratings})

In [352]:
df.head()

Unnamed: 0,movie,user_review,user_rating
0,The Dark Knight,"Confidently directed, dark, brooding, and packed with impressive action sequences and a complex story, The Dark Knight includes a career-defining turn from Heath Ledger as well as other Oscar worthy performances, TDK remains not only the best Batman movie, but comic book movie ever created.",10
1,The Dark Knight,Best movie ever. Heath ledger's work is phenomenal no words......,10
2,The Dark Knight,"Totally one of the greatest movie titles ever made. Everything was great, filming, acting, story. Nothing to complain about",10
3,The Dark Knight,"It is just what you want for the best movie. Great story great acting, thrilling twist.\nJust watched Joker in 2019, I just has to come back and give dark knight a 10. And thanks to Heath Ledger for the exceptional performs.",10
4,The Dark Knight,"I got to see The Dark Knight on Wednesday night, the reason though why I'm writing this movie comment this late is because I didn't wanna just jump and say this movie was awesome, I wanted to think it through, still today, I can't stop thinking about this movie! The Dark Knight lives up to it's hype and goes beyond it, this is the Batman movie that goes where no other Batman movie has gone before. It gave us a real sold story, we are finally told why the villains are the way they are, how al...",10


In [353]:
%cd '/content/drive/MyDrive/Movie Rating Prediction/Sentiment Analysis'

/content/drive/MyDrive/Movie Rating Prediction/Sentiment Analysis


In [354]:
# save the dataframe to a csv file.
df.to_csv('userReviews.csv', index=False)

# pickle the dataframe
df.to_pickle('userReviews.pkl')

# to validate
#temp = pd.read_csv('userReviews.csv')
temp = pd.read_pickle('userReviews.pkl')

In [357]:
temp.shape

(1675, 3)