In [4]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from requests import get
import re
import unicodedata

base_url = 'https://www.imdb.com'

scraping https://www.imdb.com for movie attributes: director, actors, movie critic rating, user rating, movie keywords provided by imdb, 

In [None]:
#scraping the movie links from my imdb seen movies list

section_url = 'https://www.imdb.com/list/ls001933214/?sort=list_order,asc&st_dt=&mode=detail&page=1'
#behind section_url there is a list of over 1000 movies I've seen. the list is on my private imdb account

def next_section_url(url): #function for extracting section link for the next page
    section_url = ''
    imdb_seen = get(url, headers = {"Accept-Language": "en-US, en;q=0.5"}).text
    soup = BeautifulSoup(imdb_seen, "html.parser")
    pagination = soup.find('div', attrs={'class':'list-pagination'})
    if pagination.find('a', 'flat-button lister-page-next next-page') != None:
        section_url = base_url + pagination.find('a', 'flat-button lister-page-next next-page')['href']
    return section_url

def movie_links(url): #extracting the links for each movie's imdb page
    imdb_seen = get(url, headers = {"Accept-Language": "en-US, en;q=0.5"}).text
    soup = BeautifulSoup(imdb_seen, "html.parser")
    sub_list = soup.find('div', attrs={'class':'lister list detail sub-list'})
    list_links = [base_url + dd.a['href'] for dd in sub_list.findAll('div', attrs={'class':'lister-item mode-detail'})]
    return list_links
    

In [None]:
movie_links [] #appending the movie links list with seen movies links:

for i in range(11): #11 is for the eleven pages in my list of movies
    movie_links = movie_links + movie_links(section_url)
    section_url = next_section_url(section_url) 
    
#saving the list of links
import pickle
pickling_on = open("seen_movies_links.pickle","wb")
pickle.dump(movie_links, pickling_on)

In [None]:
#going to be scraping the information for each movie from the movie_links list

def get_keywords(url): #extracting the keyword list that imdb provides
    imdb_kw = get(url, headers = {"Accept-Language": "en-US, en;q=0.5"}).text
    soup = BeautifulSoup(imdb_kw, "html.parser")
    kw_base = soup.find_all('td', 'soda sodavote')
    all_kw = [kw_base[i]['data-item-keyword'] for i in range(len(kw_base))]
    return all_kw

def get_actors(soup): #extracting either the first 10 actors from the actors list, or less if the movie had less
    actor_count = len(soup.find('table', 'cast_list').find_all('tr'))-1
    if actor_count<11:
        actor_list = [soup.find('table', 'cast_list').find_all('tr')[k+1].text.split('\n\n')[2][1:-2] for k in range(actor_count)]
    else:
        actor_list = [soup.find('table', 'cast_list').find_all('tr')[k+1].text.split('\n\n')[2][1:-2] for k in range(10)]
    actors = ','.join(actor_list)
    return actors


def movie_info(movie_url):
    imdb_m = get(movie_url, headers = {"Accept-Language": "en-US, en;q=0.5"}).text #with the headers parameter I am trying to avoid movie title translations
    soup_m = BeautifulSoup(imdb_m, "html.parser")
    title_and_year = soup_m.find('title').text[:-7]
    user_rating=soup_m.find('div', 'ratingValue').text[1:4]
    critic_rating=soup_m.find('div', 'titleReviewBar').contents[1].text[3:5]
    director = soup_m.find('div', 'credit_summary_item').text[11:]
    actors = get_actors(soup_m)
    keywords_link = base_url + soup_m.find('nobr').find('a')['href']
    keywords_list = get_keywords(keywords_link)
    keywords = ','.join(keywords_list)
    joined_info = {'title':title_and_year, 'user_rating':user_rating, 'critic_rating':critic_rating, 'director':director, 'actors':actors, 'keywords':keywords}
    return joined_info

In [None]:
movie_attr=[]

#scraping movie attributes and appending to the list:
for i in range(len(movie_links)):
    informacija = movie_info(movie_links[i])
    movie_attr.append(informacija)

movie_frame = pd.DataFrame.from_dict(movie_attr)

In [None]:
#doing some cleaning on the gathered dataset:

movie_frame['director'] = movie_frame['director'].apply(lambda x: x.strip()) #eliminating the whitespace chars
movie_frame['director'] = movie_frame['director'].apply(lambda x: x.split('(')[0]) #leaving only the first and main director of the movie
movie_frame['director'] = movie_frame['director'].apply(lambda x: x.split('|')[0]) 

#converting numeric columns from string to integer/float
#also normalizing numbers since for one column max is 10 and another max is 100 but the information type is the same
movie_frame['critic_rating'] = pd.to_numeric(movie_frame['critic_rating'], errors='coerce')
movie_frame['user_rating'] = pd.to_numeric(movie_frame['user_rating'], errors='coerce')
movie_frame['user_rating'] = movie_frame['user_rating'].apply(lambda x: x*10)
#adding columns with values of the difference between user given rating and movie critic rating
movie_frame['rating_diff_abs'] = 0
movie_frame['rating_diff_abs'] = (movie_frame['user_rating']-movie_frame['critic_rating']).abs()
#absolute difference
movie_frame['rating_diff'] = 0
movie_frame['rating_diff'] = movie_frame['user_rating']-movie_frame['critic_rating']

#splitting title and and year and creating new column for the year that the movie was released
movie_frame['year'] = 'y'
movie_frame['year'] = movie_frame['title'].apply(lambda x: x.split('(')[1])
movie_frame['year'] = movie_frame['year'].apply(lambda x: x[0:-1])
#eliminating the year part from the title
movie_frame['title'] = movie_frame['title'].apply(lambda x: x.split('(')[0][:-1]) 


In [8]:
#this below is a list of movies that are of preferece in regards to the whole movies list
positive_sentiment = pd.read_csv('positive_names.csv', sep='delimiter', engine='python')
true_outcome = np.array(positive_sentiment['Title'])

#create a new column in the movie data frame for the clasification of the movie. True-if the movie is of preference and False-if it is not
movie_frame['outcome'] = 0
movie_frame['outcome'] = movie_frame['title'].isin(true_outcome)

#save the dataframe:
movie_frame.to_csv('movie_frame.csv', index=False)

### Scraping the web for movie reviews from review sites: 
https://www.rogerebert.com 
https://3brothersfilm.com 
http://www.reelviews.net https://www.nytimes.com/reviews/movies https://brightlightsfilm.com http://www.urbancinefile.com.au 
http://alibi.com
https://moviemet.com

In [3]:
#scraping each of the review sites if there's a link to it in in the imdb page of the movie

class ReviewScraping():
    def __init__(self):
        self.baseurl='https://www.imdb.com'

    def the_reviews(self, movie_url):
        imdb_m = get(movie_url, headers = {"Accept-Language": "en-US, en;q=0.5"}).text 
        soup_m = BeautifulSoup(imdb_m, "html.parser")
        title = soup_m.find('title').text[:-7] #movie title
        rev=soup_m.find('div','titleReviewBarItem titleReviewbarItemBorder').find_all('a')
        if len(rev)>1:
            critics_revs_soup = BeautifulSoup(get(movie_url+rev[1]['href']).text, "html.parser")
            
            ebert_f = critics_revs_soup.find_all('a', text = [re.compile(p) for p in ['Roger Ebert', 'Rogerebert.com', 'RogerEbert.com']])
            ebert_links = [self.baseurl + i['href'] for i in ebert_f]
            brothers_f = critics_revs_soup.find_all('a', text = [re.compile(p) for p in ['3 Brothers']])
            brothers_links = [self.baseurl + i['href'] for i in brothers_f]
            reel_f = critics_revs_soup.find_all('a', text = [re.compile(p) for p in ['ReelViews']])
            reel_links = [self.baseurl + i['href'] for i in reel_f]
            times_f = critics_revs_soup.find_all('a', text = [re.compile(p) for p in ['New York Times']])
            times_links = [self.baseurl + i['href'] for i in times_f]
            bl_f = critics_revs_soup.find_all('a', text = [re.compile(p) for p in ['Bright Lights Film Journal']])
            bright_lights_links = [self.baseurl + i['href'] for i in bl_f]
            alibi_f = critics_revs_soup.find_all('a', text = [re.compile(p) for p in ['Alibi.com', 'Alibi', 'alibi.com']])
            alibi_links = [self.baseurl + i['href'] for i in alibi_f]
            cinefile_f = critics_revs_soup.find_all('a', text = [re.compile(p) for p in ['Urban Cinefile']])
            cinefile_links = [self.baseurl + i['href'] for i in cinefile_f]
            metro = critics_revs_soup.find_all('a', text = [re.compile(p) for p in ['Movie Metropolis']])
            metro_links = [self.baseurl + i['href'] for i in metro]
            
            ebert_reviews = self.rogerebert(ebert_links)
            lights_reviews = self.bright_lights(bright_lights_links)
            alibi_reviews = self.alibi(alibi_links)
            cinefile_reviews = self.cinefile(cinefile_links)
            brothers_reviews = self.brothers(brothers_links)
            reel_reviews = self.reel_views(reel_links)
            times_reviews = self.ny_times(times_links)
            metro_reviews = self.moviemet(metro_links)
            
        info = {'title':title, 'ebert':ebert_reviews, '3brothers':brothers_reviews, 'reel':reel_reviews, 'nytimes':times_reviews, 'bright_lights':lights_reviews, 'alibi':alibi_reviews, 'cinefile':cinefile_reviews, 'metropolitan':metro_reviews}
        return info


    def rogerebert(self, links):
        reviews = []
        for i in range(len(links)):
            window = BeautifulSoup(get(links[i]).text, "html.parser")
            if window.find('h2', 'page-head') is not None: #this is a structure of a regular review on rogerebert.com
                raw_review = window.find('div', {'itemprop':'reviewBody'}).findAll('p')
                #uncoding the special characters
                clean_review1 = [unicodedata.normalize("NFKD", i.text.strip()) for i in raw_review] 
                #removing whitespace manually since .strip() only works for when those characters ar in the beggining or end of a string
                clean_review2 = [i.replace('\r\n', '').replace('\r', '').replace('\n','').replace('\\', '').replace('\t','') for i in clean_review1]
                clean_review3 = [re.sub(r'\'', r'', i) for i in clean_review2]
                clean_review = [i for i in clean_review3 if not i in 'Advertisement']
                reviews.append(' '.join(clean_review))
        return reviews

    def bright_lights(self, links):
        reviews = []
        for i in range(len(links)):
            window = BeautifulSoup(get(links[i]).text, "html.parser")
            raw_review = window.find('div', 'text').text.strip()
            clean_review1 = raw_review.replace('\r\n', '').replace('\n', '').replace('\r', '').replace('\\', '').replace('\t','')
            clean_review2 = re.sub(r'[^\x00-\x7f]',r'', clean_review1)
            clean_review = re.sub(r'\'', r'', clean_review2) 
            reviews.append(clean_review)
        return reviews

    
    def alibi(self, links):
        reviews = []
        for i in range(len(links)):
            window = BeautifulSoup(get(links[i]).text, "html.parser")
            if window.find('div', {'itemprop':'reviewBody'}) is not None:
                raw_review = window.find('div', {'itemprop':'reviewBody'}).text.strip()
                clean_review1 = raw_review.replace('\r\n', '').replace('\n', '').replace('\r', '').replace('\\', '').replace('\t','')
                clean_review2 = re.sub(r'[^\x00-\x7f]',r' ', clean_review1)
                clean_review = re.sub(r'\'', r'', clean_review2)
                reviews.append(clean_review)
        return reviews     

    
    def cinefile(self, links):
        reviews = []
        for i in range(len(links)):
            window = BeautifulSoup(get(links[i]).text, "html.parser")
            if window.find('font', {'class':'articleBody'}) is not None:
                b_tags = window.find('font', {'class':'articleBody'}).find_all('b') #eliminate b tags
                for i in b_tags:
                    i.decompose()
                raw_review = window.find('font', {'class':'articleBody'}).text.strip()
                clean_review1 = raw_review.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ').replace('\\', ' ').replace('\t',' ')
                clean_review2 = re.sub(r'[^\x00-\x7f]',r'', clean_review1)
                clean_review = re.sub(r'\'', r'', clean_review2)
                reviews.append(clean_review)
        return reviews

    
    def brothers(self, links):
        reviews = []
        for i in range(len(links)):
            window = BeautifulSoup(get(links[i]).text, "html.parser")
            raw_review = window.findAll('p')
            clean_review1 = [re.sub('<[^<]+?>', '', i.text) for i in raw_review[:-4]]
            clean_review2 = [i.replace('\r\n', '').replace('\r', '').replace('\n','').replace('\\', '').replace('\t','') for i in clean_review1]
            clean_review = [re.sub(r'\'', r'', i) for i in clean_review2]
            reviews.append(' '.join(clean_review))
        return reviews
    
    
    def reel_views(self, links):
        reviews = []
        for i in range(len(links)):
            window = BeautifulSoup(get(links[i]).text, "html.parser")
            if window.find('div', {'id':'reelContent'}) is not None:
                raw_review = window.find('div', {'id':'reelContent'}).find_all('p')
                clean_review1 = [unicodedata.normalize("NFKD", re.sub('<[^<]+?>', '', i.text.strip())) for i in raw_review] 
                clean_review2 = [re.sub(r'[^\x00-\x7f]',r'', i) for i in clean_review1]
                clean_review3 = [i.replace('\r\n', '').replace('\r', '').replace('\n','').replace('\\', '').replace('\t','') for i in clean_review2]
                clean_review = [re.sub(r'\'', r'', i) for i in clean_review3]
                reviews.append(' '.join(clean_review))
        return reviews

    
    def moviemet(self, links):
        reviews = []
        for i in range(len(links)):
            window = BeautifulSoup(get(links[i]).text, "html.parser")
            if window.find('div', "clear") is not None:
                raw_review = window.find('div', "clear").findAll('p')
                clean_review1 = [re.sub('<[^<]+?>', '', i.text) for i in raw_review]
                clean_review2 = [i.replace('\r\n', '').replace('\r', '').replace('\n','').replace('\\', '').replace('\t','') for i in clean_review1]
                clean_review3 = [re.sub(r'\'', r'', i) for i in clean_review2]
                clean_review = [re.sub(r'[^\x00-\x7f]',r'', i) for i in clean_review3]
                reviews.append(' '.join(clean_review))
        return reviews  
    
    
    def ny_times(self, links):
        reviews = []
        def review_cleaning(raw_rev):
            clean_review1 = [re.sub(r'[^\x00-\x7f]',r'', i) for i in raw_rev]
            clean_review2 = [i.replace('\r\n', '').replace('\r', '').replace('\n','').replace('\\', '').replace('\t','') for i in clean_review1]
            clean_rev = [re.sub(r'\'', r'', i) for i in clean_review2]
            return clean_rev
        
        for i in range(len(links)):
            window = BeautifulSoup(get(links[i]).text, "html.parser")
            archive = window.find('span', 'kicker-label')
            archive2 = window.find('blockquote')
            regular_rev1 = window.find('p', 'css-1pdd3ka etcg8100')
            regular_rev2 = None
            if window.find('h3') is not None:
                regular_rev2 = window.find('h3').find('span', 'article-kicker')
            regular_rev3 = window.find('span', "css-17xtcya")#.find('a')
            if (archive is not None) and ((archive.text == 'Archives') or (archive.text == 'Movies')):
                raw_review = window.find_all('p', 'story-body-text story-content')
                raw_review2 = [i.text.strip() for i in raw_review[:-3]]
                clean_review = review_cleaning(raw_review2)
                reviews.append(' '.join(clean_review))
            elif archive2 is not None:
                raw_review = window.find('blockquote').select('blockquote > p')
                raw_review2 = [i.text.strip() for i in raw_review]
                raw_review3 = [i.replace('&aposs', '').replace('&apos', '') for i in raw_review2]
                clean_review = review_cleaning(raw_review3)
                reviews.append(' '.join(clean_review))
            elif (regular_rev1 is not None) and (regular_rev1.text == 'Movie Review'):
                raw_review = window.find('section', {'name':'articleBody'}).find_all('p', 'css-1ygdjhk e2kc3sl0')
                raw_review2 = [i.text.strip() for i in raw_review[:-3]]
                clean_review = review_cleaning(raw_review2)
                reviews.append(' '.join(clean_review)) 
            elif (regular_rev2 is not None) and (regular_rev2.text == 'Movie Review'):
                raw_review = window.find_all('p', 'story-body-text story-content')
                raw_review2 = [i.text.strip() for i in raw_review[:-3]]
                clean_review = review_cleaning(raw_review2)
                reviews.append(' '.join(clean_review))
            elif (regular_rev3 is not None) and (regular_rev3.text == 'Movies'):
                raw_review = window.find('section', {'name':'articleBody'}).find_all('p', 'css-1ygdjhk evys1bk0')
                raw_review2 = [i.text.strip() for i in raw_review]
                clean_review = review_cleaning(raw_review2[:-1])
                reviews.append(' '.join(clean_review)) 
        return reviews

scrape = ReviewScraping()

### Filling an empty list with movie title and all availabe reviews for that particular movie:

In [None]:
review_dictionary=[]
for i in movie_links:
    review_dictionary.append(scrape.the_reviews(i))


In [None]:
df = pd.DataFrame.from_dict(review_dictionary)

#unpivoting the dataframe:
df2 = pd.melt(df, id_vars='title', value_vars=['3brothers', 'alibi', 'bright_lights', 'cinefile', 'ebert', 'metropolitan', 'nytimes', 'reel'])

#removing empty rows:
df3 = df2[df2['value'].map(lambda x: (x!='[]') and (x!="['']") and (len(x)> 0))].reset_index(drop=True)

## Based on scraping, some samples have several reviews from the same source. Those need to be separated:

In [None]:
#exctracting datafreames with double reviews:
double_reviews1 = df3.loc[df3['value'].str.contains("', '")]
double_reviews2 = df3.loc[df3['value'].str.contains("', '")]
#separating the reviews:
doubles1 = double_reviews1['value'].apply(lambda x: x.split("', '")[0])
doubles2 = double_reviews2['value'].apply(lambda x: x.split("', '")[1])
#update the review column:
double_reviews1['review']=doubles1
double_reviews2['review']=doubles2

#drop the rows from the initial dataframe that have double reviews:
df3_upd = df3.drop(df3.loc[df3['value'].str.contains("', '")].index)
#concatinate the dataframes
DF = df3_upd.append(double_reviews1, ignore_index=True).append(double_reviews2, ignore_index=True)

## Sample now is expanded to a size of 2771.
### Below is a counter of all critic's reviews

In [35]:
print('Total sample size:', len(DF),'\n\n', DF['variable'].value_counts())

Total sample size: 2771 

 ebert            805
nytimes          631
cinefile         593
reel             445
alibi            166
metropolitan      62
bright_lights     47
3brothers         22
Name: source, dtype: int64


### Adding sentiment column to the movie:

In [13]:
DF['outcome'] = 0
DF['outcome'] = DF['title'].apply(lambda x: x.split('(')[0][:-1]).isin(true_outcome).astype(int)

#rename the columns appropriately:
DF.columns = ['title', 'source', 'review', 'outcome']

In [15]:
DF.to_csv('title_review_outcome.csv', index=False)