In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
import re 
import urllib
import time

In [2]:
# Create the Chrome WebDriver object shared by the scraping functions below,
# configured for headless browsing.
options = Options()
options.headless = True
# "./chromedriver.exe" is resolved relative to the notebook's working directory.
# NOTE(review): the `options.headless` setter and the positional driver-path
# argument are reportedly deprecated/removed in newer Selenium 4 releases —
# confirm against the Selenium version this notebook is pinned to.
driver = webdriver.Chrome("./chromedriver.exe",options=options)

In [3]:
def get_js_soup(url,driver):
    """Open `url` in the browser and parse the rendered page body.

    The HTML is read back through JavaScript after navigation, so the result
    reflects the DOM as it exists at that moment rather than the raw response.

    Args:
        url: address of the page to open.
        driver: WebDriver object used to load the page and run the script.

    Returns:
        A BeautifulSoup tree (built with 'html.parser') of
        ``document.body.innerHTML``.
    """
    driver.get(url)
    rendered_body = driver.execute_script('return document.body.innerHTML')
    return BeautifulSoup(rendered_body, 'html.parser')

def process_content(content):
    """Tidy extracted text so it fits on a single output line.

    Every non-ASCII character is dropped and each run of whitespace (spaces,
    tabs, newlines, ...) is collapsed into one space. Leading and trailing
    whitespace is collapsed to a single space, not stripped.

    Args:
        content (str): raw text.

    Returns:
        str: ASCII-only text containing no newlines or repeated whitespace.
    """
    # Encoding with errors='ignore' silently discards anything outside ASCII.
    ascii_only = content.encode('ascii', errors='ignore').decode('ascii')
    # Raw string: '\s' inside a plain literal is an invalid escape sequence,
    # which newer Python versions warn about.
    return re.sub(r'\s+', ' ', ascii_only)

def remove_script(soup):
    """Strip <script> and <style> elements from a parsed page.

    Text extracted from an HTML page may otherwise contain JavaScript code and
    style rules; removing these elements keeps them out of the extracted text.

    Args:
        soup: parsed page; calling it with a list of tag names must yield the
            matching elements.

    Returns:
        The same `soup` object, after `decompose()` has been called on every
        matching element.
    """
    unwanted_tags = soup(["script", "style"])
    for unwanted_tag in unwanted_tags:
        unwanted_tag.decompose()
    return soup

In [4]:
def scrape_title_page(url):
    """Collect the plot text shown on a movie's title page.

    Two pieces of text are looked up: the short plot (a <span> whose class
    starts with 'GenresAndPlot__TextContainerBreakpoint') and the longer
    storyline (inside the element with data-testid 'storyline-plot-summary').
    The page is loaded with the module-level `driver`.

    Args:
        url: address of the movie's title page.

    Returns:
        str: the short plot and the storyline joined by a single space;
        whichever is missing or empty is left out, so the result may be "".
    """
    page = get_js_soup(url,driver)

    # Short plot.
    short_plot = None
    plot_span = page.find('span', attrs={'class': re.compile('^GenresAndPlot__TextContainerBreakpoint.*')})
    if plot_span:
        short_plot = plot_span.get_text()

    # Long storyline, nested inside the plot-summary container.
    long_storyline = None
    summary_div = page.find('div', attrs={'data-testid': 'storyline-plot-summary'})
    if summary_div:
        content_div = summary_div.find('div', attrs={'class': 'ipc-html-content ipc-html-content--base'})
        if content_div:
            long_storyline = content_div.get_text()

    return " ".join(text for text in (short_plot, long_storyline) if text)
    
def _linked_names(table, cell_class):
    """Return the unique link texts found in `<td class=cell_class>` cells of `table`, in page order."""
    if table is None:
        return []
    names = {}  # dict keys de-duplicate while keeping first-seen order
    for cell in table.find_all('td', class_=cell_class):
        link = cell.find('a')
        if link is not None:
            names[link.get_text().strip()] = None
    return list(names)


def scrape_full_credit_page(url):
    """Collect writer and character names from a movie's full-credits page.

    The credits address is derived from `url` by replacing its last
    '/'-separated segment with 'fullcredits'. The page is loaded with the
    module-level `driver`.

    Args:
        url: address of the movie's title page.

    Returns:
        tuple[str, str]: ``(writers, characters)``, each a ', '-joined string
        of unique names in the order they appear on the page. Either string is
        empty when the corresponding section is missing; both are empty when
        the page has no 'fullcredits_content' container.
    """
    fc_url = "/".join(url.split("/")[:-1]) + "/fullcredits"
    soup = get_js_soup(fc_url,driver)

    credits = soup.find('div', attrs={'id':"fullcredits_content"})
    if credits is None:
        # Without the container there is nothing to look up; previously the
        # cast lookup raised AttributeError on None here.
        return "", ""

    # Writers: the table that follows the <h4 id="writer"> heading.
    writer_table = None
    writer_heading = credits.find('h4', attrs={'id':"writer"})
    if writer_heading is not None:
        sibling_table = writer_heading.find_next_sibling("table")
        if sibling_table is not None:
            writer_table = sibling_table.find('tbody')
    writers = ", ".join(_linked_names(writer_table, 'name'))

    # Characters: linked names in the cast list table.
    cast_table = credits.find('table', attrs={'class':"cast_list"})
    characters = ", ".join(_linked_names(cast_table, 'character'))
    return writers, characters

def scrape_dir_page(dir_url,driver, i):
    """Scrape every movie listed on one directory (search results) page.

    For each `<h3 class="lister-item-header">` entry the movie link and title
    are read, then the movie's title page and full-credits page are scraped.

    Args:
        dir_url: address of the directory page.
        driver: WebDriver used to load the directory page. The per-movie
            helpers load their pages with the module-level `driver`.
        i: page number; used only in the progress message.

    Returns:
        dict: keys 'url', 'title', 'story', 'writers', 'characters', each a
        list with one entry per scraped movie (the lists are index-aligned).
        Entries without a usable link are skipped.
    """
    from urllib.parse import urljoin

    base = 'https://www.imdb.com/'
    soup = get_js_soup(dir_url,driver)
    batch = {'url':[], 'title':[], 'story':[], 'writers': [], 'characters':[]}
    for link_holder in soup.find_all('h3',class_='lister-item-header'):
        doc = link_holder.find('a')
        link = doc.get('href') if doc is not None else None
        if not link:
            # No anchor or no href: there is no movie page to visit.
            continue
        # urljoin avoids the doubled slash that plain concatenation produces
        # when the href already starts with '/'.
        new_url = urljoin(base, link)
        new_title = doc.get_text() or ""

        new_story = scrape_title_page(new_url)
        new_writers, new_characters = scrape_full_credit_page(new_url)

        batch['url'].append(new_url)
        batch['title'].append(new_title)
        batch['story'].append(new_story)
        batch['writers'].append(new_writers)
        batch['characters'].append(new_characters)
    # NOTE(review): the count assumes 50 titles per page; if page numbers are
    # 1-based, 50 * (i+1) over-reports by one page — confirm before changing.
    print ('-'*20,'Finish scraping {} movies'.format(str(50 * (i+1))),'-'*20)
    return batch

    

In [5]:
# Column-wise accumulators for the scraped data, filled page by page by the
# scraping loop. Each name is bound to its own, initially empty, list.
movie_urls, movie_titles, movie_stories, movie_writers, movie_characters = (
    [] for _ in range(5)
)

In [28]:
# Walk the keyword-search listing, sorted by popularity (moviemeter), one
# directory page at a time and append every page's results to the accumulators.
print ('-'*20,'Scraping directory page','-'*20)
# Page numbers start at 1. `last_i` is the next page to fetch: set it to the
# value left by an interrupted run to resume from there.
last_i = 142
for i in range(last_i, 1000):
    dir_url = 'https://www.imdb.com/search/keyword/?mode=detail&page=' + str(i) + '&title_type=movie&ref_=kw_ref_key&sort=moviemeter,asc'
    # A page gets one retry after a short pause; a second failure propagates
    # and stops the run with `last_i` still pointing at the failing page.
    for attempt in (1, 2):
        try:
            batch = scrape_dir_page(dir_url,driver, i)
            break
        except Exception:
            # `Exception` rather than a bare except, so that interrupting the
            # kernel (KeyboardInterrupt) is not swallowed and retried.
            if attempt == 2:
                raise
            time.sleep(1)
    movie_urls.extend(batch['url'])
    movie_titles.extend(batch['title'])
    movie_stories.extend(batch['story'])
    movie_writers.extend(batch['writers'])
    movie_characters.extend(batch['characters'])
    last_i = i + 1
        

-------------------- Scraping directory page --------------------
-------------------- Finish scraping 7150 movies --------------------
-------------------- Finish scraping 7200 movies --------------------
-------------------- Finish scraping 7250 movies --------------------
-------------------- Finish scraping 7300 movies --------------------
-------------------- Finish scraping 7350 movies --------------------
-------------------- Finish scraping 7400 movies --------------------
-------------------- Finish scraping 7450 movies --------------------
-------------------- Finish scraping 7500 movies --------------------
-------------------- Finish scraping 7550 movies --------------------
-------------------- Finish scraping 7600 movies --------------------
-------------------- Finish scraping 7650 movies --------------------
-------------------- Finish scraping 7700 movies --------------------
-------------------- Finish scraping 7750 movies --------------------
-------------------- Fin

-------------------- Finish scraping 12950 movies --------------------
-------------------- Finish scraping 13000 movies --------------------
-------------------- Finish scraping 13050 movies --------------------
-------------------- Finish scraping 13100 movies --------------------
-------------------- Finish scraping 13150 movies --------------------
-------------------- Finish scraping 13200 movies --------------------
-------------------- Finish scraping 13250 movies --------------------
-------------------- Finish scraping 13300 movies --------------------
-------------------- Finish scraping 13350 movies --------------------
-------------------- Finish scraping 13400 movies --------------------
-------------------- Finish scraping 13450 movies --------------------
-------------------- Finish scraping 13500 movies --------------------
-------------------- Finish scraping 13550 movies --------------------
-------------------- Finish scraping 13600 movies --------------------
------

-------------------- Finish scraping 18800 movies --------------------
-------------------- Finish scraping 18850 movies --------------------
-------------------- Finish scraping 18900 movies --------------------
-------------------- Finish scraping 18950 movies --------------------
-------------------- Finish scraping 19000 movies --------------------
-------------------- Finish scraping 19050 movies --------------------
-------------------- Finish scraping 19100 movies --------------------
-------------------- Finish scraping 19150 movies --------------------
-------------------- Finish scraping 19200 movies --------------------
-------------------- Finish scraping 19250 movies --------------------
-------------------- Finish scraping 19300 movies --------------------
-------------------- Finish scraping 19350 movies --------------------
-------------------- Finish scraping 19400 movies --------------------
-------------------- Finish scraping 19450 movies --------------------
------

-------------------- Finish scraping 24600 movies --------------------
-------------------- Finish scraping 24650 movies --------------------
-------------------- Finish scraping 24700 movies --------------------
-------------------- Finish scraping 24750 movies --------------------
-------------------- Finish scraping 24800 movies --------------------
-------------------- Finish scraping 24850 movies --------------------
-------------------- Finish scraping 24900 movies --------------------
-------------------- Finish scraping 24950 movies --------------------
-------------------- Finish scraping 25000 movies --------------------
-------------------- Finish scraping 25050 movies --------------------
-------------------- Finish scraping 25100 movies --------------------
-------------------- Finish scraping 25150 movies --------------------
-------------------- Finish scraping 25200 movies --------------------
-------------------- Finish scraping 25250 movies --------------------
------

-------------------- Finish scraping 30400 movies --------------------
-------------------- Finish scraping 30450 movies --------------------
-------------------- Finish scraping 30500 movies --------------------
-------------------- Finish scraping 30550 movies --------------------
-------------------- Finish scraping 30600 movies --------------------
-------------------- Finish scraping 30650 movies --------------------
-------------------- Finish scraping 30700 movies --------------------
-------------------- Finish scraping 30750 movies --------------------
-------------------- Finish scraping 30800 movies --------------------
-------------------- Finish scraping 30850 movies --------------------
-------------------- Finish scraping 30900 movies --------------------
-------------------- Finish scraping 30950 movies --------------------
-------------------- Finish scraping 31000 movies --------------------
-------------------- Finish scraping 31050 movies --------------------
------

-------------------- Finish scraping 36250 movies --------------------
-------------------- Finish scraping 36300 movies --------------------
-------------------- Finish scraping 36350 movies --------------------
-------------------- Finish scraping 36400 movies --------------------
-------------------- Finish scraping 36450 movies --------------------
-------------------- Finish scraping 36500 movies --------------------
-------------------- Finish scraping 36550 movies --------------------
-------------------- Finish scraping 36600 movies --------------------
-------------------- Finish scraping 36650 movies --------------------
-------------------- Finish scraping 36700 movies --------------------
-------------------- Finish scraping 36750 movies --------------------
-------------------- Finish scraping 36800 movies --------------------
-------------------- Finish scraping 36850 movies --------------------
-------------------- Finish scraping 36900 movies --------------------
------

-------------------- Finish scraping 42050 movies --------------------
-------------------- Finish scraping 42100 movies --------------------
-------------------- Finish scraping 42150 movies --------------------
-------------------- Finish scraping 42200 movies --------------------
-------------------- Finish scraping 42250 movies --------------------
-------------------- Finish scraping 42300 movies --------------------
-------------------- Finish scraping 42350 movies --------------------
-------------------- Finish scraping 42400 movies --------------------
-------------------- Finish scraping 42450 movies --------------------
-------------------- Finish scraping 42500 movies --------------------
-------------------- Finish scraping 42550 movies --------------------
-------------------- Finish scraping 42600 movies --------------------
-------------------- Finish scraping 42650 movies --------------------
-------------------- Finish scraping 42700 movies --------------------
------

-------------------- Finish scraping 47900 movies --------------------
-------------------- Finish scraping 47950 movies --------------------
-------------------- Finish scraping 48000 movies --------------------
-------------------- Finish scraping 48050 movies --------------------
-------------------- Finish scraping 48100 movies --------------------
-------------------- Finish scraping 48150 movies --------------------
-------------------- Finish scraping 48200 movies --------------------
-------------------- Finish scraping 48250 movies --------------------
-------------------- Finish scraping 48300 movies --------------------
-------------------- Finish scraping 48350 movies --------------------
-------------------- Finish scraping 48400 movies --------------------
-------------------- Finish scraping 48450 movies --------------------
-------------------- Finish scraping 48500 movies --------------------
-------------------- Finish scraping 48550 movies --------------------
------

In [48]:
# Sanity check: number of movie URLs collected so far.
print(len(movie_urls))

9999


In [40]:
# Sanity check: number of movie titles collected so far.
print(len(movie_titles))

9999


In [42]:
# Sanity check: number of writer entries collected so far.
print(len(movie_writers))

9999


In [44]:
# Sanity check: number of story entries collected so far.
print(len(movie_stories))

9999


In [46]:
# Sanity check: number of character entries collected so far.
print(len(movie_characters))

9999


In [49]:
# `last_i` is advanced after every successfully scraped directory page, so this
# shows the page number a resumed run would have to start from.
last_i

1000

In [50]:
def write_lst(lst,file_):
    """Write the entries of `lst` to a text file, one cleaned entry per line.

    Every entry is passed through `process_content` before it is written and
    is followed by a newline.

    Args:
        lst: iterable of strings to write.
        file_: path of the output file; it is opened in 'w' mode, so existing
            content is replaced.
    """
    with open(file_,'w') as out:
        out.writelines(process_content(entry) + '\n' for entry in lst)

In [51]:
# Output locations: one text file per scraped column.
movie_urls_file = '../IMDB_Goodreads_data/movie_urls.txt'
movie_writers_file = '../IMDB_Goodreads_data/movie_writers.txt'
movie_stories_file = '../IMDB_Goodreads_data/movie_stories.txt'
movie_titles_file = '../IMDB_Goodreads_data/movie_titles.txt'
movie_characters_file = '../IMDB_Goodreads_data/movie_characters.txt'

# Write every column to its file: urls, writers, stories, titles, characters.
for column, destination in (
    (movie_urls, movie_urls_file),
    (movie_writers, movie_writers_file),
    (movie_stories, movie_stories_file),
    (movie_titles, movie_titles_file),
    (movie_characters, movie_characters_file),
):
    write_lst(column, destination)
