In [None]:
from bs4 import BeautifulSoup as bs
import requests
from requests.exceptions import MissingSchema
import re
import csv
import pandas as pd
import time
import numpy as np

In [None]:
# Build the table of yearly listing pages to scrape: one row per year from 2019 down to 2010.
# The `ref_` query suffix in each link counts up from 2 to 11 as the year counts down.
yearly_pages = [
    ['https://www.boxofficemojo.com/year/{}/?ref_=bo_yl_table_{}'.format(year, position), str(year)]
    for position, year in enumerate(range(2019, 2009, -1), start=2)
]
list_df = pd.DataFrame(yearly_pages, columns=['link', 'year'])

# Collect, for every movie on each yearly listing page, its Box Office Mojo release URL,
# the crew URL found on that release page, the derived credits ("detail") URL and the
# release date, then save the result to movie_urls.csv.
header = ['Movie URL', 'IMDB URL', 'Detail URL', 'Release']
movie_link_data = []

# Captures the path segment that follows "/title/" in the crew link (used as the title id).
# This replaces slicing the URL at fixed offsets, which silently produced a wrong id
# whenever the prefix or the query-string suffix had a different length.
title_id_pattern = re.compile(r'/title/([^/?#]+)')
request_timeout = 30  # seconds; keeps a stalled connection from hanging the whole crawl

for index, row in list_df.iterrows():
    url = row['link']
    try:
        response = requests.get(url, timeout=request_timeout)
    except requests.RequestException as err:
        print('Error: {} --- URL : {}'.format(err, url))
        continue
    if response.status_code != 200:
        print('{}: Received status code {}.'.format(url, response.status_code))
        continue
    soup = bs(response.text, 'lxml')

    for table_items in soup.find_all('tr'):
        row_dict = {}
        for movie in table_items.find_all('td', class_='a-text-left mojo-field-type-release mojo-cell-wide'):
            anchor = movie.find('a', href=True)
            if anchor is None:
                continue
            movie_url = 'https://www.boxofficemojo.com{}'.format(anchor['href'])
            try:
                movie_response = requests.get(movie_url, timeout=request_timeout)
            except requests.RequestException as err:
                print('Error: {} --- URL : {}'.format(err, movie_url))
                continue
            movie_soup = bs(movie_response.text, 'lxml')

            # The release page may lack the crew link (or it may not contain a title id);
            # skip such movies instead of crashing on a None lookup.
            crew_link = movie_soup.find('a', href=True, text='Crew information')
            id_match = title_id_pattern.search(crew_link['href']) if crew_link is not None else None
            if id_match is None:
                print('No crew link with a title id found --- URL : {}'.format(movie_url))
                continue

            imdb_url = crew_link['href']
            mv_id = id_match.group(1)
            detail_url = 'https://www.boxofficemojo.com/title/{}/credits/?ref_=bo_tt_tab#tabs'.format(mv_id)
            row_dict['movie'] = movie_url
            row_dict['imdb'] = imdb_url
            row_dict['detail'] = detail_url

        for release in table_items.find_all('td', class_='a-text-left mojo-field-type-date a-nowrap'):
            row_dict['Release'] = release.text + ' ' + row['year']

        # Only keep rows that actually describe a movie; table rows without a movie cell
        # would otherwise become empty CSV rows with no detail URL to follow later.
        if 'detail' in row_dict:
            movie_link_data.append(row_dict)

# Explicit columns keep the CSV schema stable even when nothing was collected.
movie_pages_df = pd.DataFrame(movie_link_data, columns=['movie', 'imdb', 'detail', 'Release'])
movie_pages_df.to_csv('movie_urls.csv', index=False)
print('{} movies are collected !'.format(len(movie_pages_df.index)))

In [None]:
# Collect detail info for each movie from the credits URLs saved in movie_urls.csv and
# write one row per successfully fetched page to movie_details.csv.
movie_pages_df = pd.read_csv('movie_urls.csv')

header = ['Movie', 'Title', 'Distributor', 'Release', 'MPAA', 'Time', 'Genres', 'Domestic', 'International', 'Worldwide', 'Opening', 'Budget', 'Actor_1', 'Actor_2', 'Actor_3', 'Actor_4']
movie_detail_data = []

# Sent with every detail request; built once instead of on every iteration.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0'}
request_timeout = 30  # seconds; keeps a stalled connection from hanging the whole crawl
min_actor_columns = 4  # Actor_1..Actor_4 are always present, padded with 'N/A'


def _text_after_label(page, label):
    """Return the stripped text of the element parsed right after the `label` text node.

    Returns 'N/A' when the label is not on the page or the following element has no text.
    """
    label_node = page.find(text=label)
    if label_node is None:
        return 'N/A'
    try:
        return label_node.next_element.text.strip()
    except AttributeError:
        return 'N/A'


for _, row in movie_pages_df.iterrows():
    dt_url = row['detail']
    dt_release = row['Release']

    # A missing detail URL is read back from the CSV as NaN (a float), not a string.
    if not isinstance(dt_url, str):
        print('Error: {} --- URL : {}'.format('missing detail URL', dt_url))
        continue

    try:
        dt_response = requests.get(dt_url, headers=headers, timeout=request_timeout)
    except requests.RequestException as err:  # includes MissingSchema
        print('Error: {} --- URL : {}'.format(err, dt_url))
        continue  # nothing was fetched, so there is nothing to parse for this movie

    # Check the response of THIS request (not a leftover variable from another cell).
    if dt_response.status_code != 200:
        print('{}: Received status code {}.'.format(dt_url, dt_response.status_code))
        continue
    time.sleep(2)  # pause between successful requests

    body = bs(dt_response.text, 'lxml')

    row_dict = {}
    row_dict['movie'] = dt_url

    title = body.find('h1')
    # Drops the last six characters of the heading -- presumably a "(YYYY)" year suffix;
    # TODO confirm against the live page.
    row_dict['Title'] = title.text[:-6] if title is not None else 'N/A'

    row_dict['Distributor'] = _text_after_label(body, 'Domestic Distributor')
    row_dict['Release'] = dt_release
    row_dict['MPAA'] = _text_after_label(body, 'MPAA')
    row_dict['time'] = _text_after_label(body, 'Running Time')
    row_dict['Genres'] = _text_after_label(body, 'Genres')

    # The gross figures are read by position from the <span> elements of the summary table.
    # A missing table or too few spans yields 'N/A' instead of a crash or an absent key.
    summary = body.find(attrs={'class': 'a-section a-spacing-none mojo-performance-summary-table'})
    m_rows = summary.find_all('span') if summary is not None else []
    for column, position in (('Domestic', 2), ('International', 6), ('Worldwide', 9)):
        if len(m_rows) > position:
            row_dict[column] = m_rows[position].text.strip()
        else:
            row_dict[column] = 'N/A'

    row_dict['Opening'] = _text_after_label(body, 'Domestic Opening')
    row_dict['Budget'] = _text_after_label(body, 'Budget')

    # One actor per cast-table row that contains a link; rows without a link are skipped.
    actors = body.find(id='principalCast')
    actor_list = []
    if actors is not None:
        for a_tr in actors.find_all('tr'):
            a_cols = a_tr.find('a')
            if a_cols is not None:
                actor_list.append(a_cols.text.strip())

    # Pad to at least four actor columns so every row has Actor_1..Actor_4, also when the
    # cast table exists but is short; any additional actors keep their own columns.
    actor_list.extend(['N/A'] * (min_actor_columns - len(actor_list)))
    for position, actor_name in enumerate(actor_list, start=1):
        row_dict['Actor_' + str(position)] = actor_name

    movie_detail_data.append(row_dict)

movie_df = pd.DataFrame(movie_detail_data)
movie_df.to_csv('movie_details.csv', index=False)

print('Collected {} info for all movies!'.format(len(movie_df.index)))
