In [4]:
!pip install requests



In [30]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import pickle
from datetime import datetime

In [97]:
## Save list_movie_data in with pickle
def save_data(filename, data):
    with open(filename, 'wb') as f:
        pickle.dump(data, f)

## Load pickle file data 
def load_data(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)
    
#function for scraping infobox
def scrape(url):
    page = requests.get(url)
    toScrape = bs(page.content, 'html.parser')
    movie_details = toScrape.find(class_='infobox vevent')
    rows = movie_details.find_all('tr') 
    movie_data = {}

    movie_data['Title'] = rows[0].find('th').get_text()
    clean_tags(movie_details)
    for i, row in enumerate(rows):
        try:
            if i <= 1:
                continue
            elif row.find('th').get_text() == 'Based on':
                continue
            elif row.find('th').get_text() == 'Running time':
                movie_data['Running time (min)'] = clean(row)
            elif row.find('th').get_text() == 'Release date':
                date = clean(row)
                movie_data['Release date'] = date
                movie_data['Release date (dt)'] = dt_conversion(date)
            else:
                column = row.find('th').get_text(' ', strip=True)
                data = clean(row)
                movie_data[column] = data
        except:
            pass
        
    return movie_data

#Convert date str to datetime object
def dt_conversion(date):
    patterns = ['%B %d, %Y', '%d %B %Y']
    for pat in patterns:
        try:
            return datetime.strptime(date, pat)
        except:
            pass
    return None

#remove troublesome tags
def clean_tags(content):
    t = ['sup', 'span']
    tags = content.find_all(t)
    for tag in tags:
        tag.decompose()

#function to clean data scraped from wikipedia infobox
def clean(row):
    if row.find('th').get_text() == 'Release date':
        return row.find('td').get_text().split('(')[0].replace('\xa0', ' ').strip('\n').strip(' ')                                                                                         
    elif row.find('th').get_text() == 'Running time':
        return int(row.find('td').get_text().split(' ')[0])
    elif row.find('br'):
        return [text for text in row.find('td').stripped_strings]
    elif row.find('li'):
        return [li.get_text(' ', strip=True).replace('\xa0', ' ') for li in row.find_all('li')]
    else:
        if '$' in row.find('td').get_text():
            if 'million' in row.find('td').get_text():
                if '-' in row.find('td').get_text():
                    number = float(row.find('td').get_text().split('-')[0].replace('$',''))
                else:
                    number = float(row.find('td').get_text().split(' ')[0].replace('$',''))
                money = number * (10**6)
                return money
            elif 'billion' in row.find('td').get_text():
                if '-' in row.find('td').get_text():
                    number = float(row.find('td').get_text().split('-')[0].replace('$',''))
                else:
                    number = float(row.find('td').get_text().split(' ')[0].replace('$',''))
                money = number * (10**9)
                return money
    return row.find('td').get_text()

# load in table of WBMovies 
# https://en.wikipedia.org/wiki/List_of_Warner_Bros._films_(2000%E2%80%932009)
# https://en.wikipedia.org/wiki/List_of_Warner_Bros._films_(2010%E2%80%932019)
links = ['https://en.wikipedia.org/wiki/List_of_Warner_Bros._films_(2000%E2%80%932009)', 
         'https://en.wikipedia.org/wiki/List_of_Warner_Bros._films_(2010%E2%80%932019)']
list_movie_data = []
for link in links:
    page = requests.get(link)
    content = bs(page.content, 'html.parser')
    table_rows = content.select('.wikitable.sortable i')

    #loop through table rows and scrape entry given the url
    #Append movie info each iteration
    for i,row in enumerate(table_rows):
        if i % 10 == 0:
            print(i);
        try:
            path = row.find('a')['href']
            movie_url = 'https://en.wikipedia.org' + path
            list_movie_data.append(scrape(movie_url))
        except Exception as e:
            print(movie_url)
            print(e)
    
save_data('WB_movie_data.pickle', list_movie_data)

0
10
20
30
40
https://en.wikipedia.org/wiki/Harry_Potter_and_the_Philosopher%27s_Stone_(film)
'NoneType' object is not subscriptable
50
60
70
80
90
100
110
120
130
140
https://en.wikipedia.org/wiki/The_Dukes_of_Hazzard_(film)
'NoneType' object is not subscriptable
150
160
170
180
190
200
210
220
https://en.wikipedia.orghttps://nl.wikipedia.org/wiki/Morrison_krijgt_een_zusje
HTTPSConnectionPool(host='en.wikipedia.orghttps', port=443): Max retries exceeded with url: //nl.wikipedia.org/wiki/Morrison_krijgt_een_zusje (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000023FC3150F70>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
https://en.wikipedia.orghttps://nl.wikipedia.org/wiki/Hoe_overleef_ik_mezelf%3F_(film)
HTTPSConnectionPool(host='en.wikipedia.orghttps', port=443): Max retries exceeded with url: //nl.wikipedia.org/wiki/Hoe_overleef_ik_mezelf%3F_(film) (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object

270
280
290
300
https://en.wikipedia.org/w/index.php?title=Kabir_Azab%C4%B1&action=edit&redlink=1
'NoneType' object has no attribute 'find_all'
310
320
330
https://en.wikipedia.org/wiki/Western_Stars#Film
'NoneType' object has no attribute 'find_all'
340


In [120]:
#load movie data 
movie_data = load_data('WB_movie_data.pickle')
#create pandas dataframe
df = pd.DataFrame(movie_data)

##dropping irrelevant columns
for x in range(28):
    df.drop(df.columns[26], axis=1, inplace = True)
df.drop(df.columns[[5,6,7,18,20,21,22,24,25]], axis=1, inplace = True)
df.to_csv('WB_movie_data_cleaned.csv')