In [11]:
from bs4 import BeautifulSoup as bs
import requests
import json

In [12]:
def get_content_value(row_data):
    """Extract the value of one infobox cell as a string or a list of strings.

    Parameters:
        row_data: the parsed ``<td>`` element of an infobox row.

    Returns:
        - a list with one entry per ``<li>`` when the cell contains list items;
        - a list with one entry per text fragment when the cell is split by ``<br>``;
        - otherwise the cell text as a single string.

    Non-breaking spaces are replaced by plain spaces in every branch, so
    downstream code can split values on ordinary whitespace.
    """
    if row_data.find('li'):
        return [li.get_text(' ', strip=True).replace('\xa0', ' ') for li in row_data.find_all('li')]
    elif row_data.find('br'):
        # stripped_strings only trims the ends of each fragment; normalise
        # non-breaking spaces inside it the same way the other branches do.
        return [text.replace('\xa0', ' ') for text in row_data.stripped_strings]
    else:
        return row_data.get_text(' ', strip=True).replace('\xa0', ' ')

def clean_tags(soup):
    """Remove every ``<sup>`` and ``<span>`` element from ``soup`` in place.

    Each matching element is destroyed with ``decompose()``; nothing is
    returned.
    """
    unwanted = soup.find_all(['sup', 'span'])
    for node in unwanted:
        node.decompose()

    
def get_info_box(url):
    """Download a Wikipedia film page and return its infobox as a dict.

    Parameters:
        url: full URL of the article to scrape.

    Returns:
        A dict whose ``'title'`` key holds the header text of the first
        infobox row, plus one key per remaining row that has both a ``<th>``
        header and a ``<td>`` value. Values come from ``get_content_value``
        (a string or a list of strings).

    Raises:
        requests.RequestException: the download failed, timed out, or the
            server answered with an error status.
        ValueError: the page has no element with class ``infobox vevent``.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    r = requests.get(url, timeout=30)
    # Fail on 4xx/5xx instead of parsing an error page as if it were the article.
    r.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = bs(r.content, 'html.parser')
    info_box = soup.find(class_='infobox vevent')
    if info_box is None:
        raise ValueError(f"no 'infobox vevent' table found at {url}")
    table_row = info_box.find_all('tr')

    # Rows are live views into the tree, so cleaning after find_all still
    # strips <sup>/<span> content from them.
    clean_tags(soup)

    movie_info = {}
    for index, row in enumerate(table_row):
        header = row.find('th')
        if header is None:
            # Rows without a header (images, spacers) carry no key.
            continue
        if index == 0:
            movie_info['title'] = header.get_text(' ', strip=True)
            continue
        cell = row.find('td')
        if cell is None:
            # Section-heading rows have a <th> but no value cell.
            continue
        content_key = header.get_text(' ', strip=True)
        movie_info[content_key] = get_content_value(cell)
    return movie_info

In [13]:
# Download the index page that the crawl starts from.
# The timeout stops a stalled connection from blocking the notebook forever.
r = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films', timeout=30)
# Stop here on an HTTP error rather than scraping an error page.
r.raise_for_status()
# Explicit parser: results no longer depend on which optional parsers are installed.
soup = bs(r.content, 'html.parser')
# Every <i> element inside an element carrying both the "wikitable" and
# "sortable" classes; the crawl reads the article link from each of them.
movies = soup.select('.wikitable.sortable i')

In [14]:
base_path = 'https://en.wikipedia.org/'
movie_info_list = []
# (title, exception) pairs for pages that could not be scraped, kept for inspection.
failed_movies = []
for movie in movies:
    link = movie.a
    # Entries without a linked article (no <a>, or a link lacking href/title) are skipped.
    if link is None or not link.get('href') or link.get('title') is None:
        continue
    path = link['href']
    title = link['title']
    # Join with exactly one slash so the URL does not become ".org//wiki/...".
    full_path = base_path.rstrip('/') + '/' + path.lstrip('/')
    try:
        movie_info_list.append(get_info_box(full_path))
    except Exception as exc:
        # Best effort: one broken page must not abort the crawl, but the
        # reason is recorded instead of being silently discarded.
        failed_movies.append((title, exc))
print(f"scraped {len(movie_info_list)} pages, skipped {len(failed_movies)} (see failed_movies)")

In [15]:
def save_data(title, data):
    """Serialise ``data`` as JSON into the file named ``title``.

    The file is written as UTF-8 with non-ASCII characters kept verbatim and
    a two-space indent. Any existing file of that name is overwritten.
    """
    json_options = {'ensure_ascii': False, 'indent': 2}
    with open(title, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, **json_options)
def load_data(title):
    """Read the UTF-8 JSON file named ``title`` and return the decoded object."""
    with open(title, encoding='utf-8') as in_file:
        parsed = json.load(in_file)
    return parsed

In [16]:
def minute_to_int(running_time):
    """Convert a scraped running time such as ``'83 minutes'`` to an int.

    Parameters:
        running_time: a string, a list of strings (only the first entry is
            used), or the placeholder ``'N/A'``.

    Returns:
        The leading number as an ``int``, or ``None`` when the value is
        ``'N/A'``, empty, or does not start with an integer.
    """
    if running_time == 'N/A':
        return None
    # Multi-valued infobox cells arrive as lists; the first entry is the one used.
    if isinstance(running_time, list):
        if not running_time:
            return None
        running_time = running_time[0]
    # split() without an argument also splits on non-breaking spaces and
    # ignores leading whitespace.
    tokens = running_time.split()
    if not tokens:
        return None
    try:
        return int(tokens[0])
    except ValueError:
        # Text that does not begin with a plain integer is treated as missing.
        return None

# Add an integer-minutes field next to the scraped "Running time" text.
for movie in movie_info_list:
    raw_running_time = movie.get('Running time', 'N/A')
    movie['Running time (int)'] = minute_to_int(raw_running_time)

In [17]:
import re

# Magnitude words that may follow a dollar figure (callers match them case-insensitively).
amounts = r"thousand|million|billion"
# A number: digits, optional ",ddd" groups, then any run of dots followed by optional digits.
number = r"\d+(,\d{3})*\.*\d*"

# "$<number> <word>", optionally written as a range ("-", " to " or an en dash
# followed by a second number). A "$" in front of the second number is not
# accepted, so for text like "$76.4–$83.3 million" the match starts at the
# second "$".
word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"
# A bare "$<number>" with no magnitude word.
value_re = rf"\${number}"

def word_to_value(word):
    """Return the multiplier for a lowercase magnitude word.

    Accepts ``"thousand"``, ``"million"`` and ``"billion"``; any other word
    raises ``KeyError``.
    """
    multipliers = dict(thousand=10**3, million=10**6, billion=10**9)
    return multipliers[word]

def parse_word_syntax(string):
    """Turn text such as ``"$12.5 million"`` into a float dollar amount.

    The first number found in ``string`` is multiplied by the value of the
    first magnitude word found (matched case-insensitively).
    """
    amount = float(re.search(number, string).group().replace(",", ""))
    magnitude = re.search(amounts, string, flags=re.I).group()
    return amount * word_to_value(magnitude.lower())

def parse_value_syntax(string):
    """Turn text such as ``"$960,000"`` into a float.

    The first number found in ``string`` is used, with its thousands commas
    removed.
    """
    matched = re.search(number, string)
    return float(matched.group().replace(",", ""))

def money_conversion(money):
    """Convert a scraped money field to a float number of dollars.

    Parameters:
        money: a string, a list of strings (only the first entry is used),
            or the placeholder ``"N/A"``.

    Returns:
        The parsed amount as a float, or ``None`` when the value is
        ``"N/A"``, empty, not text, or contains no ``$`` amount.
    """
    if money == "N/A":
        return None

    # Multi-valued infobox cells arrive as lists; the first entry is the one used.
    if isinstance(money, list):
        if not money:
            return None
        money = money[0]

    # Anything that is not text cannot be searched; treat it as missing.
    if not isinstance(money, str):
        return None

    # Prefer the "$X million" form; it also starts with "$<number>", so the
    # plain form is only tried when no magnitude word is present.
    word_syntax = re.search(word_re, money, flags=re.I)
    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    value_syntax = re.search(value_re, money)
    if value_syntax:
        return parse_value_syntax(value_syntax.group())

    return None
    
# Add a numeric "<field> (float)" companion for each scraped money field.
for movie in movie_info_list:
    for field in ('Budget', 'Box office'):
        movie[f'{field} (float)'] = money_conversion(movie.get(field, "N/A"))

In [18]:
# Spot check of the parser on one scraped budget. str() is applied first, so a
# list-valued field is searched through its printed form rather than its first entry.
# NOTE(review): 230 is a position in the scraped list and depends on crawl
# order — confirm it still points at the intended film.
money_conversion(str(movie_info_list[230]["Budget"]))

65000000.0

In [19]:
from datetime import datetime

# Raw "Release date" value of every movie ('N/A' where the key is absent).
# NOTE(review): not referenced again in the visible cells — presumably kept for manual inspection.
dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
    """Return the part of ``date`` before its first ``(``, without surrounding whitespace."""
    before_paren, _, _ = date.partition("(")
    return before_paren.strip()

def date_conversion(date):
    """Parse a scraped release date into a ``datetime``.

    Parameters:
        date: a string, a list of strings (only the first entry is used),
            or the placeholder ``"N/A"``.

    Returns:
        A ``datetime`` when the text before any parenthesis matches
        ``"Month D, YYYY"`` or ``"D Month YYYY"``; otherwise ``None``
        (including for ``"N/A"`` and empty lists).
    """
    # Multi-valued infobox cells arrive as lists; the first entry is the one used.
    if isinstance(date, list):
        if not date:
            return None
        date = date[0]

    if date == "N/A":
        return None

    date_str = clean_date(date)

    fmts = ["%B %d, %Y", "%d %B %Y"]
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            # strptime signals a format mismatch with ValueError; try the next
            # format. Other exceptions (e.g. KeyboardInterrupt) propagate.
            continue
    return None
# Add a parsed datetime next to the scraped "Release date" value.
for movie in movie_info_list:
    raw_release_date = movie.get('Release date', 'N/A')
    movie['Release date (datetime)'] = date_conversion(raw_release_date)

In [21]:
import pickle

def save_data_pickle(name, data):
    """Pickle ``data`` into the binary file ``name``, overwriting any existing file."""
    with open(name, mode='wb') as sink:
        pickle.dump(data, sink)

def load_data_pickle(name):
    """Return the object stored in the pickle file ``name``.

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(name, mode='rb') as source:
        restored = pickle.load(source)
    return restored

In [22]:
# Snapshot the cleaned records with pickle, which stores the datetime values as they are.
save_data_pickle("disney_movie_data_cleaned_more.pickle", movie_info_list)

In [24]:
# Shallow per-movie copies, so keys can be replaced without touching movie_info_list.
movie_info_copy = [dict(movie) for movie in movie_info_list]

In [25]:
# Replace each parsed release date with its "Month DD, YYYY" text (None stays None).
for movie in movie_info_copy:
    current_date = movie['Release date (datetime)']
    movie['Release date (datetime)'] = (
        current_date.strftime("%B %d, %Y") if current_date else None
    )

In [26]:
# Write the copy whose release dates were converted to strings out as JSON.
save_data("disney_data_final.json", movie_info_copy)

In [27]:
import pandas as pd

# One row per movie dict; a key that a movie lacks shows up as a missing value in its row.
df = pd.DataFrame(movie_info_list)
# Preview the first rows.
df.head()

Unnamed: 0,title,Production company,Release date,Running time,Country,Language,Box office,Running time (int),Budget (float),Box office (float),...,Production companies,Japanese,Hepburn,Adaptation by,Animation by,Traditional,Simplified,Created by,Original work,Owned by
0,Academy Award Review of,Walt Disney Productions,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,41.0,,45.472,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,"[December 21, 1937 ( Carthay Circle Theatre , ...",83 minutes,United States,English,$418 million,83.0,1490000.0,418000000.0,...,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,United States,English,$164 million,88.0,2600000.0,164000000.0,...,,,,,,,,,,
3,Fantasia,Walt Disney Productions,"[November 13, 1940]",126 minutes,United States,English,$76.4–$83.3 million,126.0,2280000.0,83300000.0,...,,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,"[June 20, 1941]",74 minutes,United States,English,"$960,000 (worldwide rentals)",74.0,600000.0,960000.0,...,,,,,,,,,,


In [28]:
# Export the table to CSV.
# NOTE(review): index=False is not passed, so the row index is written as an
# unnamed first column — confirm that is wanted.
df.to_csv("disney_movie_data_final.csv")