In [1]:
#### Disney Datset Creation #### - Python and BeautifulSoup
## Scrape & clean a list of disney wikipedia pages to create a dataset of ### 

In [3]:
from bs4 import BeautifulSoup as bs
import requests

In [4]:
def get_content_value(row_data):
    """ This function gets the content of the info box from the webpage to store as the value of the dictionary
    
        Parameters: 
            row_data: The data in the row of choice
    
    """
    # Return a list of the text inside the "li" tag where the info are stored - Some methods are employed for data cleaning
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    # Return a list of the text when it is divided by the "br" tag - Some methods are employed for data cleaning
    elif row_data.find("br"):
        return[text for text in row_data.stripped_strings]
    # Return a list of the text - Some methods are employed for data cleaning
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

def clear_tags(soup, tags):
    """ This function any tags that is not necessary for our data 
    
        Parameters: 
            soup: the soup object that we are working on
            tags: the single tag or list of tags that we want to eliminate from our object
    """
    for tag in soup.find_all(tags):
        tag.decompose()

def get_info_box(url):
    """ This function allows you to store all the info for a movie into a dictionary  
    
        Parameters: 
            url: The url of the page where the info box is
    """
    # Request access to the url
    r = requests.get(url)
    # Create the soup object with all the HTML inside
    soup = bs(r.content)
    # Find the "class" and store all the HTML inside info_box
    info_box = soup.find(class_="infobox vevent")
    # Find and store all the "tr" (table row) inside the info box in the info_rows variable
    info_rows = info_box.find_all("tr")

    # Remove tags that are not needed
    remove_tags = ['sup', 'span']
    clear_tags(soup, remove_tags)
    
    # Create empty dictionary
    movie_info = {}

    # Iterate through the data and append the content inside the dictionary
    for index, row in enumerate(info_rows): # enumerate allows you to get the row and the index at the same time
        if index == 0:
            movie_info['title'] = row.find('th').get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value
    
    return movie_info



In [5]:
# Request access to the website 
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
# Create a soup object where all the HTML is stored for further processing
soup = bs(r.content)
#Selects the specific tag that I need to iterate on
movies = soup.select(".wikitable.sortable i a")
# Base path for each movie
base_path = "https://en.wikipedia.org"

#Create empty list
movie_info_list = []
for index, movie in enumerate(movies):
    # Checker for the progress of the request
    if index % 10 == 0:
        print(index)
    try:
        # Get the relative path of the movie
        relative_path = movie['href']
        # Combine the base and relative path to get the full path to access the page
        full_path = base_path + relative_path
        title = movie['title']
        # Append inside the list the info box inside the page through the function get_info_box(url)
        movie_info_list.append(get_info_box(full_path))

    except Exception as e:
        # If a movie doesn't have an info box, return the movie and the exception
        print(movie.get_text())
        print(e)


0
10
20
30
40
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
True-Life Adventures
'NoneType' object has no attribute 'find_all'
130
140
The London Connection
'NoneType' object has no attribute 'find'
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
Better Nate Than Never
'NoneType' object has no attribute 'find_all'


In [6]:
# Check that the lenght of the movie_info_list matches the expected outcome
len(movie_info_list)

449

In [7]:
# Print out all the Running time for each movie
[movie.get('Running time', 'N/A') for movie in movie_info_list]

['41 minutes (74 minutes 1966 release)',
 '83 minutes',
 '88 minutes',
 '126 minutes',
 '74 minutes',
 '64 minutes',
 '70 minutes',
 '42 minutes',
 '70 min',
 '71 minutes',
 '75 minutes',
 '94 minutes',
 '73 minutes',
 '75 minutes',
 '82 minutes',
 '68 minutes',
 '74 minutes',
 '96 minutes',
 '75 minutes',
 '84 minutes',
 '77 minutes',
 '92 minutes',
 '69 minutes',
 '81 minutes',
 ['60 minutes (VHS version)', '71 minutes (original)'],
 '127 minutes',
 '92 minutes',
 '76 minutes',
 '75 minutes',
 '73 minutes',
 '85 minutes',
 '81 minutes',
 '70 minutes',
 '90 min.',
 '80 minutes',
 '75 minutes',
 '83 minutes',
 '83 minutes',
 '72 minutes',
 '97 minutes',
 '75 minutes',
 '104 minutes',
 '93 minutes',
 '105 minutes',
 '95 minutes',
 '97 minutes',
 '134 minutes',
 '69 minutes',
 '92 minutes',
 '126 minutes',
 '79 minutes',
 '97 minutes',
 '128 minutes',
 '74 minutes',
 '91 minutes',
 '105 minutes',
 '98 minutes',
 '130 minutes',
 '89 min.',
 '93 minutes',
 '67 minutes',
 '98 minutes',
 '10

In [8]:
# Turn the running time from a string to an integer
def minutes_to_int(running_time):
    if running_time == "N/A":
        return None
    elif isinstance(running_time, list):
        return int(running_time[0].split(" ")[0])
    else:
        return int(running_time.split(" ")[0])

# Loop through the dictionary and append the new running time as an int
for movie in movie_info_list:
    movie['Running time (int)'] = minutes_to_int(movie.get('Running time', 'N/A'))

    

In [9]:
[movie.get('Running time (int)', 'N/A') for movie in movie_info_list]

[41,
 83,
 88,
 126,
 74,
 64,
 70,
 42,
 70,
 71,
 75,
 94,
 73,
 75,
 82,
 68,
 74,
 96,
 75,
 84,
 77,
 92,
 69,
 81,
 60,
 127,
 92,
 76,
 75,
 73,
 85,
 81,
 70,
 90,
 80,
 75,
 83,
 83,
 72,
 97,
 75,
 104,
 93,
 105,
 95,
 97,
 134,
 69,
 92,
 126,
 79,
 97,
 128,
 74,
 91,
 105,
 98,
 130,
 89,
 93,
 67,
 98,
 100,
 118,
 103,
 110,
 80,
 79,
 91,
 91,
 97,
 118,
 139,
 92,
 131,
 87,
 116,
 93,
 110,
 110,
 131,
 101,
 108,
 84,
 78,
 75,
 164,
 106,
 110,
 99,
 113,
 108,
 112,
 93,
 91,
 93,
 100,
 100,
 79,
 96,
 113,
 89,
 118,
 92,
 88,
 92,
 87,
 93,
 93,
 93,
 90,
 83,
 96,
 88,
 89,
 91,
 93,
 92,
 97,
 100,
 100,
 89,
 91,
 112,
 115,
 95,
 91,
 95,
 104,
 74,
 48,
 77,
 104,
 128,
 101,
 94,
 104,
 90,
 100,
 88,
 93,
 98,
 112,
 84,
 98,
 97,
 114,
 96,
 100,
 109,
 83,
 90,
 107,
 96,
 103,
 91,
 95,
 105,
 113,
 80,
 101,
 89,
 74,
 90,
 89,
 110,
 74,
 93,
 84,
 83,
 74,
 77,
 107,
 93,
 88,
 108,
 84,
 121,
 89,
 104,
 90,
 86,
 84,
 108,
 107,
 96,
 98,
 105,
 

In [6]:
# Import regular expression library
import re 

# Create basic regex expressions to parse the budget and box office strings
amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"

word_re = rf"\${number}(-|\sto\s)?({number})?\s({amounts})"
value_re = rf"\${number}"

def get_numerical_value(string):
    value_string = re.search(number, string).group()
    num_value = float(value_string.replace(",", ""))
    return num_value

def word_to_value(word):
    value_dict = {"thousand": 1000, "million":1000000, "billion":1000000000}
    return value_dict[word]

def parse_word_syntax(string):
    value = get_numerical_value(string)
    word = re.search(amounts, string, flags=re.I).group().lower()
    word_value = word_to_value(word)
    return value * word_value

def parse_value_syntax(string):  
    return get_numerical_value(string)

def money_conversion(money):
    """ 
    Given either a string or a list of strings as input, return
    a number (int or float) which is equal to the monetary value

    money_conversion("$12.2 million") ---> 12200000   ## Word syntax
    money_conversion("$790,000) ---> 790000           ## Value syntax

    use test_money_conveersion.py to test your solution

    """
    if money == "N/A":
        return None

    elif isinstance(money, list):
        money = money[0]
    

    word_syntax = re.search(word_re, money, flags=re.I)
    value_syntax = re.search(value_re, money, flags=re.I)

    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    elif value_syntax:
        return parse_value_syntax(value_syntax.group())

    else:
        return None
    

In [7]:
# Apply the money_conversion function and append the new key value pair to the dictionary to have amounts of money as floats
for movie in movie_info_list:
    movie['Budget (float)'] = money_conversion(movie.get('Budget', 'N/A'))
    movie['Box office (float)'] = money_conversion(movie.get('Box office', 'N/A'))

In [78]:
# More cleanup for specific data

for movie in movie_info_list:
    if movie['Budget (float)'] == None:
        pass
    elif movie['Budget (float)'] < 400:
        movie['Budget (float)'] = movie['Budget (float)'] * 1000000
    else:
        pass

for movie in movie_info_list:
    if movie['Box office (float)'] == None:
        pass
    elif movie['Box office (float)'] < 400:
        movie['Box office (float)'] = movie['Box office (float)'] * 1000000
    else:
        pass

for movie in movie_info_list:
    if movie['Budget (float)'] == None:
        pass
    elif movie['Budget (float)'] > 10000000000:
        movie['Budget (float)'] = movie['Budget (float)'] / 1000000

In [None]:
print([movie.get('Box office (float)', "N/A") for movie in movie_info_list])

In [None]:
# Convert Dates into datetimes
print([movie.get('Release date', "N/A") for movie in movie_info_list])

In [58]:
# Converts dates into datetime objects
from datetime import datetime

dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
    return date.split("(")[0].strip()

def date_conversion(date):
    if isinstance(date, list):
        date = date[0]
    
    elif date == 'N/A':
        return None
    
    date_str = clean_date(date)

    fmts = ['%B %d, %Y', '%d %B %Y', '%B %Y', '%Y']

    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass            
    
    return None


In [59]:
# Apply the date_conversion function and append the new key value pair to store the datevalue object
for movie in movie_info_list:
    movie['Release date (datetime)'] = date_conversion(movie.get('Release date', 'N/A'))

In [60]:
# Save the dictionary as a pickle file
import pickle

def save_data_pickle(name, data):
    """ This function allows you to save a dictionary into a pickle file

        Parameters:
            name: The name of the new file
            data: The dictionary to be saved into a pickle file
    """
    with open(name, 'wb') as f:
        pickle.dump(data, f,)


In [11]:
# Load the dictionary as a pickle file
import pickle

def load_data_pickle(name):
    """ This function allows you to load a pickle file

        Parameters:
            name: The path of the pickle file to load
            
    """
    with open(name, 'rb') as f:
        return pickle.load(f)

In [79]:
# Save the data into a pickle file
save_data_pickle('disney_movie_final.pickle', movie_info_list)

In [12]:
# Load the data from a pickle file
movie_info_list = load_data_pickle('disney_movie_final.pickle')

In [13]:
movie_info_list[:5]

[{'title': 'Academy Award Review of',
  'Production company': 'Walt Disney Productions',
  'Release date': ['May 19, 1937'],
  'Running time': '41 minutes (74 minutes 1966 release)',
  'Country': 'United States',
  'Language': 'English',
  'Box office': '$45.472',
  'Running time (int)': 41,
  'Budget (float)': None,
  'Box office (float)': 45472000.0,
  'Release date (datetime)': datetime.datetime(1937, 5, 19, 0, 0)},
 {'title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['David Hand (supervising)',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': ['Snow White', 'by The', 'Brothers Grimm'],
  'Produced by': 'Walt Disney',
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',
   'Harry Stockwell',
   'Roy Atwell',
   'Pinto Colvig',
 

In [82]:
# Create the pandas dataFrame
import pandas as pd

df_pickle = pd.DataFrame(data=pd.read_pickle('disney_movie_final.pickle'))

df_pickle[:10]

Unnamed: 0,title,Production company,Release date,Running time,Country,Language,Box office,Running time (int),Budget (float),Box office (float),...,Screenplay by,Countries,Production companies,Color process,Japanese,Hepburn,Adaptation by,Animation by,Traditional,Simplified
0,Academy Award Review of,Walt Disney Productions,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,41.0,,45472000.0,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,"[December 21, 1937 ( Carthay Circle Theatre , ...",83 minutes,United States,English,$418 million,83.0,1490000.0,418000000.0,...,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,United States,English,$164 million,88.0,2600000.0,164000000.0,...,,,,,,,,,,
3,Fantasia,Walt Disney Productions,"[November 13, 1940]",126 minutes,United States,English,$76.4–$83.3 million,126.0,2280000.0,83300000.0,...,,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,"[June 27, 1941]",74 minutes,United States,English,"$960,000 (worldwide rentals)",74.0,600000.0,960000.0,...,,,,,,,,,,
5,Dumbo,Walt Disney Productions,"[October 23, 1941 (New York City), October 31,...",64 minutes,United States,English,$1.3 million (est. United States/Canada rental...,64.0,950000.0,1300000.0,...,,,,,,,,,,
6,Bambi,Walt Disney Productions,"[August 9, 1942 (World Premiere-London), Augus...",70 minutes,United States,English,$267.4 million,70.0,858000.0,267400000.0,...,,,,,,,,,,
7,Saludos Amigos,Walt Disney Productions,"[August 24, 1942 (World Premiere-Rio de Janeir...",42 minutes,United States,,"$1,135,000 (worldwide rentals)",42.0,,1135000.0,...,,,,,,,,,,
8,Victory Through Air Power,Walt Disney Productions,"[July 17, 1943]",70 min,United States,English,"$799,000",70.0,788000.0,799000.0,...,,,,,,,,,,
9,The Three Caballeros,Walt Disney Productions,"[December 21, 1944 (Mexico City), February 3, ...",71 minutes,United States,,"$3,355,000 (worldwide rentals)",71.0,,3355000.0,...,,,,,,,,,,


In [83]:
df_pickle.describe()

Unnamed: 0,Running time (int),Budget (float),Box office (float)
count,437.0,282.0,362.0
mean,97.720824,63751420.0,166846600.0
std,18.928883,71271420.0,271948500.0
min,40.0,300000.0,55513.0
25%,86.0,11250000.0,11000000.0
50%,96.0,30000000.0,45536000.0
75%,107.0,100000000.0,185475000.0
max,167.0,410600000.0,1657000000.0


In [66]:
def high_low(dataframe, column):
    """" This function receives a dataframe and calculates the highest and lowest value and the associated title"""
    
    high_column = dataframe[column].max()
    low_column = dataframe[column].min()
    
    high_index = dataframe[dataframe[column] == dataframe[column].max()].index.values.astype(int)
    low_index = dataframe[dataframe[column] == dataframe[column].min()].index.values.astype(int)
    try: 
        high_index = int(high_index)
        low_index = int(low_index)
        high_title = dataframe['title'][high_index]
        low_title = dataframe['title'][low_index]
    except:
        high_title = dataframe['title'][high_index]
        low_title = dataframe['title'][low_index]
    
    return high_column, high_title, low_column, low_title

In [84]:
## Calculate max/min budget

high_budget, high_budget_title, low_budget, low_budget_title = high_low(df_pickle, 'Budget (float)')

print(f"The highest movie budget: '{high_budget_title}' - ${high_budget:,}")

print(f"The lowest movie budget: '{low_budget_title}' - ${low_budget:,}")

The highest movie budget: 'Pirates of the Caribbean: On Stranger Tides' - $410,600,000.0
The lowest movie budget: 'The Living Desert' - $300,000.0


In [85]:
## Calulate longest/shortest movie

long_movie, long_movie_title, short_movie, short_movie_title = high_low(df_pickle, 'Running time (int)')

print(f"Longest movie: '{long_movie_title}' - {long_movie:.0f} minutes")

print(f"Shortest movie: '{short_movie_title.str.cat(sep=' ')}' - {short_movie:.0f} minutes")


Longest movie: 'Pirates of the Caribbean: At World's End' - 167 minutes
Shortest movie: 'Sacred Planet Roving Mars' - 40 minutes


In [69]:
## Highest/lowest box office

high_box_office, high_box_office_title, low_box_office, low_box_office_title = high_low(df_pickle, 'Box office (float)')

print(f"Highest box office: '{high_box_office_title}' - ${high_box_office:,}")

print(f"Lowest box office: '{low_box_office_title}' - ${low_box_office:,}")

Highest box office: 'The Lion King' - $1,657,000,000.0
Lowest box office: 'The Boys: The Sherman Brothers' Story' - $55,513.0


In [86]:
## Oldest/newest movie

new_movie, new_movie_title, old_movie, old_movie_title = high_low(df_pickle, 'Release date (datetime)')

print(f"Oldest movie released: '{old_movie_title}' - {old_movie:%d %b %Y}")
print(f"Newest movie released: '{new_movie_title}' - {new_movie:%d %b %Y}")


Oldest movie released: 'Academy Award Review of' - 19 May 1937
Newest movie released: 'Turning Red' - 11 Mar 2022
