#### Disney Dataset Creation (w/ Python BeautifulSoup)
###### Scrape and clean a list of Disney Wikipedia pages to create a dataset for further analysis

#### Get Info Box For One Movie

In [None]:
from bs4 import BeautifulSoup as bs
import requests
# Download the Wikipedia article used as the single-movie example.
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

# Convert to a beautiful soup object
# NOTE(review): no parser is named here, so bs4 will pick one itself — confirm
# that the resulting tree does not depend on which parsers are installed.
soup = bs(r.content)

# Print out the HTML (indented) so the page structure can be inspected by eye
contents = soup.prettify()
print(contents)

In [None]:
# Locate the infobox by its CSS classes and collect every table row inside it.
info_box = soup.find(class_="infobox vevent")
info_rows = info_box.find_all("tr")
# Print each row's HTML to see how headers (th) and values (td) are laid out.
for row in info_rows:
    print(row.prettify())

In [None]:
def get_content_value(row_data):
    """Extract the text held by an infobox cell.

    Returns a list with one string per ``li`` element when the cell contains
    list items; otherwise returns the text of the whole cell as one string.
    Non-breaking spaces are replaced by ordinary spaces in either case.
    """
    def as_text(node):
        return node.get_text(" ", strip=True).replace("\xa0", " ")

    if not row_data.find("li"):
        return as_text(row_data)
    return [as_text(item) for item in row_data.find_all("li")]

# Build a dict for the example movie: row 0 supplies the title, row 1 is
# skipped, and every later row contributes one "header -> value" entry.
movie_info = {}
for index, row in enumerate(info_rows):
    if index == 1:
        continue
    header_text = row.find("th").get_text(" ", strip=True)
    if index == 0:
        movie_info['title'] = header_text
        continue
    movie_info[header_text] = get_content_value(row.find("td"))

movie_info

# **Get info box for all movies**

In [None]:
# Download the page that lists the Walt Disney Pictures films.
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

# Convert to a beautiful soup object
# NOTE(review): no parser is named here, so bs4 will pick one itself — confirm
# that the resulting tree does not depend on which parsers are installed.
soup = bs(r.content)

# Print out the HTML (indented) so the table structure can be inspected by eye
contents = soup.prettify()
print(contents)

In [None]:
# Select every <i> element nested inside an element that carries both the
# "wikitable" and "sortable" classes.
movies = soup.select(".wikitable.sortable i")
# Preview the first ten matches.
movies[0:10]

In [6]:
def get_content_value(row_data):
    """Extract the text held by an infobox cell.

    Args:
        row_data: The parsed ``td`` element of an infobox row.

    Returns:
        * a list with one string per ``li`` element when the cell contains
          list items;
        * otherwise a list with one string per text fragment when the cell
          contains ``br`` line breaks;
        * otherwise the text of the whole cell as a single string.

        Non-breaking spaces are replaced by ordinary spaces in every branch
        (the ``br`` branch previously returned them untouched, so the same
        value could differ depending on how the cell was marked up).
    """
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text.replace("\xa0", " ") for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

def clean_tags(soup):
    """Remove every ``sup`` and ``span`` element from *soup*, in place."""
    unwanted = soup.find_all(["sup", "span"])
    for element in unwanted:
        element.decompose()

def get_info_box(url):
    """Download a film page and return its infobox as a dict.

    Args:
        url: Address of the page to download.

    Returns:
        A dict whose ``'title'`` entry is the text of the first row's header,
        followed by one entry per later row that has both a header (``th``)
        and a value (``td``) cell. Values come from ``get_content_value``.

    Raises:
        AttributeError: presumably when the page has no element with the
            ``infobox vevent`` classes, or the first row has no ``th`` —
            the lookups are not guarded.
    """
    r = requests.get(url)
    soup = bs(r.content)
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")

    # Strip <sup>/<span> elements before any text is read; the rows collected
    # above belong to the same tree, so they are cleaned as well.
    clean_tags(soup)

    movie_info = {}
    for index, row in enumerate(info_rows):
        header = row.find("th")
        if index == 0:
            movie_info['title'] = header.get_text(" ", strip=True)
            continue

        cell = row.find("td")
        # Rows lacking a header or a value cell carry no key/value pair.
        # Skipping a header-only row avoids handing None to
        # get_content_value, which would raise and lose the whole movie.
        if header is None or cell is None:
            continue

        movie_info[header.get_text(" ", strip=True)] = get_content_value(cell)

    return movie_info

In [None]:
# Fetch the film list and collect every link nested in italic text inside
# the sortable wikitables.
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")

# No trailing slash here: the hrefs are appended as-is and presumably already
# start with "/" (e.g. "/wiki/..."), so a trailing slash produced
# "https://en.wikipedia.org//wiki/..." URLs.
base_path = "https://en.wikipedia.org"

movie_info_list = []
for index, movie in enumerate(movies):
    # Lightweight progress indicator.
    if index % 10 == 0:
        print(index)
    try:
        relative_path = movie['href']
        full_path = base_path + relative_path
        # NOTE(review): `title` is never used; the lookup presumably only
        # serves to reject anchors without a title attribute (they raise and
        # are reported below) — confirm before removing it.
        title = movie['title']

        movie_info_list.append(get_info_box(full_path))

    except Exception as e:
        # Best effort: report the link that failed and carry on with the rest.
        print(movie.get_text())
        print(e)

### **Save/Reload Movie Data**

In [8]:
import json

def save_data(title, data):
    """Write *data* as JSON to the file named *title*.

    The file is UTF-8 encoded, indented by two spaces, and non-ASCII
    characters are written as-is rather than escaped.
    """
    with open(title, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=2, ensure_ascii=False)

save_data("disney_data_cleaned.json", movie_info_list)

### **Clean our data!**

In [9]:

# import json

# def load_data(title):
#     with open(title, encoding="utf-8") as f:
#         return json.load(f)

In [10]:
# movie_info_list = load_data("disney_data_cleaned.json")

In [11]:
# with open('disney_data_cleaned.json') as f:
#     data = json.load(f)
# data[-1]

#### Clean up references like [1], [2] etc..
- These references live in `sup` and `span` tags in the HTML, so we simply remove those tags, as done in the `clean_tags` function

#### Split up the long strings
- These values are separated by `br` tags in the HTML, so we collect the separate pieces into a list (see the `br` branch of `get_content_value`)

#### Convert running time into an integer

In [12]:
print([movie.get('Running time', 'N/A') for movie in movie_info_list])

['83 minutes', '88 minutes', '126 minutes', '74 minutes', '64 minutes', '70 minutes', '42 minutes', '65 min', '71 minutes', '75 minutes', '94 minutes', '73 minutes', '75 minutes', '82 minutes', '68 minutes', '74 minutes', '96 minutes', '75 minutes', '84 minutes', '77 minutes', '92 minutes', '69 minutes', '81 minutes', ['60 minutes (VHS and Wild Discovery version)', '71 minutes (original)'], '127 minutes', '93 minutes', '76 minutes', '75 minutes', '73 minutes', '85 minutes', '81 minutes', '70 minutes', '90 minutes', '80 minutes', '75 minutes', '84 minutes', '83 minutes', '72 minutes', '97 minutes', '75 minutes', '104 minutes', '93 minutes', '105 minutes', '95 minutes', '97 minutes', '134 minutes', '69 minutes', '92 minutes', '126 minutes', '79 minutes', '97 minutes', '128 minutes', '73 minutes', '91 minutes', '105 minutes', '98 minutes', '130 minutes', '89 minutes', '93 minutes', '67 minutes', '98 minutes', '100 minutes', '118 minutes', '103 minutes', '110 minutes', '80 min.', '79 minut

In [13]:
# "85 minutes"
def minutes_to_integer(running_time):
    """Convert a scraped running time such as ``"85 minutes"`` to an int.

    Args:
        running_time: A string (e.g. ``"85 minutes"``, ``"80 min."``), a list
            of such strings (only the first entry is used), or ``"N/A"``.

    Returns:
        The first integer found in the text, or ``None`` when the value is
        ``"N/A"``, missing, an empty list, or contains no digits.
    """
    # Local import: this cell runs before the notebook's own `import re`.
    import re

    if isinstance(running_time, list):
        if not running_time:
            return None
        # Several versions listed: keep the first one.
        running_time = running_time[0]

    if not isinstance(running_time, str) or running_time == "N/A":
        return None

    # Search for the number instead of trusting the first space-separated
    # token, so prefixes ("approx. 90 minutes") and thousands separators
    # ("1,508 minutes") no longer raise ValueError.
    match = re.search(r"\d+(?:,\d{3})*", running_time)
    if match is None:
        return None
    return int(match.group().replace(",", ""))
#print(minutes_to_integer(['88 minutes', '90 minutes']))
# Store the numeric running time next to the raw scraped value.
for film in movie_info_list:
    raw_running_time = film.get('Running time', "N/A")
    film['Running time (int)'] = minutes_to_integer(raw_running_time)

In [14]:
movie_info_list[-10]
#print([movie.get('Running time', 'N/A') for movie in data])

{'title': 'National Treasure: Book of Secrets',
 'Directed by': 'Jon Turteltaub',
 'Screenplay by': ['Cormac Wibberley', 'Marianne Wibberley'],
 'Story by': ['Gregory Poirier',
  'Cormac Wibberley',
  'Marianne Wibberley',
  'Ted Elliott',
  'Terry Rossio'],
 'Based on': ['Characters', 'by', 'Jim Kouf', 'Oren Aviv', 'Charles Segars'],
 'Produced by': ['Jerry Bruckheimer', 'Jon Turteltaub'],
 'Starring': ['Nicolas Cage',
  'Jon Voight',
  'Harvey Keitel',
  'Ed Harris',
  'Diane Kruger',
  'Justin Bartha',
  'Bruce Greenwood',
  'Helen Mirren'],
 'Cinematography': ['John Schwartzman', 'Amir Mokri'],
 'Edited by': ['William Goldenberg', 'David Rennie'],
 'Music by': 'Trevor Rabin',
 'Production companies': ['Walt Disney Pictures',
  'Jerry Bruckheimer Films',
  'Junction Entertainment',
  'Saturn Films'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release date': ['December 21, 2007'],
 'Running time': '124 minutes',
 'Country': 'United States',
 'Language': 'English',
 '

#### Convert Budget & Box office to numbers

In [15]:
print([movie.get('Budget', 'N/A') for movie in movie_info_list])

['$1.5 million', '$2.6 million', '$2.28 million', '$600,000', '$950,000', '$858,000', 'N/A', '$788,000', 'N/A', '$1.35 million', '$2.125 million', 'N/A', '$1.5 million', '$1.5 million', 'N/A', '$2.2 million', '$1.8 million', '$3 million', 'N/A', '$4 million', '$2 million', '$300,000', '$1.8 million', 'N/A', '$5 million', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$700,000', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$6 million', 'under $1 million or $1,250,000', 'N/A', '$2 million', 'N/A', 'N/A', '$2.5 million', 'N/A', 'N/A', '$4 million', '$3.6 million', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', '$4.4–6 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', '$6.3 m

In [16]:
import re

# Magnitude words that may follow a dollar figure.
amounts = r"thousand|million|billion"
# A number: digits, optional comma-separated thousands groups, and an
# optional decimal part (e.g. "790,000", "12.2"). The decimal part requires
# digits after the dot, so malformed text such as "1..5" is no longer
# captured whole and passed to float().
number = r"\d+(,\d{3})*(\.\d+)?"

# "$12.2 million", or a range such as "$4.4–6 million" / "$4 to 6 million".
word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"
# "$790,000"
value_re = rf"\${number}"

# Multiplier for each magnitude word; built once rather than on every call.
_WORD_MULTIPLIERS = {"thousand": 1000, "million": 1000000, "billion": 1000000000}


def word_to_value(word):
    """Return the multiplier for a magnitude word (case-insensitive).

    Raises:
        KeyError: if *word* is not thousand, million or billion.
    """
    return _WORD_MULTIPLIERS[word.lower()]


def parse_word_syntax(string):
    """Parse text such as ``"$12.2 million"`` into a float.

    The first number in *string* is scaled by the magnitude word; for a range
    such as ``"$4.4–6 million"`` that is the lower bound.
    """
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    word = re.search(amounts, string, flags=re.I).group()
    return value * word_to_value(word)


def parse_value_syntax(string):
    """Parse text such as ``"$790,000"`` into a float."""
    value_string = re.search(number, string).group()
    return float(value_string.replace(",", ""))


def money_conversion(money):
    """Convert a scraped dollar amount to a float.

    Examples:
        money_conversion("$12.2 million") --> 12200000.0  ## Word syntax
        money_conversion("$790,000")      --> 790000.0    ## Value syntax

    Args:
        money: A string, a list of strings (only the first entry is used),
            or ``"N/A"``.

    Returns:
        The amount as a float, or ``None`` when the value is ``"N/A"``,
        missing, an empty list, or contains no dollar amount.
    """
    if isinstance(money, list):
        if not money:
            return None
        money = money[0]

    if not isinstance(money, str) or money == "N/A":
        return None

    # Word syntax is tried first: "$1 million" also matches the plain value
    # pattern, which would otherwise yield 1.0.
    word_syntax = re.search(word_re, money, flags=re.I)
    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    value_syntax = re.search(value_re, money)
    if value_syntax:
        return parse_value_syntax(value_syntax.group())

    return None

In [17]:
# Store numeric versions of the budget and box-office figures on each movie.
for film in movie_info_list:
    for source_key, target_key in (('Budget', 'Budget (float)'),
                                   ('Box office', 'Box office (float)')):
        film[target_key] = money_conversion(film.get(source_key, "N/A"))

In [19]:
print([movie.get('Budget (float)') for movie in movie_info_list])

[1500000.0, 2600000.0, 2280000.0, 600000.0, 950000.0, 858000.0, None, 788000.0, None, 1350000.0, 2125000.0, None, 1500000.0, 1500000.0, None, 2200000.0, 1800000.0, 3000000.0, None, 4000000.0, 2000000.0, 300000.0, 1800000.0, None, 5000000.0, None, 4000000.0, None, None, None, None, None, None, 700000.0, None, None, None, None, None, 6000000.0, 1000000.0, None, 2000000.0, None, None, 2500000.0, None, None, 4000000.0, 3600000.0, None, None, None, None, 3000000.0, None, 3000000.0, None, None, None, None, None, None, None, None, None, 3000000.0, None, None, None, None, 4400000.0, None, None, None, None, None, None, None, None, None, None, None, 4000000.0, None, 5000000.0, None, None, None, None, 5000000.0, None, None, None, None, None, None, 4000000.0, None, None, None, 6300000.0, None, None, None, None, None, None, None, None, 5000000.0, None, None, None, None, 8000000.0, None, None, None, None, None, None, 1000000.0, None, None, None, None, 5000000.0, None, None, 7500000.0, None, 10000000

#### Convert Dates into datetimes

In [20]:
print([movie.get('Release date', 'N/A') for movie in movie_info_list])


['N/A', 'N/A', ['November 13, 1940'], ['June 27, 1941'], 'N/A', 'N/A', 'N/A', ['July 17, 1943'], 'N/A', 'N/A', 'N/A', ['September 27, 1947'], 'May 27, 1948', 'N/A', ['October 5, 1949'], 'N/A', 'N/A', 'N/A', 'N/A', ['February 5, 1953'], ['July 23, 1953 (United States)'], ['November 10, 1953'], 'N/A', ['August 17, 1954'], ['December 23, 1954'], 'May 25, 1955', ['June 22, 1955'], ['September 14, 1955'], 'December 22, 1955', 'June 8, 1956', ['July 18, 1956'], ['September 4, 1956'], ['December 20, 1956'], 'June 19, 1957', 'August 28, 1957', ['December 25, 1957'], ['July 8, 1958'], ['August 12, 1958'], ['December 25, 1958'], ['January 29, 1959'], ['March 19, 1959'], 'N/A', ['November 10, 1959'], 'January 21, 1960 ( Sarasota, FL )', ['February 24, 1960'], 'May 19, 1960', 'N/A', ['November 1, 1960'], ['December 21, 1960'], ['January 25, 1961'], 'March 16, 1961', ['June 21, 1961'], ['July 12, 1961'], ['July 17, 1961'], ['December 14, 1961'], 'April 5, 1962', ['May 17, 1962'], ['June 6, 1962'], 

In [21]:
# June 28, 1950
from datetime import datetime

dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
    """Return *date* without any parenthesised suffix or surrounding spaces.

    Example: ``"July 23, 1953 (United States)"`` -> ``"July 23, 1953"``.
    """
    return date.split("(")[0].strip()


def date_conversion(date):
    """Convert a scraped release date to a ``datetime``.

    Args:
        date: A string, a list of strings (only the first entry is used),
            or ``"N/A"``.

    Returns:
        A ``datetime`` when the cleaned text matches ``"June 28, 1950"`` or
        ``"28 June 1950"`` style; otherwise ``None`` (including for
        ``"N/A"``, missing values and empty lists).
    """
    if isinstance(date, list):
        if not date:
            return None
        date = date[0]

    if not isinstance(date, str) or date == "N/A":
        return None

    date_str = clean_date(date)

    fmts = ["%B %d, %Y", "%d %B %Y"]
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            # Format mismatch: try the next one. Catching only ValueError
            # lets unrelated errors surface instead of being swallowed.
            continue
    return None

In [None]:
# Store the parsed release date next to the raw scraped value.
for film in movie_info_list:
    raw_release = film.get('Release date', 'N/A')
    film['Release date (datetime)'] = date_conversion(raw_release)
movie_info_list[50]

In [23]:
import pickle

def save_data_pickle(name, data):
    """Pickle *data* into the file called *name*, replacing any existing file."""
    with open(name, mode='wb') as out_file:
        pickle.dump(data, out_file)

In [24]:
import pickle

def load_data_pickle(name):
    """Return the object unpickled from the file called *name*.

    NOTE: unpickling can execute arbitrary code, so only load files this
    notebook wrote itself.
    """
    with open(name, mode='rb') as in_file:
        restored = pickle.load(in_file)
    return restored

In [25]:
save_data_pickle("disney_movie_data_cleaned_more.pickle", movie_info_list)


In [26]:
a = load_data_pickle("disney_movie_data_cleaned_more.pickle")
a == movie_info_list

True

#### Attach IMDB/Rotten Tomatoes/Metascore scores

In [None]:
import requests
import urllib
import os

def get_omdb_info(title):
    """Look up *title* on the OMDb API and return the decoded JSON reply.

    Args:
        title: Film title, sent as the ``t`` query parameter.

    Returns:
        Whatever ``response.json()`` yields for the reply.

    The API key is taken from the ``OMDB_API_KEY`` environment variable when
    set, so a personal key need not be committed; the key previously embedded
    in the notebook remains the fallback so existing runs keep working.
    """
    api_key = os.environ.get("OMDB_API_KEY", '97288290')
    # Let requests encode the query string: the old code called
    # urllib.parse.urlencode after a bare `import urllib`, which only works
    # if some other module has already imported urllib.parse.
    # HTTPS keeps the API key out of clear text; the timeout stops one
    # stalled request from hanging the whole scoring loop.
    response = requests.get(
        "https://www.omdbapi.com/",
        params={"apikey": api_key, "t": title},
        timeout=10,
    )
    return response.json()

def get_rotten_tomato_score(omdb_info):
    """Return the 'Rotten Tomatoes' rating value from an OMDb reply.

    Scans the reply's ``'Ratings'`` list (treated as empty when absent) and
    returns the ``'Value'`` of the first entry whose ``'Source'`` is
    ``'Rotten Tomatoes'``, or ``None`` if there is no such entry.
    """
    matching_values = (
        entry['Value']
        for entry in omdb_info.get('Ratings', [])
        if entry['Source'] == 'Rotten Tomatoes'
    )
    return next(matching_values, None)

get_omdb_info("into the woods")

In [29]:
# Query OMDb once per movie and copy the three scores onto the movie dict.
for film in movie_info_list:
    omdb_record = get_omdb_info(film['title'])
    film['imdb'] = omdb_record.get('imdbRating')
    film['metascore'] = omdb_record.get('Metascore')
    film['rotten_tomatoes'] = get_rotten_tomato_score(omdb_record)

In [30]:
movie_info_list[-5]

{'title': 'The Parent Trap',
 'Based on': ['Lisa and Lottie', 'by', 'Erich Kästner'],
 'Distributed by': 'The Walt Disney Company',
 'Release date': '1961– present',
 'Running time': '508 minutes (5 films)',
 'Country': 'United States',
 'Language': 'English',
 'Budget': ['$15,000,000', '(1 film)'],
 'Box office': ['<$106,759,044', '(Total of 2 films)'],
 'Running time (int)': 508,
 'Budget (float)': 15000000.0,
 'Box office (float)': 106759044.0,
 'Release date (datetime)': None,
 'imdb': '6.6',
 'metascore': '64',
 'rotten_tomatoes': '87%'}

In [31]:
save_data_pickle('disney_movie_data_final.pickle', movie_info_list)

#### Save data as JSON & CSV

In [32]:
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [33]:
# JSON cannot hold datetime objects, so turn each parsed release date back
# into text (or None when there is no date) before saving.
for record in movie_info_copy:
    release = record['Release date (datetime)']
    record['Release date (datetime)'] = release.strftime("%B %d, %Y") if release else None

save_data("disney_data_final.json", movie_info_copy)

In [34]:
import pandas as pd

df = pd.DataFrame(movie_info_list)
df.head()


Unnamed: 0,title,Directed by,Story by,Based on,Produced by,Starring,Music by,Production company,Distributed by,Release dates,...,Traditional Chinese,Simplified Chinese,Original title,Layouts by,Music,Lyrics,Book,Basis,Productions,Awards
0,Snow White and the Seven Dwarfs,"[David Hand, Perce Pearce, William Cottrell, L...","[Ted Sears, Richard Creedon, Otto Englander, D...","[Snow White, by the, Brothers Grimm]",Walt Disney,"[Adriana Caselotti, Roy Atwell, Pinto Colvig, ...","[Frank Churchill, Leigh Harline, Paul Smith]",Walt Disney Productions,RKO Radio Pictures,"[December 21, 1937 ( Carthay Circle Theatre ),...",...,,,,,,,,,,
1,Pinocchio,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...","[Ted Sears, Otto Englander, Webb Smith, Willia...","[The Adventures of Pinocchio, by, Carlo Collodi]",Walt Disney,"[Cliff Edwards, Dickie Jones, Christian Rub, W...","[Leigh Harline, Paul J. Smith]",Walt Disney Productions,"[RKO Radio Pictures, Buena Vista Pictures Dist...","[February 7, 1940 ( Center Theatre ), February...",...,,,,,,,,,,
2,Fantasia,"[Samuel Armstrong, James Algar, Bill Roberts, ...","[Joe Grant, Dick Huemer]",,"[Walt Disney, Ben Sharpsteen]","[Leopold Stokowski, Deems Taylor]",See program,Walt Disney Productions,RKO Radio Pictures,,...,,,,,,,,,,
3,The Reluctant Dragon,"[Alfred Werker, (live action), Hamilton Luske,...",,,Walt Disney,"[Robert Benchley, Frances Gifford, Buddy Peppe...","[Frank Churchill, Larry Morey]",Walt Disney Productions,RKO Radio Pictures,,...,,,,,,,,,,
4,Dumbo,"[Ben Sharpsteen, Norman Ferguson, Wilfred Jack...","[Joe Grant, Dick Huemer]","[Dumbo, the Flying Elephant, by, Helen Aberson...",Walt Disney,"[Edward Brophy, Verna Felton, Cliff Edwards, H...","[Frank Churchill, Oliver Wallace]",Walt Disney Productions,RKO Radio Pictures,"[October 23, 1941 (New York City), October 31,...",...,,,,,,,,,,


In [35]:
# Write the full table to CSV, then show the 20 longest films.
df.to_csv("disney_movie_data_final.csv")
running_times = df.sort_values(by='Running time (int)', ascending=False)
running_times.head(20)

Unnamed: 0,title,Directed by,Story by,Based on,Produced by,Starring,Music by,Production company,Distributed by,Release dates,...,Traditional Chinese,Simplified Chinese,Original title,Layouts by,Music,Lyrics,Book,Basis,Productions,Awards
537,The Parent Trap,,,"[Lisa and Lottie, by, Erich Kästner]",,,,,The Walt Disney Company,,...,,,,,,,,,,
539,Tinker Bell,"[Bradley Raymond (1 & 3), Klay Hall (2), Peggy...",,,,"[Mae Whitman, Lucy Liu, Raven-Symoné, ( More )]",Joel McNeely,DisneyToon Studios,"[Walt Disney Studios, Home Entertainment]","[1, : October 28, 2008, 2, :, 3, :, 4, :, 5, :...",...,,,,,,,,,,
339,Pirates of the Caribbean: At World's End,Gore Verbinski,,[Characters by Ted Elliott Terry Rossio Stuart...,Jerry Bruckheimer,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",Hans Zimmer,,Buena Vista Pictures Distribution,"[May 19, 2007 ( Disneyland Resort ), May 25, 2...",...,,,,,,,,,,
85,The Happiest Millionaire,Norman Tokar,A. J. Carothers,"[My Philadelphia Father, by Cordelia Drexel Bi...","[Walt Disney, Bill Anderson]","[Fred MacMurray, Tommy Steele, Greer Garson, G...",Jack Elliott,Walt Disney Productions,Buena Vista Distribution,"[June 23, 1967, November 30, 1967]",...,,,,,,,,,,
450,Jagga Jasoos,Anurag Basu,Anurag Basu,,"[Siddharth Roy Kapur, Anurag Basu, Ranbir Kapoor]","[Ranbir Kapoor, Katrina Kaif, Saswata Chatterj...",Pritam,,UTV Motion Pictures,,...,,,,,,,,,,
443,Dangal,Nitesh Tiwari,"[Curation:, Nitesh Tiwari, Concept:, Divya V. ...",,"[Aamir Khan, Kiran Rao, Siddharth Roy Kapur]","[Aamir Khan, Sakshi Tanwar, Fatima Sana Shaikh...",Pritam,,UTV Motion Pictures,"[21 December 2016 (United States), 23 December...",...,,,,,,,,,,
475,Hamilton,Thomas Kail,,"[Alexander Hamilton, by, Ron Chernow]","[Thomas Kail, Lin-Manuel Miranda, Jeffrey Seller]","[Daveed Diggs, Renée Elise Goldsberry, Jonatha...",Lin-Manuel Miranda,,Walt Disney Studios Motion Pictures,,...,,,,,,,,,,
431,ABCD 2,Remo D'Souza,Remo D'Souza,,Siddharth Roy Kapur,"[Prabhu Deva, Varun Dhawan, Shraddha Kapoor, S...",Sachin–Jigar,Walt Disney Pictures,UTV Motion Pictures,,...,,,,,,,,,,
513,Indiana Jones and the Dial of Destiny,James Mangold,,"[George Lucas, Philip Kaufman]","[Kathleen Kennedy, Frank Marshall, Simon Emanuel]","[Harrison Ford, Phoebe Waller-Bridge, Antonio ...",John Williams,,"[Walt Disney Studios, Motion Pictures]","[May 18, 2023 ( Cannes ), June 30, 2023 (Unite...",...,,,,,,,,,,
330,Pirates of the Caribbean: Dead Man's Chest,Gore Verbinski,,[Characters by Ted Elliott Terry Rossio Stuart...,Jerry Bruckheimer,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",Hans Zimmer,,Buena Vista Pictures Distribution,"[June 24, 2006 ( Disneyland Resort ), July 7, ...",...,,,,,,,,,,
