## Scrape Disney Movie Data From Wikipedia

Import Necessary Libraries

In [6]:
from bs4 import BeautifulSoup
import requests

Load the webpage

### Get info box for all movies 

In [7]:
##Clean up references tags [1], [2]
def clean_tags(soup):
    """Remove every ``<sup>`` and ``<span>`` element from ``soup`` in place.

    The notebook uses this to drop reference markers such as ``[1]`` before
    text is extracted. Nothing is returned; the parsed tree itself is modified.
    """
    unwanted = soup.find_all(["sup", "span"])
    for node in unwanted:
        node.decompose()


def get_content_value(row_data):
    """Extract the value of one infobox cell as a string or a list of strings.

    Args:
        row_data: the parsed ``<td>`` element of an infobox row.

    Returns:
        * a list with one string per ``<li>`` when the cell contains a list;
        * a list with one string per text fragment when the cell uses ``<br>``
          line breaks;
        * otherwise the cell's whole text as a single string.

        Non-breaking spaces are replaced with ordinary spaces in every case.
    """
    if row_data.find("li"):
        return [
            li.get_text(" ", strip=True).replace("\xa0", " ")
            for li in row_data.find_all("li")
        ]
    if row_data.find("br"):
        # Normalise non-breaking spaces here as well so that all three
        # branches return text in the same form.
        return [text.replace("\xa0", " ") for text in row_data.stripped_strings]
    return row_data.get_text(" ", strip=True).replace("\xa0", " ")

def get_info_box(url):
    """Download a Wikipedia film page and return its infobox as a dictionary.

    Args:
        url: absolute URL of the film's Wikipedia page.

    Returns:
        A dict with a ``"title"`` entry taken from the first infobox row, plus
        one entry per later row that has both a header (``<th>``) and a value
        cell (``<td>``). Values are whatever ``get_content_value`` returns for
        the cell (a string or a list of strings).

    Raises:
        requests.RequestException: the page could not be fetched, or the
            server answered with an error status.
        ValueError: the page has no ``infobox vevent`` table, or its first row
            has no header to use as the title.
    """
    # A timeout keeps one stalled connection from hanging the whole scrape.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "html.parser")

    info_box = soup.find(attrs={"class": "infobox vevent"})
    if info_box is None:
        raise ValueError(f"no film infobox found at {url}")
    info_rows = info_box.find_all("tr")

    # Tags are removed from the parsed tree in place, so the rows collected
    # above are cleaned as well.
    clean_tags(soup)

    movie_info = {}
    for index, row in enumerate(info_rows):
        header = row.find("th")

        if index == 0:
            if header is None:
                raise ValueError(f"infobox at {url} has no title row")
            movie_info["title"] = header.get_text(" ", strip=True)
            continue

        if header is None:
            continue

        cell = row.find("td")
        if cell is None:
            # A header without a value cell has nothing to record.
            continue

        content_key = header.get_text(" ", strip=True)
        movie_info[content_key] = get_content_value(cell)

    return movie_info

In [None]:
# Fetch the list of Disney films and collect every italicised link inside the
# sortable wiki tables; each link is treated as one movie page to scrape.
main_page = requests.get(
    "https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films",
    timeout=30,
)
soup_main = BeautifulSoup(main_page.text, "html.parser")
movies = soup_main.select(".wikitable.sortable i a")

base_path = "https://en.wikipedia.org/"

movie_info_list = []
for index, movie in enumerate(movies):
    # Lightweight progress indicator: print every tenth index.
    if index % 10 == 0:
        print(index)
    try:
        relative_path = movie["href"]
        # Join with exactly one slash; the hrefs are presumably site-relative
        # ("/wiki/..."), which previously produced "...org//wiki/...".
        full_path = base_path.rstrip("/") + "/" + relative_path.lstrip("/")

        movie_info_list.append(get_info_box(full_path))

    except Exception as e:
        # Best-effort scrape: report the failing link and carry on.
        print(movie.get_text())
        print(e)



In [None]:
len(movie_info_list)

548

In [23]:
import json
def save_data(title, data):
    """Write ``data`` to the file ``title`` as UTF-8 JSON.

    The output is indented by two spaces and non-ASCII characters are written
    as-is rather than escaped.
    """
    dump_options = {"ensure_ascii": False, "indent": 2}
    with open(title, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, **dump_options)

def load_data(title):
    """Read the UTF-8 JSON file ``title`` and return the decoded object."""
    with open(title, mode="r", encoding="utf-8") as in_file:
        return json.loads(in_file.read())

In [None]:
# Checkpoint the scraped infobox dictionaries as JSON.
# NOTE(review): in notebook order this runs before any cleaning cell, so despite
# the file name it presumably holds the raw scraped values — confirm.
save_data("disney_data_cleaned.json", movie_info_list)

### Clean data

#### Subtasks
    - ~~Clean up references [1]~~
    - ~~Convert running time into an integer~~
    - Convert dates into datetime object
    - ~~Split up the long strings~~
    - Convert Budget & Box Office to numbers

In [9]:
# Reload the movie list from the JSON checkpoint written by save_data.
movie_info_list = load_data("disney_data_cleaned.json")

In [10]:
## running time to int
def minutes_to_integer(running_time):
    """Convert a scraped running time such as ``"91 minutes"`` to an int.

    Args:
        running_time: a string, a list of strings (only the first entry is
            used), or the sentinel ``"N/A"``.

    Returns:
        The first whole number found in the text, or ``None`` when the value
        is ``"N/A"``, empty, or contains no digits.
    """
    # Local import: this cell sits above the notebook's own ``import re``.
    import re

    if isinstance(running_time, list):
        # Multi-valued infobox cells: only the first entry is used.
        running_time = running_time[0] if running_time else None

    if not running_time or running_time == "N/A":
        return None

    # Take the first run of digits so a stray leading word ("approx. 90
    # minutes") no longer aborts the conversion.
    match = re.search(r"\d+", str(running_time))
    return int(match.group()) if match else None
    
# Add an integer "Running_time_int" field to every movie. A movie without a
# "Running time" entry is passed as "N/A", which minutes_to_integer maps to None.
for movie in movie_info_list:
    movie["Running_time_int"] = minutes_to_integer(movie.get("Running time", "N/A"))

In [11]:
movie_info_list[-10]

{'title': 'The Hunchback of Notre Dame',
 'Directed by': ['Gary Trousdale', 'Kirk Wise'],
 'Screenplay by': ['Tab Murphy',
  'Irene Mecchi',
  'Bob Tzudiker',
  'Noni White',
  'Jonathan Roberts'],
 'Story by': 'Tab Murphy',
 'Based on': ['The Hunchback of Notre-Dame', 'by', 'Victor Hugo'],
 'Produced by': 'Don Hahn',
 'Starring': ['Tom Hulce',
  'Demi Moore',
  'Tony Jay',
  'Kevin Kline',
  'Paul Kandel',
  'Jason Alexander',
  'Charles Kimbrough',
  'Mary Wickes',
  'David Ogden Stiers'],
 'Edited by': 'Ellen Keneshea',
 'Music by': 'Alan Menken',
 'Production company': 'Walt Disney Feature Animation',
 'Distributed by': 'Buena Vista Pictures Distribution',
 'Release dates': ['June 19, 1996 ( Louisiana Superdome )',
  'June 21, 1996 (United States)'],
 'Running time': '91 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$70 million',
 'Box office': '$325.3 million',
 'Running_time_int': 91}

In [12]:
##Convert budget to numeric
import re

# Scale words that may follow a dollar figure, e.g. "$70 million".
amounts = r"thousand|million|billion"
# One figure: digits, optional ",ddd" thousands groups, optional decimal part
# ("70", "1,234,567", "325.3").
number = r"\d+(?:,\d{3})*(?:\.\d+)?"

# "$<number>[<separator><number>] <scale word>", e.g. "$70 million" or
# "$10–15 million". The separator may be a hyphen, an en dash, an em dash, or
# the word "to" surrounded by whitespace.
word_re = rf"\${number}(?:-|–|—|\sto\s)?(?:{number})?\s(?:{amounts})"
# A bare dollar figure with no scale word, e.g. "$1,234,567".
value_re = rf"\${number}"

def word_to_value(word):
    """Return the multiplier for a scale word.

    Accepts exactly ``"thousand"``, ``"million"`` or ``"billion"``; any other
    key raises ``KeyError``.
    """
    multipliers = dict(thousand=10 ** 3, million=10 ** 6, billion=10 ** 9)
    return multipliers[word]

def parse_word_syntax(string):
    """Convert text such as ``"$70 million"`` to a float (``70000000.0``).

    The first figure in ``string`` is multiplied by the value of the first
    scale word found (matched case-insensitively). Thousands separators are
    removed before conversion.
    """
    figure = float(re.search(number, string).group().replace(",", ""))
    scale_word = re.search(amounts, string, flags=re.I).group()
    return figure * word_to_value(scale_word.lower())

def parse_value_syntax(string):
    """Convert text such as ``"$1,234,567"`` to a float (``1234567.0``).

    Only the first figure in ``string`` is used; thousands separators are
    removed before conversion.
    """
    first_figure = re.search(number, string).group()
    return float(first_figure.replace(",", ""))

def money_conversion(money):
    """Convert a scraped money value to a float number of dollars.

    Args:
        money: a string such as ``"$70 million"`` or ``"$1,234,567"``, a list
            of such strings (only the first entry is converted), or the
            sentinel ``"N/A"``.

    Returns:
        The amount as a float, or ``None`` when the value is ``"N/A"``, an
        empty list, not a string, or matches neither supported syntax.
    """
    if isinstance(money, list):
        # Multi-valued infobox cells: only the first entry is converted.
        money = money[0] if money else None

    if not isinstance(money, str) or money == "N/A":
        return None

    # Try the "figure + scale word" form first; a bare "$<number>" would also
    # match its leading part and lose the multiplier.
    word_syntax = re.search(word_re, money, flags=re.I)
    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    value_syntax = re.search(value_re, money)
    if value_syntax:
        return parse_value_syntax(value_syntax.group())

    return None

In [15]:
movie_info_list[-10].get("Budget", "N/A")

'$70 million'

In [16]:
# Add numeric versions of the "Budget" and "Box office" fields. A missing field
# is passed as "N/A", which money_conversion maps to None.
for movie in movie_info_list:
    movie["Budget (float)"] = money_conversion(movie.get("Budget", "N/A"))
    movie["Box_office (float)"] = money_conversion(movie.get("Box office", "N/A"))

In [17]:
movie_info_list[-10].get("Budget (float)", "N/A")

70000000.0

In [32]:
# Convert dates into datetimes

from datetime import datetime

def clean_date(date):
    """Return the part of ``date`` before the first ``"("``, whitespace-trimmed.

    Example: ``"June 19, 1996 ( Louisiana Superdome )"`` -> ``"June 19, 1996"``.
    """
    before_paren, _, _ = date.partition("(")
    return before_paren.strip()

def date_conversion(date):
    """Parse a scraped release date into a ``datetime``.

    Args:
        date: a string such as ``"June 21, 1996 (United States)"``, a list of
            such strings (only the first entry is parsed), or the sentinel
            ``"N/A"``.

    Returns:
        A ``datetime`` for text in ``"%B %d, %Y"`` or ``"%d %B %Y"`` form
        (after any parenthesised suffix is removed by ``clean_date``), or
        ``None`` when the value is ``"N/A"``, an empty list, not a string, or
        in neither format.
    """
    if isinstance(date, list):
        # Multi-valued infobox cells: only the first entry is parsed.
        date = date[0] if date else None

    if not isinstance(date, str) or date == "N/A":
        return None

    date_str = clean_date(date)

    frmts = ["%B %d, %Y", "%d %B %Y"]

    for frmt in frmts:
        try:
            return datetime.strptime(date_str, frmt)
        except ValueError:
            # strptime signals a format mismatch with ValueError; try the next
            # format. Anything else is a real error and should surface.
            continue
    return None

In [35]:
# The infobox row is labelled "Release date" on some pages and "Release dates"
# on others (both occur in the scraped data), so check both before falling
# back to the "N/A" sentinel that date_conversion maps to None.
for movie in movie_info_list:
    release_raw = movie.get("Release date", movie.get("Release dates", "N/A"))
    movie["Release_date (datetime)"] = date_conversion(release_raw)

In [37]:
movie_info_list[-30]

{'title': 'Tiger',
 'Directed by': ['Mark Linfield', 'Vanessa Berlowitz', 'Rob Sullivan'],
 'Produced by': ['Mark Linfield', 'Vanessa Berlowitz', 'Roy Conli'],
 'Narrated by': 'Priyanka Chopra',
 'Cinematography': ['Martyn Colbeck',
  'Mark MacEwen',
  'Simon Niblett',
  'Kalyan Varma',
  'Tom Walker'],
 'Music by': 'Nitin Sawhney',
 'Production company': 'Disneynature',
 'Distributed by': 'Disney+',
 'Release date': ['April 22, 2024'],
 'Running time': '90 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Running_time_int': 90,
 'Budget (float)': None,
 'Box_office (float)': None,
 'Release_date (datetime)': datetime.datetime(2024, 4, 22, 0, 0)}

In [3]:
import pickle

def save_data_pickle(name, data):
    """Pickle ``data`` into the binary file ``name``; returns ``None``."""
    with open(name, mode="wb") as sink:
        pickle.dump(data, sink)
    
def load_data_pickle(name):
    """Return the object stored in the pickle file ``name``.

    Unpickling can execute code embedded in the file, so only load pickles
    from a trusted source.
    """
    with open(name, mode="rb") as source:
        restored = pickle.load(source)
    return restored

In [41]:
# Checkpoint the list with pickle rather than JSON: the entries now hold
# datetime objects, which json.dump cannot serialise.
save_data_pickle("disney_data_cleaned_v2.pickle", movie_info_list)

### Attach IMDB/Rotten Tomatoes scores

In [8]:
# Reload the movie list from the "disney_data_cleaned_v2.pickle" checkpoint.
movie_info_list = load_data_pickle("disney_data_cleaned_v2.pickle")

In [14]:
import urllib.parse
import requests
import urllib
import os

def get_omdb_info(title):
    """Look up a movie by title on the OMDb API and return the decoded JSON.

    Args:
        title: movie title, sent as the ``t`` query parameter together with
            ``type=movie``.

    Returns:
        The response body parsed from JSON (a dict).

    Raises:
        KeyError: the ``OMDB_API_KEY`` environment variable is not set.
        requests.RequestException: the request failed or timed out.
    """
    # HTTPS keeps the API key out of clear text on the wire.
    base_url = "https://www.omdbapi.com/"
    parameters = {"apikey": os.environ["OMDB_API_KEY"], "t": title, "type": "movie"}
    # requests URL-encodes ``params`` itself; the timeout keeps one stalled
    # call from hanging the whole enrichment loop.
    return requests.get(base_url, params=parameters, timeout=30).json()

def get_rotten_tomatoes_scores(omdb_info):
    """Return the Rotten Tomatoes rating from an OMDb response, if present.

    Scans ``omdb_info["Ratings"]`` (treated as empty when the key is absent)
    and returns the ``"Value"`` of the first entry whose ``"Source"`` is
    ``"Rotten Tomatoes"``; returns ``None`` when there is no such entry.
    """
    matching_values = (
        entry["Value"]
        for entry in omdb_info.get("Ratings", [])
        if entry["Source"] == "Rotten Tomatoes"
    )
    return next(matching_values, None)


In [12]:
get_omdb_info("Tiger")

{'Title': 'Tiger',
 'Year': '2017',
 'Rated': 'N/A',
 'Released': '16 Jun 2017',
 'Runtime': '155 min',
 'Genre': 'Action, Drama, Thriller',
 'Director': 'Nanda Kishore',
 'Writer': 'Yoganandh Muddhan, Udhay Pottipadu, Tarun Sudhir',
 'Actors': 'Pradeep Bogadi, Nyra Banerjee, K. Shivaram',
 'Plot': 'Tiger is an action packed thriller film led by Pradeep who dreams to become a police officer which is against the wish of his father played by the well known actor and also a very famous Ex IAS officer K Shivram.',
 'Language': 'Kannada',
 'Country': 'India',
 'Awards': 'N/A',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BMmVhNjNlZjQtOWYyMC00MzJjLThlM2ItYzhjZTI0NzI0OGRmXkEyXkFqcGdeQXVyMzU0ODc1MTQ@._V1_SX300.jpg',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '6.7/10'}],
 'Metascore': 'N/A',
 'imdbRating': '6.7',
 'imdbVotes': '1,166',
 'imdbID': 'tt5995602',
 'Type': 'movie',
 'DVD': 'N/A',
 'BoxOffice': 'N/A',
 'Production': 'N/A',
 'Website': 'N/A',
 'Response': 'True'

In [None]:
# Attach OMDb ratings, genre and plot to every movie, printing every tenth
# index as a progress indicator.
# NOTE(review): the lookup is by title only, so a same-named film can be
# returned (the "Tiger" query in this notebook yields a 2017 film, while the
# scraped Disney entry is dated 2024) — consider also matching on year.
for index, movie in enumerate(movie_info_list):
    if index % 10 == 0:
        print(index)

    title = movie["title"]
    omdb_info = get_omdb_info(title)
    # Absent fields are stored as a real None (not the string "None") so they
    # match the missing-value convention of the other derived fields.
    movie["imdb"] = omdb_info.get("imdbRating")
    movie["Metascore"] = omdb_info.get("Metascore")
    movie["Genre"] = omdb_info.get("Genre")
    movie["Plot"] = omdb_info.get("Plot")
    movie["rotten_tomatoes"] = get_rotten_tomatoes_scores(omdb_info)

In [17]:
# Checkpoint the list, now including the OMDb fields, as a pickle.
save_data_pickle("disney_movie_data_final.pickle", movie_info_list)

### Save data as JSON/CSV

In [None]:
# Reload the movie list from the "disney_movie_data_final.pickle" checkpoint.
movie_info_list = load_data_pickle("disney_movie_data_final.pickle")

In [19]:
# Shallow-copy every movie dict so that replacing a field in the copy leaves
# the corresponding entry of movie_info_list unchanged.
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [20]:
# Replace each datetime with its "%B %d, %Y" text form so the copies can be
# written as JSON; entries without a date stay None.
for movie in movie_info_copy:
    current_date = movie["Release_date (datetime)"]
    movie["Release_date (datetime)"] = (
        current_date.strftime("%B %d, %Y") if current_date else None
    )

Save file as JSON

In [24]:
# Write the JSON export from movie_info_copy, whose release dates are strings.
save_data("disney_data_final.json", movie_info_copy)

Convert data to CSV

In [25]:
import pandas as pd

# Build the frame from movie_info_list (not movie_info_copy), so the release
# date column keeps its datetime objects; each distinct dict key becomes a column.
df = pd.DataFrame(movie_info_list)

In [26]:
df.head()

Unnamed: 0,title,Directed by,Story by,Based on,Produced by,Music by,Production company,Distributed by,Release dates,Running time,...,Traditional Chinese,Simplified Chinese,Original title,Layouts by,Music,Lyrics,Book,Basis,Productions,Awards
0,Snow White and the Seven Dwarfs,"[Perce Pearce, William Cottrell, Larry Morey, ...","[Ted Sears, Richard Creedon, Otto Englander, D...","["", Snow White, "", by the, Brothers Grimm]",Walt Disney,"[Frank Churchill, Leigh Harline, Paul Smith]",Walt Disney Productions,RKO Radio Pictures,"[December 21, 1937 ( Carthay Circle Theatre ),...",83 minutes,...,,,,,,,,,,
1,Pinocchio,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...","[Ted Sears, Otto Englander, Webb Smith, Willia...","[The Adventures of Pinocchio, by, Carlo Collodi]",Walt Disney,"[Leigh Harline, Paul J. Smith]",Walt Disney Productions,RKO Radio Pictures,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,...,,,,,,,,,,
2,Fantasia,"[Samuel Armstrong, James Algar, Bill Roberts, ...","[Joe Grant, Dick Huemer]",,"[Walt Disney, Ben Sharpsteen]",See plot,Walt Disney Productions,RKO Radio Pictures,,126 minutes,...,,,,,,,,,,
3,The Reluctant Dragon,"[Alfred Werker, (live action), Hamilton Luske,...",,,Walt Disney,"[Frank Churchill, Larry Morey]",Walt Disney Productions,RKO Radio Pictures,,74 minutes,...,,,,,,,,,,
4,Dumbo,"[Ben Sharpsteen, Norman Ferguson, Wilfred Jack...","[Joe Grant, Dick Huemer]","[Dumbo, the Flying Elephant, by, Helen Aberson...",Walt Disney,"[Frank Churchill, Oliver Wallace]",Walt Disney Productions,RKO Radio Pictures,"[October 23, 1941 (New York City), October 31,...",64 minutes,...,,,,,,,,,,


In [27]:
# to_csv is called with its defaults, so the DataFrame index is written as the
# first, unnamed column of the file.
df.to_csv("disney_data_final.csv")