## Task 1: Get Info Box, store it to python dictionary

###### Import Libraries

In [1]:
from bs4 import BeautifulSoup as bs
import requests

##### Load the webpage

In [2]:
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

# Convert to beautiful soup object
soup = bs(r.content)

# Print out HTML
contents = soup.prettify()
# print(contents)

In [3]:
info_box = soup.find(class_="infobox vevent")
# print(info_box.prettify())

In [4]:
info_rows = info_box.find_all("tr")

# for row in info_rows:
#     print(row.prettify())

In [5]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

movie_info = {}
for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title'] = row.find("th").get_text(" ", strip=True)
    elif index == 1:
        continue
    else:
        content_key = row.find("th").get_text(" ", strip=True)
        content_value = get_content_value(row.find("td"))
        movie_info[content_key] = content_value
        
# movie_info

## Task 2: Get info box for all movies

In [6]:
r = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')

# Convert to a beautiful soup object
soup = bs(r.content)

contents = soup.prettify()
# contents

In [7]:
movies = soup.select('.wikitable.sortable i') # i means only italized items
# movies

In [8]:
# print(movies[0])
# print("")
# print(movies[0].a['href']) 

In [9]:
# def get_content_value(row_data):
#     if row_data.find("li"):
#         return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
#     else:
#         return row_data.get_text(" ", strip=True).replace("\xa0", " ")

# def get_info_box(url):
#     r = requests.get(url)

#     soup = bs(r.content)
#     info_box = soup.find(class_="infobox vevent")
#     info_rows = info_box.find_all('tr')
    
#     movie_info = {}
#     for index, row in enumerate(info_rows):
#         if index == 0:
#             movie_info['title'] = row.find("th").get_text(" ", strip=True)
#         elif index == 1:
#             continue
#         else:
#             content_key = row.find("th").get_text(" ", strip=True)
#             content_value = get_content_value(row.find("td"))
#             movie_info[content_key] = content_value
#     return movie_info

#TASK 3 SOLUTION
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

def clean_tags(soup):
    for sup in soup.find_all(['sup','span']):
        sup.decompose()
        
def get_info_box(url):
    r = requests.get(url)

    soup = bs(r.content)
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all('tr')
    
    clean_tags(soup)
    
    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find("th").get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value
    return movie_info

In [10]:
r = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')

# Convert to a beautiful soup object
soup = bs(r.content)
movies = soup.select('.wikitable.sortable i a')
# movies

In [11]:
base_path = "https://en.wikipedia.org/"
movie_info_list = []

for index, movie in enumerate(movies):
    if index % 10 == 0:
        print(index)
    try:
        relative_path = movie['href']
        title = movie['title']
        full_path = base_path + relative_path
        
        movie_info_list.append(get_info_box(full_path))
#         print(relative_path)
#         print(title)
#         print()
    except Exception as e:
        print(movie.get_text())
        print(e)
# movie_info_list

0
10
20
30
40
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
True-Life Adventures
'NoneType' object has no attribute 'find_all'
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
Night at the Museum: Kahmunrah Rises Again
'NoneType' object has no attribute 'find'


In [20]:
movie_info_list[440]

{'title': 'Encanto',
 'Directed by': ['Byron Howard', 'Jared Bush'],
 'Produced by': ['Clark Spencer', 'Yvett Merino Flores'],
 'Written by': ['Jared Bush', 'Charise Castro Smith'],
 'Starring': 'Stephanie Beatriz',
 'Music by': 'Lin-Manuel Miranda',
 'Production companies': ['Walt Disney Pictures',
  'Walt Disney Animation Studios'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['November 24, 2021'],
 'Country': 'United States',
 'Language': 'English'}

In [13]:
len(movie_info_list)

446

#### Save/Reload Movie Data

In [14]:
import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [15]:
import json

def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

In [16]:
# save_data('disney_data.json', movie_info_list)
save_data('disney_data_cleaned.json', movie_info_list)

## Task 3: Clean our data

In [17]:
import pandas as pd
df = load_data('disney_data_cleaned.json')
# df

#### Subtasks
- ~~Clean up references [1], [2]~~
- Convert running time into integer
- Convert dates into datetime object
- ~~Split up long string~~
- Convert Budget & Box Office to numbers

In [18]:
## Clean up references [1], [2]
## Just copy this code and paste it in the original methods (above)
## Adding clean_tags methods

# def get_content_value(row_data):
#     if row_data.find("li"):
#         return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
#     else:
#         return row_data.get_text(" ", strip=True).replace("\xa0", " ")

# def clean_tags(soup):
#     for sup in soup.find_all(['sup','span']):
#         sup.decompose()
        
# def get_info_box(url):
#     r = requests.get(url)

#     soup = bs(r.content)
#     info_box = soup.find(class_="infobox vevent")
#     info_rows = info_box.find_all('tr')
    
#     clean_tags(soup)
    
#     movie_info = {}
#     for index, row in enumerate(info_rows):
#         if index == 0:
#             movie_info['title'] = row.find("th").get_text(" ", strip=True)
#         elif index == 1:
#             continue
#         else:
#             content_key = row.find("th").get_text(" ", strip=True)
#             content_value = get_content_value(row.find("td"))
#             movie_info[content_key] = content_value
#     return movie_info

In [21]:
# Split up the long strings
# add elif in get_content_value
# add else in get_info_box

# def get_content_value(row_data):
#     if row_data.find("li"):
#         return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
#     elif row_data.find("br"):
#         return [text for text in row_data.stripped_strings]
#     else:
#         return row_data.get_text(" ", strip=True).replace("\xa0", " ")

# def clean_tags(soup):
#     for sup in soup.find_all(['sup','span']):
#         sup.decompose()
        
# def get_info_box(url):
#     r = requests.get(url)

#     soup = bs(r.content)
#     info_box = soup.find(class_="infobox vevent")
#     info_rows = info_box.find_all('tr')
    
#     clean_tags(soup)
    
#     movie_info = {}
#     for index, row in enumerate(info_rows):
#         if index == 0:
#             movie_info['title'] = row.find("th").get_text(" ", strip=True)
#         else:
#             header = row.find('th')
#             if header:
#                 content_key = row.find("th").get_text(" ", strip=True)
#                 content_value = get_content_value(row.find("td"))
#                 movie_info[content_key] = content_value
#     return movie_info