In [None]:
import os
import httpx
import asyncio
import requests
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup as bs

In [None]:
# Root of the English Wikipedia site.
BASE_URL = 'https://en.wikipedia.org'
# Article that lists the Walt Disney Pictures films, built on top of BASE_URL.
DISNEY_URL = BASE_URL + '/wiki/List_of_Walt_Disney_Pictures_films'

In [None]:
# Fetch one film article to experiment with.
# Sample pages:
#   'https://en.wikipedia.org/wiki/The_Great_Locomotive_Chase'
#   'https://en.wikipedia.org/wiki/Toy_Story_3'
response = httpx.get('https://en.wikipedia.org/wiki/The_Great_Locomotive_Chase')
# Print the response object as a quick check that the request went through.
print(response)

In [None]:
# Body of the HTTP response fetched in the previous cell.
html = response.content
# Parse the HTML into a BeautifulSoup object.
# NOTE(review): no parser is named here, so the choice is left to
# BeautifulSoup — consider passing one explicitly (e.g. 'html.parser') so the
# result does not depend on which parsers are installed.
soup = bs(html)

In [None]:
# Locate the film infobox (the element with class "infobox vevent").
# NOTE(review): soup.find returns None when nothing matches, in which case
# the prettify() call below fails — confirm the page always has an infobox.
info_box = soup.find(class_="infobox vevent")
# Print the indented markup so the table structure can be inspected by eye.
print(info_box.prettify())

In [None]:
def replace_string(string):
    """Return ``string`` with every non-breaking space turned into a plain space."""
    nbsp_to_space = {ord('\xa0'): ' '}
    return string.translate(nbsp_to_space)


def find_html(row, tag='th'):
    """Return the stripped text of the first ``tag`` element inside ``row``.

    Text fragments are joined with a single space. ``tag`` defaults to ``'th'``.
    """
    cell = row.find(tag)
    return cell.get_text(' ', strip=True)


def get_field_data(row):
    """Extract the value stored in the ``td`` cell of an infobox row.

    Args:
        row: a table-row element exposing ``find`` (a BeautifulSoup tag).

    Returns:
        * ``None`` when the row has no ``td`` cell;
        * a list of strings when the cell contains ``li`` items (one string
          per item) or ``br`` line breaks (one string per text fragment);
        * a single string otherwise.

        Non-breaking spaces are replaced with plain spaces in every branch.
    """
    table_data = row.find('td')
    if table_data is None:
        # Rows without a value cell carry no data; return None rather than
        # failing with AttributeError on None.
        return None

    list_items = table_data.find_all('li')
    if list_items:
        return [replace_string(item.get_text(' ', strip=True)) for item in list_items]

    if table_data.find('br'):
        # Same \xa0 clean-up as the other branches, so values are consistent
        # regardless of how the cell was laid out.
        return [replace_string(text) for text in table_data.stripped_strings]

    return replace_string(table_data.get_text(' ', strip=True))

def remove_tags(soup):
    """Remove every ``sup`` and ``span`` element from ``soup`` in place."""
    for unwanted in soup.find_all(['sup', 'span']):
        unwanted.decompose()

# Build a {field name: value} dict from the rows of the sample infobox.
remove_tags(info_box)  # drop <sup>/<span> nodes before any text is read
info_table = info_box.find_all('tr')

movie_info = {}
for index, row in enumerate(info_table):
    if index == 0:
        # The first row's header cell is stored as the title.
        movie_info['Title'] = find_html(row)
    elif index == 1:
        # The second row is skipped; no field/value pair is read from it.
        continue
    elif row.find('th') is not None and row.find('td') is not None:
        # Only rows with both a header and a value cell describe a field.
        # Without this guard find_html / get_field_data fail on None.
        movie_info[find_html(row)] = get_field_data(row)

movie_info

In [None]:
def get_italics_tag(tags):
    """Collect the ``href`` value of each element of ``tags``, preserving order."""
    urls = []
    for anchor in tags:
        urls.append(anchor["href"])
    return urls


def get_disney_urls(url):
    """Download the page at ``url`` and return the link targets of its film entries.

    Anchors are picked with the CSS selector ``.wikitable.sortable i a`` and
    their ``href`` values are gathered by ``get_italics_tag``.
    """
    page = bs(httpx.get(url).content)
    film_anchors = page.select(".wikitable.sortable i a")
    return get_italics_tag(film_anchors)

url_list = get_disney_urls(DISNEY_URL)

In [None]:
print(len(url_list))

url_list

In [None]:
#%%writefile concurrent_helper2.py
import httpx
import asyncio


# Site root; get_movie_content prefixes each relative article path with it.
BASE_URL = 'https://en.wikipedia.org'
# Article that lists the Walt Disney Pictures films, built on top of BASE_URL.
DISNEY_URL = BASE_URL + '/wiki/List_of_Walt_Disney_Pictures_films'

def get_info_box(movies_bs):
    """Parse the infobox of one film page into a ``{field: value}`` dict.

    Args:
        movies_bs: parsed film page (an object exposing ``find``, i.e. a
            BeautifulSoup tree).

    Returns:
        dict with a ``'Title'`` entry read from the first table row, plus one
        entry for each later row that has both a ``th`` and a ``td`` cell.

    Raises:
        ValueError: if the page has no ``infobox vevent`` element.
    """
    # Use the parameter itself; the previous body read an undefined
    # ``movie_bs`` name and only worked if a global of that name leaked in.
    info_box = movies_bs.find(class_="infobox vevent")
    if info_box is None:
        raise ValueError('no "infobox vevent" element found on page')

    remove_tags(info_box)

    movie_info = {}
    for index, row in enumerate(info_box.find_all('tr')):
        if index == 0:
            movie_info['Title'] = find_html(row)
        elif row.find('th') is not None and row.find('td') is not None:
            # Rows missing either cell do not describe a field; skipping them
            # keeps one odd row from discarding the whole movie.
            movie_info[find_html(row)] = get_field_data(row)
    return movie_info

    
async def get_movie_content(url, client: httpx.AsyncClient, semaphore: asyncio.Semaphore):
    """Fetch one film page and return its parsed infobox, or ``None`` on failure.

    Args:
        url: site-relative article path; it is appended to ``BASE_URL``.
        client: shared async HTTP client used for the request.
        semaphore: held while the request is in flight to cap concurrency.

    Returns:
        The dict produced by ``get_info_box``, or ``None`` when either the
        request or the parsing fails (the error is printed).
    """
    url = f'{BASE_URL}{url}'
    try:
        async with semaphore:
            response = await client.get(url, timeout=6.1,
                                        follow_redirects=True)
    except httpx.HTTPError as e:
        # A timeout or connection error on one page must not propagate out of
        # asyncio.gather and discard every other result. repr() is used
        # because some transport errors have an empty message.
        print(f'Error fetching {url}: {e!r}')
        return None

    try:
        movie_bs = bs(response.content)
        return get_info_box(movie_bs)
    except Exception as e:
        print(f'Error: {e}')
        return None


async def get_many_movie_content(urls):
    """Fetch all ``urls`` concurrently and return what ``asyncio.gather`` collects.

    One ``httpx.AsyncClient`` is shared by every request, and a semaphore of
    size 16 is handed to each ``get_movie_content`` call.
    """
    limiter = asyncio.Semaphore(16)
    async with httpx.AsyncClient() as session:
        return await asyncio.gather(
            *(get_movie_content(link, session, limiter) for link in urls)
        )


def get_page_html(urls):
    """Synchronous entry point: run ``get_many_movie_content(urls)`` via ``asyncio.run``.

    NOTE(review): ``asyncio.run`` cannot be called while an event loop is
    already running, which is presumably the case inside a notebook kernel —
    confirm this is only used from a plain script.
    """
    batch = get_many_movie_content(urls)
    return asyncio.run(batch)
    

def main():
    """Collect the film links from ``DISNEY_URL``, scrape each page, and print the results."""
    print(get_page_html(get_disney_urls(DISNEY_URL)))

In [None]:
from utils.utils import clean_minutes, clean_date
from utils.conversion import money_conversion, parse_date_string
from utils.data_utils import load_json

# Read the movie records from the JSON file inside the dataset folder.
data_path = './movie_datasets/'
filename = 'disney_movies.json'
json_location = data_path + filename
movies = load_json(json_location)

In [None]:
print(len(movies))
movies

In [None]:
# Pass each movie's money and running-time fields through their helper,
# overwriting the field in place ('N/A' is used when the field is missing).
# NOTE(review): re-running this cell feeds the already-converted values back
# into the helpers — confirm they tolerate that.
field_cleaners = (
    ('Box office', money_conversion),
    ('Budget', money_conversion),
    ('Running time', clean_minutes),
)
for movie in movies:
    for field, cleaner in field_cleaners:
        movie[field] = cleaner(movie.get(field, 'N/A'))
    

In [None]:
movies[:3]

In [None]:
# Clean the release date so it can later be converted to a datetime object.
# Value shapes seen in the scraped data:
#   ['November 13, 1940']
#   'February 7, 1940 ( Center Theatre )'
#   "13 March 1952 (London)"
#   "June 27, 1941"
#   1948-1960
import re

# Matches "Month D, YYYY", then "D Month YYYY", then a bare four-digit year.
regex = r'\w+\s\d{1,2},\s\d{4}|\d{1,2}\s\w+\s\d{4}|\d{4}'


# NOTE(review): this definition shadows the clean_date imported from
# utils.utils earlier in the notebook.
def clean_date(value):
    """Reduce a scraped release-date value to a single date string.

    Args:
        value: a string, or a list of strings. For a list with more than one
            entry the second entry is used (treated as the US release date);
            for a single-entry list, that entry.

    Returns:
        The first substring matching ``regex``. When nothing matches (for
        example the ``'N/A'`` placeholder) the selected string is returned
        unchanged; an empty list yields ``'N/A'``.
    """
    if isinstance(value, list):
        if not value:
            # Nothing to pick from; use the notebook's missing-value marker.
            return 'N/A'
        # Get release date for US when several dates are listed.
        value = value[1] if len(value) > 1 else value[0]

    match = re.search(regex, value)
    # re.search returns None when there is no date; calling .group() on it
    # used to raise AttributeError for placeholders such as 'N/A'.
    return match.group() if match else value

In [None]:
re.search(regex,"13 March 1952").group()

In [None]:
string = clean_date([
            "June 27, 1941"
        ])
string

In [None]:
# for i, movie in enumerate(movies):
#     #print(i)
#     for key in movie.copy().keys():
#         if key == 'Release date' or key == 'Release dates':
#             movie['Release date'] = clean_date(movie[key])
#             continue
    

In [None]:
# Prefer the plural 'Release dates' field and fall back to 'Release date';
# the cleaned value is always stored under 'Release date'.
for movie in movies:
    raw_release = movie.get('Release dates', 'N/A')
    if raw_release == 'N/A':
        raw_release = movie.get('Release date', 'N/A')
    movie['Release date'] = clean_date(raw_release)

In [None]:
movies[:10]

In [None]:
v = parse_date_string('01:2024:13')   

In [None]:
# Replace each movie's 'Release date' with what parse_date_string returns for it.
for movie in movies:
    release_text = movie.get('Release date', 'N/A')
    movie['Release date'] = parse_date_string(release_text)

In [None]:
movies[:10]

In [None]:
save_pickle(full_path, movies)

In [None]:
# Save to pickle since datetime object is not serializable
from utils.data_utils import save_pickle, save_json, load_json, load_pickle
data_path = 'movie_datasets/'
filename = 'disney_movies.pickle'
full_path = f'{data_path}{filename}'


In [None]:
movies = load_pickle(full_path)

In [None]:
len(movies)

In [None]:
from utils.data_utils import get_omdb_info
get_omdb_info('The Lion King')

In [None]:
movies

In [None]:
def get_movie_titles(movie_data):
    """Return the title of every movie in ``movie_data``.

    Args:
        movie_data: iterable of dict-like movie records.

    Returns:
        list with each record's ``'Title'`` value, or ``'N/A'`` where the key
        is missing, in input order.
    """
    # Iterate the argument, not the notebook-level ``movies`` global.
    return [movie.get('Title', 'N/A') for movie in movie_data]

In [None]:
movie_titles = get_movie_titles(movies)
movie_titles

In [None]:
from scraper.omdb import omdb_main, omdb_file
import asyncio

await omdb_main(movies)

In [None]:
# Load the pickled OMDb records (presumably written by omdb_main — confirm).
# NOTE(review): this rebinds omdb_file (imported from scraper.omdb in the
# previous cell) and full_path (previously the disney_movies.pickle path);
# verify nothing later relies on the earlier values.
omdb_file = 'omdb_movies.pickle'
full_path = f'{data_path}{omdb_file}'
omdb_movie = load_pickle(full_path)

In [None]:
len(omdb_movie)

In [None]:
omdb_movie[:5]

In [None]:
movies[:5]

In [None]:
def get_rotten_tomoatoes_score(data, key='Ratings'):
    """Return the Rotten Tomatoes value from a record's ratings list.

    ``data[key]`` is expected to be an iterable of dicts with ``'Source'`` and
    ``'Value'`` entries. The ``'Value'`` of the first entry whose ``'Source'``
    is ``'Rotten Tomatoes'`` is returned; ``None`` when ``key`` is absent or
    no entry matches.
    """
    ratings = data.get(key)
    if ratings is None:
        return None
    matching_values = (
        entry.get('Value')
        for entry in ratings
        if entry.get('Source') == 'Rotten Tomatoes'
    )
    return next(matching_values, None)
          
value = get_rotten_tomoatoes_score(omdb_movie[1])
value

In [None]:

# Copy the rating fields of each OMDb record onto the movie at the same position.
# NOTE(review): zip pairs by position and stops at the shorter list — assumes
# movies and omdb_movie are aligned one-to-one; confirm.
for movie_record, omdb_record in zip(movies, omdb_movie):
    for target_key, omdb_key in (
        ('imdb', 'imdbRating'),
        ('imdbVotes', 'imdbVotes'),
        ('Metascore', 'Metascore'),
    ):
        movie_record[target_key] = omdb_record.get(omdb_key)
    movie_record['Rotten Tomatoes'] = get_rotten_tomoatoes_score(omdb_record)

In [None]:
movies[:5]

In [None]:
pickle_filename = 'disney_mine_omdb.pickle'
pickle_path = f'{data_path}{pickle_filename}'

In [None]:
# Not serializable
# json_filename = 'disney_mine_omdb.json'
# json_path = f'{data_path}{json_filename}'
# save_json(json_path, movies)
import pandas as pd

In [None]:
df = pd.DataFrame(movies)
df.head(5).columns

In [None]:
df.to_pickle(pickle_path)

In [None]:
import pandas as pd
df = pd.read_pickle(pickle_path)
df.head().columns