In [2]:
import os
from urllib.request import urlopen as uReq

import bs4
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
def get_page_soup(id):
    """Fetch a movie's page from themoviedb.org and parse it.

    Args:
        id: Movie identifier; interpolated verbatim into the page URL.

    Returns:
        BeautifulSoup: The page HTML parsed with the built-in ``html.parser``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails (``HTTPError`` for error status codes).
    """
    page_url = f"https://www.themoviedb.org/movie/{id}"
    # The context manager closes the connection even when read() raises,
    # which the previous explicit close() call did not guarantee.
    with uReq(page_url) as page:
        page_html = page.read()
    return BeautifulSoup(page_html, "html.parser")

In [4]:
def get_page_data(id, title):
    """Scrape overview, genres, poster and trailer id from a movie's web page.

    Args:
        id: Movie identifier passed to ``get_page_soup``.
        title: Movie title; forwarded to ``download_img`` to build the poster
            file name.

    Returns:
        dict | None: ``{'overview', 'categories', 'poster', 'trailer'}`` where
        'poster' is the file name returned by ``download_img`` and 'trailer'
        is the ``data-id`` attribute of the trailer link. ``None`` when any
        expected element is missing from the page or the poster download
        fails.

    Raises:
        Whatever ``get_page_soup`` raises: the page fetch happens outside the
        ``try`` block, so fetch errors are not converted to ``None``.
    """
    page_soup = get_page_soup(id)

    try:
        poster_link = page_soup.find_all('img', {"class": "poster"})[0].get('data-src')

        overview = page_soup.find('div', {'class': 'overview'}).find('p').text

        category_span = page_soup.find('span', {'class': 'genres'})
        categories = [a.text for a in category_span.find_all('a')]

        trailer_url = page_soup.find('a', {'class': 'no_click play_trailer'}).get("data-id")

        # Download last: if any field above is missing we return None without
        # leaving an orphaned poster file on disk.
        img_name = download_img(poster_link, title)
    except Exception:
        # Best-effort scrape: any parsing or download failure means "skip this
        # movie". `Exception` (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit through so a long run can be aborted.
        return None

    return {'overview': overview, 'categories': categories, 'poster': img_name, 'trailer': trailer_url}

In [5]:
def download_img(img_url, title):
    """Download a w342 poster from image.tmdb.org into the ``posters/`` folder.

    Args:
        img_url: Poster path relative to the image host's ``w342`` size
            folder; a leading '/' is tolerated.
        title: Movie title; appended to the file name with spaces (and path
            separators) replaced by underscores.

    Returns:
        str: The file name (without the ``posters/`` prefix), built as
        ``<poster stem>_<title>.jpg``.

    Raises:
        requests.HTTPError: If the image host answers with an error status;
            nothing is written to disk in that case.
        requests.RequestException: For connection-level failures.
        OSError: If the file cannot be written.
    """
    # Strip a leading slash so the URL does not end up with 'w342//<file>'.
    poster_url = f"https://image.tmdb.org/t/p/w342/{img_url.lstrip('/')}"

    stem = img_url.split('/')[-1].split('.')[0]
    # A path separator inside the title would point the write at a
    # non-existent sub-directory, so neutralise it.
    safe_title = title.replace(' ', '_').replace('/', '_').replace('\\', '_')
    file_name = stem + "_" + safe_title + '.jpg'

    os.makedirs('posters', exist_ok=True)
    # Using the response as a context manager releases the streamed
    # connection back to the pool even if writing fails part-way.
    with requests.get(poster_url, stream=True) as response:
        # Without this check an error page would be saved as a .jpg.
        response.raise_for_status()
        with open(os.path.join('posters', file_name), "wb") as f:
            for chunk in response.iter_content(1024):
                f.write(chunk)
    return file_name


In [6]:
def get_data(id):
    """Collect title, overview, poster and trailer key for one movie via the API.

    Args:
        id: Movie identifier. Values coming from a pandas column may arrive as
            floats (e.g. ``862.0``) or as NaN; integral floats are converted
            to ``int`` for the URL and missing values are skipped.

    Returns:
        pd.Series | None: Series with 'id' (as passed in), 'title',
        'overview', 'poster_path' (file name returned by ``download_img``) and
        'video_key'. ``None`` when the id is missing, either API call fails,
        the movie has no title or poster, or no video of type 'Trailer' is
        listed.

    Raises:
        Whatever ``requests.get`` or ``download_img`` raise on connection or
        download errors.
    """
    if pd.isna(id):
        return None
    # A float id would otherwise be rendered as '862.0' in the URL.
    movie_id = int(id) if isinstance(id, float) and id.is_integer() else id
    # API_URL may already end with '/'; avoid building '.../movie//<id>'.
    base_url = API_URL.rstrip('/')

    movie_http = requests.get(f'{base_url}/{movie_id}{API_KEY}')
    if not movie_http.ok:
        # Error payloads carry no 'title' key; treat as "skip this movie".
        return None
    movie_response = movie_http.json()
    title = movie_response.get('title')
    overview = movie_response.get('overview')
    poster_path = movie_response.get('poster_path')
    if not title or not poster_path:
        return None

    video_http = requests.get(f'{base_url}/{movie_id}/videos{API_KEY}')
    if not video_http.ok:
        return None
    video_key = None
    # No break on purpose: as before, the last listed trailer wins.
    for vid in video_http.json().get('results', []):
        if vid.get('type') == 'Trailer':
            video_key = vid.get('key')

    if video_key is None:
        return None

    # Download only once the movie is known to be kept, so discarded movies
    # do not leave poster files behind.
    poster_path = download_img(poster_path, title)

    return pd.Series({'id': id, 'title': title, 'overview': overview, 'poster_path': poster_path, 'video_key': video_key})

In [7]:
# Load the links table; its 'tmdbId' column is what later gets fed to get_data.
links_df = pd.read_csv(filepath_or_buffer='ml-latest-small/links.csv')

In [8]:
# API settings read by get_data(). API_KEY is a ready-made query-string suffix.
# NOTE(review): the literal below is a credential committed to the notebook. It
# is kept only as a fallback so existing runs keep working. Set TMDB_API_KEY in
# the environment, then rotate this key and delete the fallback.
API_KEY = '?api_key=' + (os.environ.get('TMDB_API_KEY') or '9fe15a0c0fc071b3fab1b7cb6cf61dee')
# Base endpoint for movie resources; note that the value ends with '/'.
API_URL = 'https://api.themoviedb.org/3/movie/'

In [9]:
# .loc slices by label and includes the end label, so with the default integer
# index this covers rows 0 through 10 (11 ids), one get_data call per id.
res = links_df['tmdbId'].loc[:10].apply(get_data)