In [88]:
from scrapy.http import HtmlResponse
from scrapy import Selector

import requests

from tqdm.auto import tqdm
import json

In [6]:
# Entry point of the crawl: IMDb name-search results page
# (the query string filters on gender=male,female; %2C is an encoded comma).
START_URL = 'https://www.imdb.com/search/name/?gender=male%2Cfemale&ref_=nv_cel_m'

# Site root; used as the default prefix that the scraping functions prepend
# to the relative hrefs they read from the pages.
BASE_URL = "https://www.imdb.com"

In [14]:
def get_info_top_50(start_url, base_url=BASE_URL, timeout=30):
    """Collect absolute profile URLs for the people listed on a name-search page.

    Args:
        start_url: URL of the search-results page to download.
        base_url: Prefix prepended to every relative profile link.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of strings of the form ``base_url + href + '/'``, one per
        listing row that carries a profile link, in page order.

    Raises:
        requests.RequestException: if the download fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    # Without a timeout requests may block forever on a stalled connection.
    req = requests.get(start_url, timeout=timeout)
    # Fail loudly on an error page instead of silently parsing it into [].
    req.raise_for_status()
    response = HtmlResponse(url=start_url, body=req.content)
    rows = response.xpath('//*[@id="main"]/div/div[@class="lister-list"]/div[@class="lister-item mode-detail"]')
    links = []

    for row in rows:
        # default='' keeps .strip() safe when a row has no <h3><a href=...>.
        rel_url = row.xpath(
            "./div[@class='lister-item-content']/h3/a/@href"
        ).extract_first(default='').strip()
        if not rel_url:
            continue  # nothing to follow for this row
        links.append(base_url + rel_url + '/')
    return links

In [16]:
def get_info_about_one_actor(actor_url, base_url=BASE_URL, top_k_films=15, timeout=30):
    """Scrape one person's profile page into a plain dict.

    Args:
        actor_url: Absolute URL of the profile page.
        base_url: Prefix prepended to relative film links.
        top_k_films: Maximum number of rows taken from the first
            filmography section.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``name`` (str or None), ``bio`` (str, possibly
        empty), ``born`` (value of the ``datetime`` attribute, or None),
        ``url`` (URL of the response), ``movies`` (list of titles) and
        ``movie_links`` (list of absolute URLs, parallel to ``movies``).

    Raises:
        requests.RequestException: if the download fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    # Without a timeout requests may block forever on a stalled connection.
    req = requests.get(actor_url, timeout=timeout)
    req.raise_for_status()
    response = HtmlResponse(url=actor_url, body=req.content)
    name_widget = response.xpath('//*[@id="name-overview-widget-layout"]')

    item = dict()

    # A missing node yields None instead of an AttributeError on .strip().
    name = name_widget.xpath('./tbody/tr[1]/td/h1/span/text()').extract_first()
    item['name'] = name.strip() if name is not None else None

    # These XPaths start with '//' and therefore search the whole document,
    # so they are issued on the response rather than on the widget selector.
    item['bio'] = ''.join(
        response.xpath('//*[@id="name-bio-text"]/div/div//text()').extract()
    ).replace('See full bio »', ' ').strip()

    born = response.xpath('//*[@id="name-born-info"]/time/@datetime').extract_first()
    item['born'] = born.strip() if born is not None else None

    item['url'] = response.url

    films = response.xpath(
        '//*[@id="filmography"]/div[@class="filmo-category-section"][1]/div[contains(@class, "filmo-row")]'
    )
    item['movies'] = []
    item['movie_links'] = []
    for movie in films[:top_k_films]:
        rel_url = movie.xpath("./b[1]/a/@href").extract_first(default='').strip()
        ttl = movie.xpath("./b[1]/a/text()").extract_first(default='').strip()
        if not rel_url or not ttl:
            # Skip incomplete rows so the two lists stay aligned.
            continue
        item['movies'].append(ttl)
        item['movie_links'].append(base_url + rel_url)

    return item

In [81]:
def get_info_about_one_movie(movie_url, timeout=30):
    """Scrape one title page into a plain dict.

    Args:
        movie_url: Absolute URL of the title page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``url`` (the URL passed in), ``title`` (str, or
        None when the heading is not found) and ``cast`` (list of the
        non-blank ``title`` attribute values found inside the cast table rows).

    Raises:
        requests.RequestException: if the download fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    # Without a timeout requests may block forever on a stalled connection.
    req = requests.get(movie_url, timeout=timeout)
    req.raise_for_status()
    response = HtmlResponse(url=movie_url, body=req.content)

    # A missing heading yields None instead of an AttributeError on .strip().
    movie_name = response.xpath('//*[@class="title_wrapper"]/h1/text()').extract_first()

    item = dict()
    item['url'] = movie_url
    item['title'] = movie_name.strip() if movie_name is not None else None

    raw_cast = response.xpath('//*[@class="cast_list"]//tr//@title').extract()
    # Strip first, then filter, so whitespace-only attributes are dropped too.
    item['cast'] = [name for name in (raw.strip() for raw in raw_cast) if name]
    return item

In [17]:
# Download the search-results page once and keep the profile URLs it lists.
actor_links = get_info_top_50(START_URL)

In [21]:
# One HTTP request per profile URL; tqdm draws a progress bar while the
# resulting list of per-person dicts is built.
actor_data = list(map(get_info_about_one_actor, tqdm(actor_links)))

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [72]:
def flatten(t):
    """Concatenate the items of every sub-iterable of ``t`` into one flat list.

    Only one level of nesting is removed; the order of the items is kept.
    """
    return [item for sublist in t for item in sublist]
# Chain every person's film links into one flat list, in scrape order.
# NOTE(review): a film credited to several people is listed once per person
# here - confirm whether repeated downloads / records are intended.
all_movies_links = flatten(person['movie_links'] for person in actor_data)

In [84]:
# One HTTP request per film link; tqdm draws a progress bar while the
# resulting list of per-film dicts is built.
movie_data = list(map(get_info_about_one_movie, tqdm(all_movies_links)))

HBox(children=(IntProgress(value=0, max=680), HTML(value='')))




In [94]:
def dump_as_jl(inp_list_of_dicts, out_path):
    """Write records to ``out_path`` in JSON Lines format.

    Each element of ``inp_list_of_dicts`` is serialized with ``json.dumps``
    (non-ASCII characters kept as-is) and written on its own line. The file
    is opened in write mode with UTF-8 encoding, replacing existing content.
    """
    with open(out_path, 'w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n'
            for record in inp_list_of_dicts
        )

In [95]:
# Persist the per-person records, one JSON object per line.
dump_as_jl(actor_data, 'actors.jl')

In [96]:
# Persist the per-film records, one JSON object per line.
dump_as_jl(movie_data, 'movies.jl')