In [1]:
from bs4 import BeautifulSoup
import json
import os
import pickle

In [2]:
# extract the json from inside the HTML files
def extract_json():
    path = "./data/detail"
    count = 0
    details_dict = {}

    # https://stackoverflow.com/questions/22394235/invalid-control-character-with-python-json-loads

    for name in os.listdir(path):
        count += 1
        if count % 100 == 0:
            print(f"Progress: {count}")
        with open(os.path.join(path, name), 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f)
            data = json.loads(soup.find("script", type="application/ld+json").contents[0], strict=False)
            details_dict[name.split(".")[0]] = data

    return details_dict

In [3]:
details_dict_path = "./data/details_dict.pkl"

if os.path.isfile(details_dict_path):
    with open(details_dict_path, "rb") as f:
        details_dict = pickle.load(f)
else:
    details_dict = extract_json()
    with open("./data/details_dict.pkl", "wb") as f:
        pickle.dump(details_dict, f)

len(details_dict)

3184

In [4]:
# remove where no director, author or actor, or no date

to_remove = []

for k, v in details_dict.items():
    if len(v["director"]) == 0 or len(v["author"]) == 0 or len(v["actor"]) == 0:
        to_remove.append(k)

for k in to_remove:
    del details_dict[k]

len(details_dict)

3184

In [5]:
# save if needed
if len(to_remove) != 0:
    print("Saving")
    with open("./data/details_dict.pkl", "wb") as f:
        pickle.dump(details_dict, f)

In [6]:
celebrities = set()

for data in details_dict.values():
    for d in data["director"][0:2]:
        celebrities.add(d["url"])
    for a in data["author"][0:3]:
        celebrities.add(a["url"])
    # only pick the lead actors otherwise too many to scrape
    for a in data["actor"][0:5]:
        celebrities.add(a["url"])

len(celebrities)

10946

In [7]:
redundant = []

for name in os.listdir("./data/celeb"):
    if f"/celebrity/{name.split('.')[0]}/" not in celebrities:
        redundant.append(name)

len(redundant)

1337

In [8]:
# 张艺谋
"/celebrity/1054398/" in celebrities

True

In [9]:
# Jay Chou
"/celebrity/1048000/" in celebrities

True

In [10]:
# 章子怡
"/celebrity/1041014/" in celebrities

True

In [11]:
# 10 thousand... this will take some time

import asyncio
import aiohttp
from common import get_headers, headers, proxies
import random
import time

In [12]:
# cf. 00c

def get_next_proxy():
    try:
        return f"http://{next(get_next_proxy.proxy_it)}"
    except StopIteration:
        get_next_proxy.proxy_it = iter(proxies)
        print("Proxies list exhausted, reverting to None")
        return None

get_next_proxy.proxy_it = iter(proxies)

In [None]:
proxy = None
start_time = time.time()
count = 0

async with aiohttp.ClientSession(headers=headers, timeout=aiohttp.ClientTimeout(total=60)) as session:
    for c in celebrities:
        i = c.split('/')[2]
        name = f"./data/celeb/{i}.html"
        count += 1
        if os.path.isfile(name):
            continue

        while True:
            try:
                async with session.get("https://movie.douban.com" + c, proxy=proxy) as resp:
                    if resp.status == 200:
                        result = await resp.text()
                        if not result.startswith("<script>"):
                            with open(name, 'w', encoding="utf-8") as f:
                                f.write(result)
                            break
                        print("Response starts with script tag")
                    else:
                        print(f"Wrong response code: {resp.status}")
            except Exception as e:
                print(f"Exception: {e}")
            # proxy = get_next_proxy()
            # print(f"Something went wrong, switching proxy to {proxy}")
            print("Something went wrong, pausing scrape for a minute")
            time.sleep(60)

        print(f"Scraped {i}, speed {time.time() - start_time}, {len(celebrities) - count} to go")
        start_time = time.time()
        time.sleep(40 * random.random() + 10)

Scraped 1355790, speed 2.85418701171875, 10936 to go
Scraped 1428964, speed 10.680102348327637, 10935 to go
Scraped 1370257, speed 22.542234897613525, 10931 to go
Scraped 1314222, speed 16.908706665039062, 10927 to go
Exception: 
Something went wrong, pausing scrape for a minute
Scraped 1335319, speed 144.7925786972046, 10917 to go
Scraped 1386749, speed 41.69868731498718, 10912 to go
Scraped 1352562, speed 40.28582191467285, 10903 to go
Scraped 1339123, speed 41.19422650337219, 10899 to go
Scraped 1351796, speed 36.32282114028931, 10871 to go
Scraped 1374637, speed 47.8253448009491, 10865 to go
Scraped 1327348, speed 37.04560303688049, 10858 to go
Scraped 1339373, speed 25.579667568206787, 10853 to go
Scraped 1275050, speed 51.898558616638184, 10852 to go
Scraped 1343340, speed 18.84638214111328, 10847 to go
Scraped 1315998, speed 22.976101398468018, 10842 to go
Scraped 1353189, speed 44.01304793357849, 10841 to go
Scraped 1371540, speed 29.286895513534546, 10823 to go
Scraped 1348755

In [18]:
# check number of scraped files is correct
_, _, files = next(os.walk("./data/celeb"))
len(files) - len(redundant)

10944

# Parallel Approach
Below I tried using a parallel approach that, instead of rotating proxies, runs an array of concurrent jobs that spams all the proxies at once, removing jobs from a "queue" (in fact a set) as necessary. Unfortunately many proxies obtained from the internet were non-functional, otherwise this approach would be significantly faster.

In [13]:
# remove from set those already scraped
already_scraped = []

for c in celebrities:
    i = c.split('/')[2]
    name = f"./data/celeb/{i}.html"
    if os.path.isfile(name):
        already_scraped.append(c)

for c in already_scraped:
    celebrities.remove(c)

len(celebrities)

870

In [14]:
# these two lead to http 500 errors
celebrities.remove("/celebrity/1314395/")
celebrities.remove("/celebrity/1316349/")
# for these two we can use the mobile website version i.e. https://m.douban.com/movie/celebrity/1316349/

In [15]:
def get_next_celeb():
    celeb_it = iter(celebrities)
    return next(celeb_it)

In [16]:
async def proxy_get(proxy, session):
    h = get_headers()
    while len(celebrities) > 0:
        celeb = get_next_celeb()
        i = celeb.split('/')[2]
        name = f"./data/celeb/{i}.html"
        try:
            async with session.get("https://movie.douban.com" + celeb, headers=h, proxy=f"http://{proxy}") as resp:
                if resp.status == 200:
                    result = await resp.text()
                    if not result.startswith("<script>"):
                        with open(name, 'w', encoding="utf-8") as f:
                            f.write(result)
                        celebrities.remove(celeb)
                        print(f"Scraped {i}, proxy {proxy}, {len(celebrities)} left")
                        continue
                print(f"Error: {resp.status}, proxy: {proxy}")
        except:
            pass
        print(f"Something went wrong, pausing scrape for {proxy}")
        # cannot be time.sleep here because that's blocking
        await asyncio.sleep(300)

In [17]:
# using a parallel instead of sequential approach in 00c
async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60)) as session:
    futures = []
    for p in proxies:
        futures.append(proxy_get(p, session))
    results = await asyncio.gather(*futures)
    # print(f"{sum(results)}/{len(results)} successes in pass, time taken {time.time() - start_time}, {len(celebrities)} celebrities to go")

Something went wrong, pausing scrape for 05.189.229.42:1081
Something went wrong, pausing scrape for 80.48.119.28:8080
Something went wrong, pausing scrape for 151.106.17.122:1080
Something went wrong, pausing scrape for 80.179.140.189:80
Something went wrong, pausing scrape for 74.208.205.5:80
Something went wrong, pausing scrape for 169.57.1.85:8123
Something went wrong, pausing scrape for 146.59.83.187:80
Something went wrong, pausing scrape for 50.87.181.51:80
Something went wrong, pausing scrape for 157.100.26.69:80
Something went wrong, pausing scrape for 67.212.186.100:80
Something went wrong, pausing scrape for 185.51.10.19:80
Something went wrong, pausing scrape for 151.106.17.123:1080
Something went wrong, pausing scrape for 67.212.186.102:80
Something went wrong, pausing scrape for 187.217.54.84:80
Something went wrong, pausing scrape for 216.137.184.253:80
Something went wrong, pausing scrape for 201.217.49.2:80
Something went wrong, pausing scrape for 158.69.71.245:9300
Sc