In [1]:
from bs4 import BeautifulSoup
import json
import os
import pickle

In [2]:
# extract the json from inside the HTML files
def extract_json():
    path = "./data/detail"
    count = 0
    details_dict = {}

    # https://stackoverflow.com/questions/22394235/invalid-control-character-with-python-json-loads

    for name in os.listdir(path):
        count += 1
        if count % 100 == 0:
            print(f"Progress: {count}")
        with open(os.path.join(path, name), 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f)
            data = json.loads(soup.find("script", type="application/ld+json").contents[0], strict=False)
            details_dict[name.split(".")[0]] = data

    return details_dict

In [10]:
details_dict_path = "./data/details_dict.pkl"

if os.path.isfile(details_dict_path):
    with open(details_dict_path, "rb") as f:
        details_dict = pickle.load(f)
else:
    details_dict = extract_json()
    with open("./data/details_dict.pkl", "wb") as f:
        pickle.dump(details_dict, f)

len(details_dict)

4289

In [4]:
celebrities = set()

for data in details_dict.values():
    for d in data["director"][0:1]:
        celebrities.add(d["url"])
    for a in data["author"][0:2]:
        celebrities.add(a["url"])
    # only pick the lead actors otherwise too many to scrape
    for a in data["actor"][0:4]:
        celebrities.add(a["url"])

len(celebrities)

10833

In [5]:
# 张艺谋
"/celebrity/1054398/" in celebrities

True

In [6]:
# Jay Chou
"/celebrity/1048000/" in celebrities

True

In [7]:
# 章子怡
"/celebrity/1041014/" in celebrities

True

In [8]:
# 10 thousand... this will take some time

import asyncio
import aiohttp
from common import headers, proxies
import random
import time

In [9]:
# remove from set those already scraped
already_scraped = []

for c in celebrities:
    i = c.split('/')[2]
    name = f"./data/celeb/{i}.html"
    if os.path.isfile(name):
        already_scraped.append(c)

for c in already_scraped:
    celebrities.remove(c)

len(celebrities)

8896

In [11]:
async def get_celeb(proxy, celeb, session):
    i = celeb.split('/')[2]
    name = f"./data/celeb/{i}.html"
    try:
        async with session.get("https://movie.douban.com" + celeb, proxy=f"http://{proxy}") as resp:
            if resp.status == 200:
                result = await resp.text()
                if not result.startswith("<script>"):
                    with open(name, 'w', encoding="utf-8") as f:
                        f.write(result)
                    celebrities.remove(celeb)
                    print(f"Scraped {i}, proxy {proxy}")
                    return True
            print(f"Error: {resp.status}, proxy: {proxy}")
    except:
        pass
    return False

In [None]:
# using a parallel instead of sequential approach in 00c
async with aiohttp.ClientSession(headers=headers, timeout=aiohttp.ClientTimeout(total=60)) as session:
    while len(celebrities) > 0:
        start_time = time.time()
        celeb_it = iter(celebrities)
        futures = []
        for p in proxies:
            futures.append(get_celeb(p, next(celeb_it), session))
        results = await asyncio.gather(*futures)
        print(f"{sum(results)} successes in pass, time taken {time.time() - start_time}, {len(celebrities)} movies to go")
        time.sleep(60)

Error: 200, proxy: 175.44.46.3:8118
Error: 200, proxy: 139.9.64.238:443
Scraped 1023418, proxy 47.99.133.26:3128
Error: 200, proxy: 119.122.212.20:9000
Scraped 1340456, proxy 47.100.45.114:80
Error: 403, proxy: 123.56.216.85:22
Error: 403, proxy: 39.96.9.1:8080
Scraped 1333535, proxy 113.87.81.187:8118
Scraped 1315463, proxy 39.107.91.243:22
Error: 403, proxy: 122.70.157.11:808
Scraped 1318284, proxy 58.212.197.222:81
Error: 200, proxy: 114.115.181.74:8080
Error: 403, proxy: 221.215.252.88:9999
Error: 403, proxy: 39.106.69.122:3128
Scraped 1324524, proxy 58.23.212.10:3129
6 successes in pass, time taken 60.474003076553345, 8890 movies to go
Error: 200, proxy: 175.44.46.3:8118
Error: 200, proxy: 139.9.64.238:443
Scraped 1407741, proxy 116.22.31.201:8118
Scraped 1318295, proxy 58.212.197.222:81
Scraped 1332055, proxy 120.79.136.134:8080
Scraped 1369033, proxy 112.232.109.1:8118
Error: 403, proxy: 221.215.252.88:9999
Error: 403, proxy: 39.96.9.1:8080
Error: 403, proxy: 122.70.157.11:808
E

In [19]:
async def test_proxy(session, proxy):
    try:
        async with session.get("https://movie.douban.com", proxy=f"http://{proxy}") as resp:
            if resp.status == 200:
                result = await resp.text()
                if not result.startswith("<script>"):
                    print(f"Found working proxy {proxy}")
                    return proxy
    except:
        pass
    return None

async def get_working_proxies():
    futures = []
    session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=60))

    # issue requests to all proxies in parallel
    for p in proxies:
        futures.append(test_proxy(session, p))

    is_working = await asyncio.gather(*futures)
    working = [i for i in is_working if i is not None]

    await session.close()
    print(f"Finished checking working proxies, list size {len(working)}")
    return working

In [None]:
# cf. 00c

def get_next_proxy():
    try:
        return f"http://{next(get_next_proxy.proxy_it)}"
    except StopIteration:
        get_next_proxy.proxy_it = iter(proxies)
        return get_next_proxy()

get_next_proxy.proxy_it = iter(proxies)

In [None]:
async def get_celeb(celeb, session):
    while True:
        proxy = get_next_proxy()
        try:
            async with session.get("https://movie.douban.com" + celeb, proxy=f"http://{proxy}") as resp:
                if resp.status == 200:
                    result = await resp.text()
                    if not result.startswith("<script>"):
                        with open(name, 'w', encoding="utf-8") as f:
                            f.write(result)
                        break
        except:
            pass

    print(f"Scraped {celeb}, proxy {proxy}")

In [20]:
working_proxies = await get_working_proxies()
refresh_time = time.time()
count = 0

async with aiohttp.ClientSession(headers=headers, timeout=aiohttp.ClientTimeout(total=60)) as session:
    for c in celebrities:
        i = c.split('/')[2]
        name = f"./data/celeb/{i}.html"
        count += 1
        if os.path.isfile(name):
            continue

        while True:
            # select random from working proxies
            proxy = working_proxies[random.randrange(0, len(working_proxies))]
            try:
                async with session.get("https://movie.douban.com" + c, proxy=f"http://{proxy}") as resp:
                    if resp.status == 200:
                        result = await resp.text()
                        if not result.startswith("<script>"):
                            with open(name, 'w', encoding="utf-8") as f:
                                f.write(result)
                            break
                        print("Response starts with script tag")
                    else:
                        print(f"Wrong response code: {resp.status}")
            except Exception as e:
                print(f"Exception: {e}")
            print(f"Something went wrong, refreshing list of proxies")
            working_proxies = await get_working_proxies()
            refresh_time = time.time()

        print(f"Scraped {i}, proxy {proxy}, {len(celebrities) - count} to go")

        # refresh list of proxies every half an hour
        if time.time() - refresh_time > 1800:
            print("Refreshing list of proxies")
            working_proxies = await get_working_proxies()
            refresh_time = time.time()

Found working proxy 47.99.133.26:3128
Found working proxy 117.10.124.222:8118
Found working proxy 222.64.109.23:9000
Found working proxy 119.184.185.80:8118
Found working proxy 112.232.109.1:8118
Found working proxy 113.87.81.187:8118
Found working proxy 221.219.103.88:9000
Found working proxy 221.215.252.88:9999
Found working proxy 39.107.91.243:22
Found working proxy 119.8.183.114:3128
Found working proxy 116.20.228.108:3128
Found working proxy 117.147.111.210:3128
Finished checking working proxies, list size 12
Exception: Server disconnected
Something went wrong, refreshing list of proxies
Found working proxy 139.227.201.203:8118
Found working proxy 221.215.252.88:9999
Found working proxy 47.99.133.26:3128
Found working proxy 121.229.132.241:9999
Found working proxy 113.87.81.187:8118
Found working proxy 222.64.109.23:9000
Found working proxy 117.10.124.222:8118
Found working proxy 119.8.183.114:3128
Found working proxy 116.20.228.108:3128
Finished checking working proxies, list siz

CancelledError: 

In [22]:
# check number of scraped files is correct
_, _, files = next(os.walk("./data/celeb"))
len(files)

1904