In [1]:
import asyncio
import os
import pickle

In [2]:
path = "./data/list"

# Total number of records read across all list files, duplicates included.
uncleaned_count = 0

# Maps movie id -> movie record. Inserting by id collapses records that show
# up in more than one list file; the record read last wins.
movies_dict = {}

# os.listdir returns entries in arbitrary order, so sort them: otherwise which
# duplicate "wins" could differ between runs or machines.
for name in sorted(os.listdir(path)):
    file_path = os.path.join(path, name)
    if not os.path.isfile(file_path):
        # Skip sub-directories instead of crashing when trying to open them.
        continue

    with open(file_path, "rb") as f:
        # NOTE(review): pickle.load can execute arbitrary code embedded in the
        # file; only run this on list files from a trusted source.
        movies = pickle.load(f)

    uncleaned_count += len(movies)
    for m in movies:
        movies_dict[m["id"]] = m

In [3]:
# Report how many records were read in total and how many distinct ids remain.
unique_count = len(movies_dict)
print(f"Uncleaned count: {uncleaned_count}")
print(f"Duplicates removed count: {unique_count}")

Uncleaned count: 31467
Duplicates removed count: 17594


In [4]:
# Drop every movie that lacks a rating, a director or a cast member.

to_remove = []
for movie_id, movie in movies_dict.items():
    # A record is kept only when all three fields are populated.
    is_complete = (
        movie["rate"] != ""
        and len(movie["directors"]) != 0
        and len(movie["casts"]) != 0
    )
    if not is_complete:
        to_remove.append(movie_id)

# Deletion happens in a second pass because a dict must not change size
# while it is being iterated.
for movie_id in to_remove:
    movies_dict.pop(movie_id)

print(f"Cleaned count: {len(movies_dict)}")

Cleaned count: 4289


In [5]:
# Spot-check a single record, looked up by its id, to see which fields a
# movie entry carries (the scraper below relies on "url" and "title").
movies_dict["26925317"]

{'directors': ['韩延'],
 'rate': '7.2',
 'cover_x': 2999,
 'star': '35',
 'title': '动物世界',
 'url': 'https://movie.douban.com/subject/26925317/',
 'casts': ['李易峰', '迈克尔·道格拉斯', '周冬雨', '曹炳琨', '王戈'],
 'cover': 'https://img9.doubanio.com/view/photo/s_ratio_poster/public/p2525528688.jpg',
 'id': '26925317',
 'cover_y': 4181}

In [6]:
# 4000 movies to scrape let's go

import aiohttp
from common import headers, proxies
import random
import time

In [7]:
# https://stackoverflow.com/questions/279561/what-is-the-python-equivalent-of-static-variables-inside-a-function

def get_next_proxy():
    """Return the next proxy as an ``http://`` URL, cycling through ``proxies``.

    The iteration state is stored on the function itself as the attribute
    ``proxy_it``. When that iterator is exhausted it is rebuilt from
    ``proxies`` so the rotation starts over from the first entry.

    Returns:
        str: ``"http://"`` followed by the next entry of ``proxies``.

    Raises:
        RuntimeError: if ``proxies`` is empty. The previous implementation
            recursed without bound in that case and ended in RecursionError.
    """
    # Two attempts are enough: one on the current iterator and, if that one is
    # exhausted, one on a freshly restarted iterator.
    for _ in range(2):
        try:
            return f"http://{next(get_next_proxy.proxy_it)}"
        except StopIteration:
            get_next_proxy.proxy_it = iter(proxies)
    raise RuntimeError("proxies is empty: there is no proxy to rotate to")

# Initialise the rotation state before the first call.
get_next_proxy.proxy_it = iter(proxies)

In [8]:
# https://stackoverflow.com/questions/27092833/unicodeencodeerror-charmap-codec-cant-encode-characters

proxy = get_next_proxy()
start_time = time.time()
count = 0

async with aiohttp.ClientSession(headers=headers, timeout=aiohttp.ClientTimeout(total=60)) as session:
    for i, m in movies_dict.items():
        name = f"./data/detail/{i}.html"
        count += 1
        if os.path.isfile(name):
            continue

        # try until no exception or response is 200
        # yeah the nesting is pretty jank but whatever
        while True:
            try:
                async with session.get(m["url"], proxy=proxy) as resp:
                    if resp.status == 200:
                        result = await resp.text()
                        if not result.startswith("<script>"):
                            with open(name, 'w', encoding="utf-8") as f:
                                f.write(result)
                            break
                        print("Response starts with script tag")
                    else:
                        print(f"Wrong response code: {resp.status}")
            except Exception as e:
                print(f"Exception: {str(e)}")
            proxy = get_next_proxy()
            print(f"Something went wrong, switching proxy to {proxy}")

        print(f"Scraped {m['title']}, id {i}, time elapsed {time.time() - start_time}, {len(movies_dict) - count} to go")
        time.sleep(random.random())

Exception: 400, message='Bad Request', url=URL('http://113.194.88.13:9091')
Something went wrong, switching proxy to http://219.138.229.131:9091
Exception: 400, message='Bad Request', url=URL('http://219.138.229.131:9091')
Something went wrong, switching proxy to http://115.29.46.136:3128
Exception: Cannot connect to host 115.29.46.136:3128 ssl:default [Connect call failed ('115.29.46.136', 3128)]
Something went wrong, switching proxy to http://119.36.77.219:9091
Exception: 400, message='Bad Request', url=URL('http://119.36.77.219:9091')
Something went wrong, switching proxy to http://121.196.100.243:3128
Exception: Cannot connect to host 121.196.100.243:3128 ssl:default [Connect call failed ('121.196.100.243', 3128)]
Something went wrong, switching proxy to http://114.115.181.74:8080
Scraped 炮神归来, id 26705968, time elapsed 95.93215584754944, 299 to go
Exception: 
Something went wrong, switching proxy to http://221.215.252.88:9999
Scraped 欲望出租房, id 26760814, time elapsed 160.8546216487

In [9]:
# check number of scraped files is correct
# Count the files sitting directly inside the detail directory; the number
# should match the cleaned movie count printed earlier.
top_level = next(os.walk("./data/detail"))
_, _, files = top_level
len(files)

4289