In [1]:
!pip install fake_useragent



In [2]:
!pip install peewee



In [3]:
import requests
import re
import pandas as pd
import time
import sys
from tqdm import tqdm
from fake_useragent import UserAgent
import json
from peewee import *
from datetime import datetime
import sys
import os
from bs4 import BeautifulSoup

In [4]:
# SQLite database file shared by every model in this notebook (bound via BaseModel.Meta).
database = SqliteDatabase('2ch.db')

class BaseModel(Model):
    """Base model whose Meta binds all subclasses to the module-level `database`."""
    class Meta:
        database = database

class Board(BaseModel):
    """A board, identified by its key string (save_board stores the catalog's 'Board' value here)."""
    # The key doubles as the primary key, so a board can only be inserted once.
    key = CharField(primary_key=True)
        
class Icon(BaseModel):
    """An icon offered by a board; the icon name is the primary key."""
    name = CharField(primary_key=True)
    # Owning board; the reverse accessor on Board is `icons`.
    board = ForeignKeyField(Board, backref='icons')
    
class Thread(BaseModel):
    """A discussion thread on a board, keyed by its thread number."""

    # Thread number as reported by the catalog; used as the primary key.
    num = IntegerField(primary_key=True)
    # Thread subject line.
    subject = CharField()
    # Board the thread lives on; the reverse accessor on Board is `threads`.
    board = ForeignKeyField(Board, backref='threads')
    
class Post(BaseModel):
    """A single post inside a thread, keyed by its post number."""

    # Post number as reported by the thread JSON; used as the primary key.
    num = IntegerField(primary_key=True)
    # Poster name as reported by the thread JSON.
    name = CharField()
    # Post body. TextField rather than CharField: CharField is VARCHAR(255) by
    # default and post bodies routinely exceed that (SQLite does not enforce
    # the limit, but stricter backends do).
    comment = TextField()
    # Value of the 'number' key in the thread JSON
    # (presumably the post's ordinal within the thread -- confirm against the API).
    number = IntegerField()
    # Value of the 'timestamp' key in the thread JSON.
    timestamp = TimestampField()
    # Optional icon attached to the post; references Icon by its name.
    icon = ForeignKeyField(Icon, null=True)
    # Thread the post belongs to; the reverse accessor on Thread is `posts`.
    thread = ForeignKeyField(Thread, backref='posts')

class File(BaseModel):
    """An attachment seen on at least one post, keyed by its md5."""

    # 'fullname' value reported for the attachment.
    fullname = CharField()
    # Primary key; save_board falls back to the attachment's 'name' when no md5 is given.
    md5 = CharField(primary_key=True)
    # 'size' value reported for the attachment.
    size = IntegerField()
    # Thumbnail path on the site, or the literal string 'none' when there is no thumbnail.
    path = CharField()
    
class PostFileRelationship(BaseModel):
    """Join table linking posts to the files attached to them."""

    # Reverse accessor on Post is `files`.
    post = ForeignKeyField(Post, backref='files')
    # Reverse accessor on File is `posts`.
    file = ForeignKeyField(File, backref='posts')
    
class Like(BaseModel):
    """A snapshot of a post's like/dislike counters at one point in time."""

    # Post the counters belong to; the reverse accessor on Post is `likes`.
    post = ForeignKeyField(Post, backref='likes')
    likes = IntegerField()
    dislikes = IntegerField()
    # When the snapshot was stored; save_board never passes a value, so the
    # field's default applies.
    date = TimestampField()
    
# Open the connection; reuse_if_open keeps the cell re-runnable instead of
# raising when the kernel already holds an open connection.
database.connect(reuse_if_open=True)
# create_tables only creates tables that do not exist yet, so this is safe to repeat.
database.create_tables([Board, Icon, Thread, Post, File, Like, PostFileRelationship])
# Directory for downloaded thumbnails. exist_ok tolerates an existing directory
# while still surfacing real failures (e.g. permissions), which the former
# bare `except: pass` silently hid.
os.makedirs('files', exist_ok=True)

In [5]:
_SITE_URL = 'https://2ch.hk'
_REQUEST_TIMEOUT = 30   # seconds; without a timeout a stalled connection hangs the crawler forever
_THREAD_DELAY = 0.3     # seconds to wait between thread requests


def _icon_title(post):
    """Return the `title` of the <img> in the post's icon markup, or None.

    An explicit parser is passed and the tag is located with find(), so the
    result does not depend on which parser happens to be installed
    (html.parser does not synthesise <html>/<body> wrappers).
    """
    markup = post.get('icon')
    if not markup:
        return None
    img = BeautifulSoup(markup, 'html.parser').find('img')
    if img is None:
        return None
    return img.get('title')


def _fetch_thread_posts(board_key, thread_num, user_agent):
    """Download a thread's JSON and return its list of posts.

    Returns None when the server does not answer with HTTP 200.
    Network errors propagate to the caller.
    """
    response = requests.get(
        f'{_SITE_URL}/{board_key}/res/{thread_num}.json',
        headers={'User-Agent': user_agent.chrome},
        timeout=_REQUEST_TIMEOUT,
    )
    if response.status_code != 200:
        return None
    return response.json()['threads'][0]['posts']


def _download_thumbnail(thumbnail_path, md5):
    """Fetch a thumbnail from the site and store it as files/<md5>.jpg."""
    response = requests.get(_SITE_URL + thumbnail_path, timeout=_REQUEST_TIMEOUT)
    # Do not write error pages to disk under a .jpg name.
    response.raise_for_status()
    with open(os.path.join('files', md5 + '.jpg'), 'wb') as out:
        out.write(response.content)


def _save_file(file_info):
    """Return the File row for an attachment, creating it (and fetching its thumbnail) if new.

    Raises KeyError when the attachment lacks 'fullname', 'size', or both
    'md5' and 'name'.
    """
    md5 = file_info['md5'] if 'md5' in file_info else file_info['name']
    thumbnail = file_info.get('thumbnail')
    record, created = File.get_or_create(
        md5=md5,
        defaults={
            'fullname': file_info['fullname'],
            'size': file_info['size'],
            'path': thumbnail if thumbnail is not None else 'none',
        },
    )
    if created and thumbnail is not None:
        try:
            _download_thumbnail(thumbnail, md5)
        except (requests.RequestException, OSError) as error:
            # Best effort: a missing thumbnail must not lose the post itself.
            print('thumbnail download failed', md5, error)
    return record


def _record_likes(post_row, post):
    """Store a Like snapshot when the counters differ from the latest stored one."""
    likes = post.get('likes', 0)
    dislikes = post.get('dislikes', 0)
    if likes <= 0 and dislikes <= 0:
        return
    latest = (
        Like.select()
        .where(Like.post == post_row)
        .order_by(Like.date.desc())
        .first()
    )
    if latest is not None and latest.likes == likes and latest.dislikes == dislikes:
        return
    Like.create(post=post_row, likes=likes, dislikes=dislikes)


def _save_post(post, thread_row):
    """Insert the post if unseen, link its files, then record its like counters.

    Files are linked only when the post row is newly created, so re-crawling a
    thread does not duplicate PostFileRelationship rows.
    """
    post_row, created = Post.get_or_create(
        num=post['num'],
        defaults={
            'name': post['name'],
            'comment': post['comment'],
            'number': post['number'],
            'timestamp': post['timestamp'],
            'icon': _icon_title(post),
            'thread': thread_row,
        },
    )
    if created:
        # `or []` covers both a missing key and an explicit null.
        for file_info in post.get('files') or []:
            try:
                record = _save_file(file_info)
            except KeyError as missing:
                print('skipping file without', missing, 'in post', post['num'])
                continue
            PostFileRelationship.create(post=post_row, file=record)
    # post_row is always a real model instance here (never a query object),
    # so it is safe to use as a foreign-key value.
    _record_likes(post_row, post)


def save_board(board):
    """Persist a board catalog and crawl every thread listed in it.

    Parameters
    ----------
    board : dict
        Parsed catalog JSON. Reads the keys 'Board' (board key), 'icons'
        (optional list of dicts with 'name') and 'threads' (list of dicts
        with 'num' and 'subject').

    For each thread the thread JSON is downloaded from the board named by
    board['Board'], unseen posts and their attachments are stored, and a new
    Like snapshot is written whenever a post's counters changed.

    Each thread is written in its own transaction, after its JSON has been
    fetched, so no transaction stays open across the per-thread HTTP requests
    and sleeps. Thumbnail downloads for new files still happen inside the
    thread's transaction.

    Raises
    ------
    requests.RequestException
        If a thread request fails at the network level.
    KeyError
        If the catalog itself lacks 'Board' or 'threads'.
    """
    board_row, _ = Board.get_or_create(key=board['Board'])

    for icon in board.get('icons') or []:
        # Icon has no `num` column, so only the name and owning board are stored.
        Icon.get_or_create(name=icon['name'], defaults={'board': board_row})

    # One UserAgent instance per crawl; `.chrome` still yields a UA string per request.
    user_agent = UserAgent()

    for thread in board['threads']:
        time.sleep(_THREAD_DELAY)
        thread_row, _ = Thread.get_or_create(
            num=thread['num'],
            defaults={'subject': thread['subject'], 'board': board_row},
        )
        posts = _fetch_thread_posts(board_row.key, thread_row.num, user_agent)
        if posts is None:
            continue
        with database.atomic():
            for post in tqdm(posts):
                try:
                    _save_post(post, thread_row)
                except KeyError as missing:
                    # Keep the crawl going when a single post is malformed.
                    print('skipping malformed post', post.get('num'), 'missing', missing)
                
thread_catalog_url = 'https://2ch.hk/po/catalog.json'
MAX_ATTEMPTS = 10
RETRY_DELAY_SECONDS = 5
POLL_INTERVAL_SECONDS = 5 * 60
REQUEST_TIMEOUT_SECONDS = 30

# Poll the catalog forever; stop with Kernel -> Interrupt (KeyboardInterrupt is
# not an Exception subclass, so the retry handler below does not swallow it).
while True:
    print('start', datetime.today())
    for attempt in range(MAX_ATTEMPTS):
        try:
            # Fetch inside the retry so that a network failure is retried too,
            # and every retry works on a fresh catalog rather than a stale one.
            catalog_json = requests.get(
                thread_catalog_url,
                headers={'User-Agent': UserAgent().chrome},
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
            catalog_json.raise_for_status()
            save_board(catalog_json.json())
            break
        except Exception as e:
            print('retry', datetime.today(), 'after', e)
            time.sleep(RETRY_DELAY_SECONDS)
    else:
        # Runs only when no attempt reached `break`.
        print('giving up after', MAX_ATTEMPTS, 'attempts', datetime.today())
    print('complete and waiting', datetime.today())
    time.sleep(POLL_INTERVAL_SECONDS)


start 2020-08-01 16:11:08.745575


100%|██████████| 139/139 [00:01<00:00, 130.17it/s]
100%|██████████| 12/12 [00:00<00:00, 49.60it/s]
100%|██████████| 94/94 [00:01<00:00, 91.50it/s]
100%|██████████| 1/1 [00:00<00:00, 10.11it/s]
100%|██████████| 257/257 [00:05<00:00, 47.78it/s]
100%|██████████| 432/432 [00:06<00:00, 63.21it/s]
100%|██████████| 3/3 [00:00<00:00, 51.98it/s]
100%|██████████| 2/2 [00:00<00:00, 22.08it/s]
100%|██████████| 32/32 [00:00<00:00, 46.52it/s]
100%|██████████| 18/18 [00:00<00:00, 65.17it/s]
100%|██████████| 5/5 [00:00<00:00, 11.35it/s]
100%|██████████| 216/216 [00:04<00:00, 44.29it/s]
100%|██████████| 41/41 [00:00<00:00, 88.75it/s] 
100%|██████████| 30/30 [00:00<00:00, 362.65it/s]
100%|██████████| 71/71 [00:00<00:00, 187.96it/s]
100%|██████████| 259/259 [00:01<00:00, 138.35it/s]
100%|██████████| 25/25 [00:00<00:00, 80.81it/s]
100%|██████████| 107/107 [00:00<00:00, 114.24it/s]
100%|██████████| 2/2 [00:00<00:00, 33.28it/s]
100%|██████████| 473/473 [00:21<00:00, 21.94it/s]
100%|██████████| 464/464 [00:0

100%|██████████| 28/28 [00:00<00:00, 246.05it/s]
100%|██████████| 12/12 [00:00<00:00, 166.78it/s]
100%|██████████| 64/64 [00:01<00:00, 51.00it/s]
100%|██████████| 109/109 [00:01<00:00, 74.77it/s]
100%|██████████| 134/134 [00:01<00:00, 102.73it/s]


complete and waiting 2020-08-01 16:17:13.194789


KeyboardInterrupt: 