In [1]:
!pip install fake_useragent



In [2]:
!pip install peewee



In [48]:
import requests
import re
import pandas as pd
import time
import sys
from tqdm import tqdm
from fake_useragent import UserAgent
import json
from peewee import *
from datetime import datetime
import sys
import os

In [41]:
# SQLite database (file '2ch.db') that the models below are bound to through BaseModel.Meta.
database = SqliteDatabase('2ch.db')

class BaseModel(Model):
    """Base class for the notebook's models; binds subclasses to the shared database."""
    class Meta:
        # The right-hand side resolves to the module-level `database` object.
        database = database

class Board(BaseModel):
    """A board, identified by its string key (save_board fills it from board['Board'])."""
    # The key itself is the primary key; there is no separate numeric id column.
    key = CharField(primary_key=True)
        
class Icon(BaseModel):
    """An icon entry of a board's catalog, keyed by its number."""
    name = CharField()
    # `num` is the primary key, so an icon number can be stored only once.
    num = IntegerField(primary_key=True)
    # Owning board; the reverse accessor on Board is `icons`.
    board = ForeignKeyField(Board, backref='icons')
    
class Thread(BaseModel):
    """A thread listed in a board's catalog, keyed by its thread number."""
    num = IntegerField(primary_key=True)
    subject = CharField()
    # Owning board; the reverse accessor on Board is `threads`.
    board = ForeignKeyField(Board, backref='threads')
    
class Post(BaseModel):
    """A single post of a thread, keyed by its post number."""
    num = IntegerField(primary_key=True)
    name = CharField()
    # Post bodies have no fixed upper length, so they are stored as TEXT.
    # CharField would declare the column as VARCHAR(255).
    comment = TextField()
    # Value of the post's 'number' field as delivered by the thread JSON.
    number = IntegerField()
    timestamp = TimestampField()
    # Owning thread; the reverse accessor on Thread is `posts`.
    thread = ForeignKeyField(Thread, backref='posts')

class File(BaseModel):
    """An attachment record, identified by its `md5` primary key."""
    fullname = CharField()
    md5 = CharField(primary_key=True)
    size = IntegerField()
    # save_board stores the thumbnail path here, or the string 'none' when there is none.
    path = CharField()
    
class PostFileRelationship(BaseModel):
    """Link row connecting one Post with one File."""
    # Reverse accessors: `post.files` and `file.posts` yield these link rows.
    post = ForeignKeyField(Post, backref='files')
    file = ForeignKeyField(File, backref='posts')
    
class Like(BaseModel):
    """Like/dislike counters recorded for a post; save_board adds one row per post per run."""
    # Reverse accessor on Post is `likes`.
    post = ForeignKeyField(Post, backref='likes')
    likes = IntegerField()
    dislikes = IntegerField()
    # save_board never passes `date`, so the field's own default applies.
    date = TimestampField()
    
# Open the connection; reuse_if_open avoids an error if it is already open.
database.connect(reuse_if_open=True)
# create_tables only creates tables that do not exist yet (peewee default safe=True).
database.create_tables([Board, Icon, Thread, Post, File, Like, PostFileRelationship])
# Thumbnails are written into ./files; exist_ok keeps this cell re-runnable
# (plain os.makedirs raises FileExistsError on the second run).
os.makedirs('files', exist_ok=True)

In [None]:
# Seconds to wait for a single HTTP response before the request is abandoned.
_REQUEST_TIMEOUT = 30


def _download_thumbnail(thumbnail_path, md5):
    """Best-effort download of one thumbnail into ``files/<md5>.jpg``.

    Network and file-system errors are reported and skipped so that a single
    broken thumbnail does not abort the whole snapshot.
    """
    url = 'https://2ch.hk' + thumbnail_path
    try:
        r = requests.get(url, timeout=_REQUEST_TIMEOUT)
        if r.status_code != 200:
            # Do not store an error page under a .jpg name.
            return
        # The context manager closes the file handle even if the write fails.
        with open('files/' + md5 + '.jpg', 'wb') as out:
            out.write(r.content)
    except (requests.RequestException, OSError) as exc:
        print('thumbnail download failed', url, repr(exc))


def _save_file(f):
    """Return the File row for one attachment dict, creating it if needed.

    The thumbnail is downloaded only when the row is newly created.
    """
    # Some attachments carry no 'md5'; their 'name' is used as the key instead.
    md5 = f['md5'] if 'md5' in f else f['name']
    file, created = File.get_or_create(
        md5=md5,
        defaults={
            'fullname': f['fullname'],
            'size': f['size'],
            'path': f['thumbnail'] if 'thumbnail' in f else 'none',
        },
    )
    content_path = f.get('thumbnail')
    if created and content_path is not None:
        _download_thumbnail(content_path, md5)
    return file


def _save_post(post, thread):
    """Store one post dict under ``thread`` and record its current like counters."""
    p, created = Post.get_or_create(
        num=post['num'],
        defaults={
            'name': post['name'],
            'comment': post['comment'],
            'number': post['number'],
            'timestamp': post['timestamp'],
            'thread': thread,
        },
    )
    if created:
        # Attachments are linked only the first time a post is seen.
        for f in post.get('files') or []:
            PostFileRelationship.create(post=p, file=_save_file(f))
    # A Like row is added on every call, for new and already-known posts alike.
    Like.create(
        post=p,
        likes=post['likes'],
        dislikes=post['dislikes']
    )


def save_board(board):
    """Persist one catalog snapshot of a board, including all of its threads' posts.

    Parameters
    ----------
    board : dict
        Parsed catalog JSON. The keys read here are 'Board', 'icons' (optional)
        and 'threads'; each thread needs 'num' and 'subject'.

    Everything runs inside one ``database.atomic()`` block, so an exception
    rolls back the whole snapshot. Rows that already exist are reused via
    ``get_or_create`` instead of being detected through a failed INSERT.

    Raises
    ------
    KeyError
        If a required key is missing from the catalog, a thread or a post.
    requests.RequestException
        If fetching a thread fails or times out.
    """
    # Built once per call; `.chrome` is still read per request, as before.
    user_agent = UserAgent()
    with database.atomic():
        b, _ = Board.get_or_create(key=board['Board'])
        for icon in board.get('icons') or []:
            try:
                Icon.get_or_create(
                    num=icon['num'],
                    defaults={'name': icon['name'], 'board': b},
                )
            except KeyError as exc:
                # Icons are auxiliary: skip a malformed entry rather than abort.
                print('skipping malformed icon', repr(exc))
        for thread in board['threads']:
            # Pause between thread requests.
            time.sleep(0.3)
            t, _ = Thread.get_or_create(
                num=thread['num'],
                defaults={'subject': thread['subject'], 'board': b},
            )
            # NOTE: the thread URL is hard-coded to the /po/ board.
            thread_json = requests.get(
                f'https://2ch.hk/po/res/{t.num}.json',
                headers={'User-Agent': user_agent.chrome},
                timeout=_REQUEST_TIMEOUT,
            )
            # Threads that do not answer with 200 are skipped silently.
            if thread_json.status_code == 200:
                for post in tqdm(thread_json.json()['threads'][0]['posts']):
                    _save_post(post, t)

                
# Poll the /po/ catalog forever: take a snapshot, then wait five minutes.
thread_catalog_url = 'https://2ch.hk/po/catalog.json'
while True:
    print('start', datetime.today())
    for attempt in range(10):
        try:
            # Fetch inside the retry loop so a failed or stale response is
            # requested again instead of being reused for every attempt.
            catalog_json = requests.get(
                thread_catalog_url,
                headers={'User-Agent': UserAgent().chrome},
                timeout=30,  # seconds; without it a stalled connection hangs forever
            )
            catalog_json.raise_for_status()
            save_board(catalog_json.json())
            break
        except Exception as exc:
            # `Exception` (not a bare except) lets KeyboardInterrupt stop the cell.
            print('retry', datetime.today(), repr(exc))
            time.sleep(5)
    else:
        # Runs only when no attempt reached `break`.
        print('giving up after 10 attempts', datetime.today())
    print('complete and waiting', datetime.today())
    time.sleep(5 * 60)


start 2020-07-18 11:13:24.536035


100%|██████████| 386/386 [00:00<00:00, 766.44it/s] 
100%|██████████| 188/188 [00:00<00:00, 1547.29it/s]
100%|██████████| 46/46 [00:00<00:00, 1502.42it/s]
100%|██████████| 117/117 [00:00<00:00, 1493.76it/s]
100%|██████████| 227/227 [00:00<00:00, 484.44it/s] 
100%|██████████| 29/29 [00:00<00:00, 191.29it/s]
100%|██████████| 414/414 [00:00<00:00, 1676.66it/s]
100%|██████████| 35/35 [00:00<00:00, 1337.82it/s]
100%|██████████| 373/373 [00:00<00:00, 1101.06it/s]
100%|██████████| 95/95 [00:00<00:00, 847.34it/s]
100%|██████████| 3/3 [00:00<00:00, 25.69it/s]
100%|██████████| 77/77 [00:00<00:00, 754.13it/s]
100%|██████████| 32/32 [00:00<00:00, 1444.38it/s]
100%|██████████| 3/3 [00:00<00:00, 833.31it/s]
100%|██████████| 6/6 [00:00<00:00, 1438.21it/s]
100%|██████████| 282/282 [00:00<00:00, 1684.57it/s]
100%|██████████| 11/11 [00:00<00:00, 1505.79it/s]
100%|██████████| 23/23 [00:00<00:00, 283.48it/s]
100%|██████████| 9/9 [00:00<00:00, 1246.49it/s]
100%|██████████| 87/87 [00:00<00:00, 1508.08it/s]
1

100%|██████████| 9/9 [00:00<00:00, 1238.27it/s]
100%|██████████| 34/34 [00:00<00:00, 1371.02it/s]
100%|██████████| 35/35 [00:00<00:00, 1376.12it/s]
100%|██████████| 11/11 [00:00<00:00, 1202.84it/s]
100%|██████████| 40/40 [00:00<00:00, 1412.77it/s]
100%|██████████| 119/119 [00:00<00:00, 1582.49it/s]
100%|██████████| 12/12 [00:00<00:00, 1226.23it/s]
100%|██████████| 16/16 [00:00<00:00, 1287.41it/s]
100%|██████████| 95/95 [00:00<00:00, 1530.04it/s]
100%|██████████| 108/108 [00:00<00:00, 1619.43it/s]
100%|██████████| 102/102 [00:00<00:00, 1499.69it/s]
100%|██████████| 62/62 [00:00<00:00, 1473.48it/s]


complete and waiting 2020-07-18 11:14:49.119581
