In [1]:
!pip install fake_useragent



In [2]:
!pip install peewee



In [3]:
import requests
import re
import pandas as pd
import time
import sys
from tqdm import tqdm
from fake_useragent import UserAgent
import json
from peewee import *
from datetime import datetime
import sys
import os

In [4]:
# SQLite database file (relative to the working directory) shared by every model below.
database = SqliteDatabase('2ch.db')

class BaseModel(Model):
    """Common base class that binds every derived model to the module-level ``database``."""
    class Meta:
        # Refers to the module-level SqliteDatabase instance.
        database = database

class Board(BaseModel):
    """A board, identified solely by its string ``key`` (the primary key)."""
    key = CharField(primary_key=True)
        
class Icon(BaseModel):
    """An icon belonging to a board; reachable from the board as ``board.icons``."""
    name = CharField()
    # `num` alone is the primary key, so it must be unique across all boards stored.
    num = IntegerField(primary_key=True)
    board = ForeignKeyField(Board, backref='icons')
    
class Thread(BaseModel):
    """A thread on a board, keyed by its number; reachable as ``board.threads``."""
    num = IntegerField(primary_key=True)
    subject = CharField()
    board = ForeignKeyField(Board, backref='threads')
    
class Post(BaseModel):
    """A single post inside a thread, keyed by its number; reachable as ``thread.posts``."""
    num = IntegerField(primary_key=True)
    name = CharField()
    # Post bodies have no length bound, so use TextField rather than CharField,
    # which declares VARCHAR(255). SQLite does not enforce that limit, but other
    # backends do and the declared type should match the data.
    comment = TextField()
    # Value of the post's 'number' field as delivered by the API.
    number = IntegerField()
    timestamp = TimestampField()
    thread = ForeignKeyField(Thread, backref='posts')

class File(BaseModel):
    """An attachment, de-duplicated by ``md5`` (the primary key)."""
    fullname = CharField()
    md5 = CharField(primary_key=True)
    size = IntegerField()
    # Holds the thumbnail path reported by the API, or the literal 'none'.
    path = CharField()
    
class PostFileRelationship(BaseModel):
    """Join table linking posts and files (``post.files`` / ``file.posts``)."""
    post = ForeignKeyField(Post, backref='files')
    file = ForeignKeyField(File, backref='posts')
    
class Like(BaseModel):
    """A snapshot of a post's like/dislike counters; reachable as ``post.likes``."""
    post = ForeignKeyField(Post, backref='likes')
    likes = IntegerField()
    dislikes = IntegerField()
    # Never set explicitly by the code in this notebook; relies on the field default.
    date = TimestampField()
    
# Re-running this cell must not fail: reuse an already-open connection instead
# of raising, and let create_tables skip tables that already exist.
database.connect(reuse_if_open=True)
database.create_tables([Board, Icon, Thread, Post, File, Like, PostFileRelationship])

# Directory for downloaded thumbnails. exist_ok tolerates an existing directory
# while still surfacing real failures (e.g. permissions) that a bare except hid.
os.makedirs('files', exist_ok=True)

In [None]:
def _download_thumbnail(thumbnail_path, md5, headers, timeout=30):
    """Fetch one thumbnail from 2ch.hk and store it as ``files/<md5>.jpg``.

    Download and write failures are reported via ``tqdm.write`` and otherwise
    ignored, so a missing image never aborts the surrounding database work.
    """
    url = 'https://2ch.hk' + thumbnail_path
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Do not store an HTML error page under a .jpg name.
        response.raise_for_status()
        # The context manager closes the handle even if the write fails.
        with open('files/' + md5 + '.jpg', 'wb') as out:
            out.write(response.content)
    except (requests.RequestException, OSError) as exc:
        tqdm.write(f'thumbnail download failed: {url} ({exc!r})')


def _save_post_files(post_row, post, user_agent):
    """Store the attachments of a newly created post and link them to it.

    A file that is already known (same md5) is reused rather than re-created,
    and its thumbnail is not downloaded again.
    """
    for f in post.get('files') or []:
        try:
            # Some entries carry no md5; fall back to the file name as the key.
            md5 = f['md5'] if 'md5' in f else f['name']
            file_row, is_new = File.get_or_create(
                md5=md5,
                defaults={
                    'fullname': f['fullname'],
                    'size': f['size'],
                    'path': f['thumbnail'] if 'thumbnail' in f else 'none',
                },
            )
        except (KeyError, IntegrityError) as exc:
            # Malformed attachment record: skip it, keep processing the post.
            tqdm.write(f'skipping file of post {post_row.num}: {exc!r}')
            continue

        thumbnail_path = f.get('thumbnail')
        if is_new and thumbnail_path is not None:
            _download_thumbnail(thumbnail_path, md5, {'User-Agent': user_agent.chrome})

        # Always pass a model instance here, never a Select query.
        PostFileRelationship.create(post=post_row, file=file_row)


def _record_likes(post_row, post):
    """Append a Like snapshot when the counters differ from the last stored one."""
    likes = post.get('likes') or 0
    dislikes = post.get('dislikes') or 0
    if likes <= 0 and dislikes <= 0:
        return

    latest = (
        Like.select()
        .where(Like.post == post_row)
        .order_by(Like.date.desc())
        .first()
    )
    if latest is not None and latest.likes == likes and latest.dislikes == dislikes:
        return
    Like.create(post=post_row, likes=likes, dislikes=dislikes)


def save_board(board):
    """Persist one catalog snapshot: board, icons, threads, posts, files, likes.

    Parameters
    ----------
    board : dict
        Parsed catalog JSON. The keys read here are ``'Board'``, ``'icons'``
        (items with ``'name'`` and ``'num'``) and ``'threads'`` (items with
        ``'num'`` and ``'subject'``).

    Every thread's full JSON is fetched from the site, and existing rows are
    reused rather than duplicated. The whole call runs in one transaction, so
    an exception that propagates (for example a network error while fetching
    a thread) rolls back everything written by this call.
    """
    # Build the UserAgent helper once; `.chrome` is still evaluated per request.
    user_agent = UserAgent()

    with database.atomic():
        board_row, _ = Board.get_or_create(key=board['Board'])

        for icon in board['icons']:
            try:
                Icon.get_or_create(
                    num=icon['num'],
                    defaults={'name': icon['name'], 'board': board_row},
                )
            except (KeyError, IntegrityError) as exc:
                # Icons are best-effort: a bad record must not stop the import.
                print('skipping icon', repr(exc))

        for thread in board['threads']:
            time.sleep(0.3)  # throttle requests to the site
            thread_row, _ = Thread.get_or_create(
                num=thread['num'],
                defaults={'subject': thread['subject'], 'board': board_row},
            )

            # NOTE(review): the board segment is hard-coded to 'po' as in the
            # original; it presumably should follow board['Board'] — confirm.
            thread_json = requests.get(
                f'https://2ch.hk/po/res/{thread_row.num}.json',
                headers={'User-Agent': user_agent.chrome},
                timeout=30,  # never hang the whole transaction on a stalled socket
            )
            if thread_json.status_code != 200:
                continue

            for post in tqdm(thread_json.json()['threads'][0]['posts']):
                try:
                    post_row, is_new = Post.get_or_create(
                        num=post['num'],
                        defaults={
                            'name': post['name'],
                            'comment': post['comment'],
                            'number': post['number'],
                            'timestamp': post['timestamp'],
                            'thread': thread_row,
                        },
                    )
                except (KeyError, IntegrityError) as exc:
                    tqdm.write(f'skipping post: {exc!r}')
                    continue

                # Attachments are only processed the first time a post is seen.
                if is_new:
                    _save_post_files(post_row, post, user_agent)
                _record_likes(post_row, post)
                
# Poll the catalog forever: one import pass, then a five-minute pause.
thread_catalog_url = 'https://2ch.hk/po/catalog.json'
while True:
    print('start', datetime.today())
    for i in range(10):
        try:
            # Fetch inside the retry so a transient network error is retried
            # instead of terminating the loop, and so each retry gets fresh data.
            catalog_json = requests.get(
                thread_catalog_url,
                headers={'User-Agent': UserAgent().chrome},
                timeout=30,
            )
            catalog_json.raise_for_status()
            save_board(catalog_json.json())
            break
        except Exception as exc:
            # `Exception` (not a bare except) keeps the kernel's Interrupt usable.
            print('retry', datetime.today(), repr(exc))
            time.sleep(5)
    else:
        # All attempts failed; say so instead of silently reporting completion.
        print('giving up after 10 attempts', datetime.today())
    print('complete and waiting', datetime.today())
    time.sleep(5 * 60)


start 2020-07-22 07:17:59.570105


100%|██████████| 461/461 [00:00<00:00, 1984.58it/s]
100%|██████████| 475/475 [00:00<00:00, 1677.72it/s]
100%|██████████| 250/250 [00:00<00:00, 1363.80it/s]
100%|██████████| 151/151 [00:00<00:00, 1942.11it/s]
100%|██████████| 354/354 [00:00<00:00, 1616.20it/s]
100%|██████████| 301/301 [00:00<00:00, 2965.96it/s]
100%|██████████| 14/14 [00:00<00:00, 1500.07it/s]
100%|██████████| 67/67 [00:00<00:00, 1377.66it/s]
100%|██████████| 482/482 [00:00<00:00, 1666.45it/s]
100%|██████████| 16/16 [00:00<00:00, 301.68it/s]
100%|██████████| 163/163 [00:00<00:00, 1797.31it/s]
100%|██████████| 210/210 [00:00<00:00, 794.05it/s] 
100%|██████████| 127/127 [00:00<00:00, 1158.45it/s]
100%|██████████| 139/139 [00:00<00:00, 1973.45it/s]
100%|██████████| 20/20 [00:00<00:00, 1295.00it/s]
100%|██████████| 275/275 [00:00<00:00, 3094.75it/s]
100%|██████████| 170/170 [00:00<00:00, 1590.34it/s]
100%|██████████| 9/9 [00:00<00:00, 57.94it/s]
100%|██████████| 75/75 [00:00<00:00, 1684.52it/s]
100%|██████████| 14/14 [00:00

100%|██████████| 178/178 [00:00<00:00, 2007.15it/s]
100%|██████████| 4/4 [00:00<00:00, 1460.54it/s]
100%|██████████| 3/3 [00:00<00:00, 1029.87it/s]
100%|██████████| 522/522 [00:00<00:00, 2297.15it/s]
100%|██████████| 519/519 [00:00<00:00, 1788.85it/s]
100%|██████████| 72/72 [00:00<00:00, 2152.40it/s]
100%|██████████| 26/26 [00:00<00:00, 1594.21it/s]
100%|██████████| 12/12 [00:00<00:00, 1298.58it/s]
100%|██████████| 1/1 [00:00<00:00, 697.42it/s]
100%|██████████| 69/69 [00:00<00:00, 1835.40it/s]
100%|██████████| 1/1 [00:00<00:00, 588.67it/s]
100%|██████████| 8/8 [00:00<00:00, 1422.76it/s]


complete and waiting 2020-07-22 07:19:13.986202
