In [1]:
!pip install fake_useragent



In [None]:
!pip install peewee

In [7]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import time
import sys
from tqdm import tqdm
from fake_useragent import UserAgent
import json
from peewee import *
import datetime
import sys
import os

In [8]:
database = SqliteDatabase('2ch.db')

class BaseModel(Model):
    class Meta:
        database = database

class Board(BaseModel):
    key = CharField(primary_key=True)
        
class Icon(BaseModel):
    name = CharField()
    num = IntegerField(primary_key=True)
    board = ForeignKeyField(Board, backref='icons')
    
class Thread(BaseModel):
    num=IntegerField(primary_key=True)
    subject=CharField()
    board = ForeignKeyField(Board, backref='threads')
    
class Post(BaseModel):
    num=IntegerField(primary_key=True)
    name=CharField()
    comment=CharField()
    number=IntegerField()
    timestamp=TimestampField()
    thread=ForeignKeyField(Thread, backref='posts')

class File(BaseModel):
    fullname=CharField()
    md5=CharField(primary_key=True)
    path=CharField()
    post=ForeignKeyField(Post, backref='files')
    
class Like(BaseModel):
    post=ForeignKeyField(Post, backref='likes')
    likes=IntegerField()
    dislikes=IntegerField()
    date=TimestampField()
    
database.connect()
database.create_tables([Board, Icon, Thread, Post, File, Like])

In [9]:
def save_board(board):
    with database.atomic():
        try:
            b = Board.create(
                key=board['Board']
            )
        except:
            b = Board.select().where(Board.key == board['Board'])[0]
        for icon in board['icons']:
            try:
                Icon.create(
                    name=icon['name'],
                    num=icon['num'],
                    board=b
                )
            except:
                pass
        for thread in board['threads']:
            time.sleep(0.3)
            try:
                t = Thread.create(
                    num=thread['num'],
                    subject=thread['subject'],
                    board = b
                )
            except:
                t = Thread.select().where(Thread.num == thread['num'])[0]
            thread_json = requests.get(f'https://2ch.hk/po/res/{t.num}.json', headers={'User-Agent': UserAgent().chrome})
            if thread_json.status_code == 200:
                for post in tqdm(thread_json.json()['threads'][0]['posts']):
                    try:
                        p = Post.create(
                            num=post['num'],
                            name=post['name'],
                            comment=post['comment'],
                            number=post['number'],
                            timestamp=post['timestamp'],
                            thread = t
                        )
                    except:
                        p = Post.select().where(Post.num == post['num'])
                    Like.create(
                        post=p,
                        likes=post['likes'],
                        dislikes=post['dislikes']
                    )
                    for f in post['files']:
                        try:
                            File.create(
                                fullname=f['fullname'],
                                md5=f['md5'],
                                path=f['path'],
                                post = p
                            )
                        except:
                            pass                    
                
thread_catalog_url = 'https://2ch.hk/po/catalog.json'
while True:
    catalog_json = requests.get(thread_catalog_url, headers={'User-Agent': UserAgent().chrome})
    save_board(catalog_json.json())
    time.sleep(5 * 60)


100%|██████████| 133/133 [00:00<00:00, 1805.13it/s]
100%|██████████| 48/48 [00:00<00:00, 1737.69it/s]
100%|██████████| 95/95 [00:00<00:00, 2146.74it/s]
100%|██████████| 20/20 [00:00<00:00, 1624.69it/s]
100%|██████████| 8/8 [00:00<00:00, 1411.69it/s]
100%|██████████| 30/30 [00:00<00:00, 1682.95it/s]
100%|██████████| 79/79 [00:00<00:00, 1900.96it/s]
100%|██████████| 85/85 [00:00<00:00, 1836.73it/s]
100%|██████████| 92/92 [00:00<00:00, 1971.79it/s]
100%|██████████| 2/2 [00:00<00:00, 912.50it/s]
100%|██████████| 418/418 [00:00<00:00, 2008.77it/s]
100%|██████████| 441/441 [00:00<00:00, 2224.50it/s]
100%|██████████| 452/452 [00:00<00:00, 2237.28it/s]
100%|██████████| 64/64 [00:00<00:00, 1683.15it/s]
100%|██████████| 91/91 [00:00<00:00, 1999.38it/s]
100%|██████████| 341/341 [00:00<00:00, 2120.74it/s]
100%|██████████| 11/11 [00:00<00:00, 1464.07it/s]
100%|██████████| 264/264 [00:00<00:00, 2182.65it/s]
100%|██████████| 21/21 [00:00<00:00, 1384.87it/s]
100%|██████████| 305/305 [00:00<00:00, 2008

100%|██████████| 2/2 [00:00<00:00, 571.28it/s]
100%|██████████| 4/4 [00:00<00:00, 1053.45it/s]
100%|██████████| 5/5 [00:00<00:00, 1687.98it/s]
100%|██████████| 39/39 [00:00<00:00, 1511.81it/s]
100%|██████████| 34/34 [00:00<00:00, 1615.75it/s]
100%|██████████| 6/6 [00:00<00:00, 1029.53it/s]
100%|██████████| 5/5 [00:00<00:00, 942.29it/s]
100%|██████████| 48/48 [00:00<00:00, 1782.93it/s]
100%|██████████| 16/16 [00:00<00:00, 1371.05it/s]
100%|██████████| 4/4 [00:00<00:00, 897.71it/s]
100%|██████████| 11/11 [00:00<00:00, 971.56it/s]
100%|██████████| 105/105 [00:00<00:00, 1392.09it/s]
100%|██████████| 149/149 [00:00<00:00, 686.35it/s]
100%|██████████| 97/97 [00:00<00:00, 1297.07it/s]
100%|██████████| 262/262 [00:00<00:00, 1406.83it/s]
100%|██████████| 24/24 [00:00<00:00, 1242.37it/s]
100%|██████████| 51/51 [00:00<00:00, 1407.35it/s]
100%|██████████| 15/15 [00:00<00:00, 1113.18it/s]
100%|██████████| 93/93 [00:00<00:00, 1453.67it/s]
100%|██████████| 33/33 [00:00<00:00, 1250.94it/s]
100%|██████

100%|██████████| 36/36 [00:00<00:00, 1283.70it/s]
100%|██████████| 8/8 [00:00<00:00, 977.75it/s]
100%|██████████| 3/3 [00:00<00:00, 945.16it/s]
100%|██████████| 8/8 [00:00<00:00, 1023.94it/s]
100%|██████████| 10/10 [00:00<00:00, 984.14it/s]
100%|██████████| 55/55 [00:00<00:00, 1307.62it/s]
100%|██████████| 22/22 [00:00<00:00, 977.07it/s]
100%|██████████| 22/22 [00:00<00:00, 1151.17it/s]
100%|██████████| 118/118 [00:00<00:00, 1435.04it/s]
100%|██████████| 61/61 [00:00<00:00, 1333.19it/s]
100%|██████████| 2/2 [00:00<00:00, 822.01it/s]
100%|██████████| 4/4 [00:00<00:00, 809.59it/s]
100%|██████████| 5/5 [00:00<00:00, 844.88it/s]
100%|██████████| 39/39 [00:00<00:00, 965.01it/s]
100%|██████████| 34/34 [00:00<00:00, 1410.78it/s]
100%|██████████| 6/6 [00:00<00:00, 1071.80it/s]
100%|██████████| 5/5 [00:00<00:00, 718.50it/s]
100%|██████████| 48/48 [00:00<00:00, 1314.00it/s]
100%|██████████| 16/16 [00:00<00:00, 1073.26it/s]
100%|██████████| 4/4 [00:00<00:00, 767.91it/s]
100%|██████████| 11/11 [00

100%|██████████| 1/1 [00:00<00:00, 510.94it/s]
100%|██████████| 23/23 [00:00<00:00, 1041.66it/s]
100%|██████████| 7/7 [00:00<00:00, 923.97it/s]
100%|██████████| 43/43 [00:00<00:00, 1215.58it/s]
100%|██████████| 166/166 [00:00<00:00, 1528.62it/s]
100%|██████████| 41/41 [00:00<00:00, 1365.63it/s]
100%|██████████| 32/32 [00:00<00:00, 1195.82it/s]
100%|██████████| 336/336 [00:00<00:00, 1546.83it/s]
100%|██████████| 37/37 [00:00<00:00, 1241.56it/s]
100%|██████████| 21/21 [00:00<00:00, 1154.43it/s]
100%|██████████| 36/36 [00:00<00:00, 1258.13it/s]
100%|██████████| 8/8 [00:00<00:00, 1125.16it/s]
100%|██████████| 3/3 [00:00<00:00, 979.37it/s]
100%|██████████| 8/8 [00:00<00:00, 1039.22it/s]
100%|██████████| 10/10 [00:00<00:00, 1026.73it/s]
100%|██████████| 55/55 [00:00<00:00, 1358.56it/s]
100%|██████████| 22/22 [00:00<00:00, 1076.59it/s]
100%|██████████| 22/22 [00:00<00:00, 1218.57it/s]
100%|██████████| 118/118 [00:00<00:00, 1472.86it/s]
100%|██████████| 61/61 [00:00<00:00, 1135.26it/s]
100%|██

100%|██████████| 244/244 [00:00<00:00, 1460.89it/s]
100%|██████████| 29/29 [00:00<00:00, 1330.39it/s]
100%|██████████| 4/4 [00:00<00:00, 802.12it/s]
100%|██████████| 312/312 [00:00<00:00, 1478.50it/s]
100%|██████████| 62/62 [00:00<00:00, 1191.07it/s]
100%|██████████| 264/264 [00:00<00:00, 1340.54it/s]
100%|██████████| 89/89 [00:00<00:00, 1402.03it/s]
100%|██████████| 24/24 [00:00<00:00, 1254.04it/s]
100%|██████████| 334/334 [00:00<00:00, 1594.94it/s]
100%|██████████| 9/9 [00:00<00:00, 1199.02it/s]
100%|██████████| 16/16 [00:00<00:00, 1071.87it/s]
100%|██████████| 1/1 [00:00<00:00, 601.33it/s]
100%|██████████| 23/23 [00:00<00:00, 1065.76it/s]
100%|██████████| 7/7 [00:00<00:00, 944.66it/s]
100%|██████████| 43/43 [00:00<00:00, 1177.08it/s]
100%|██████████| 166/166 [00:00<00:00, 1474.15it/s]
100%|██████████| 41/41 [00:00<00:00, 1320.11it/s]
100%|██████████| 32/32 [00:00<00:00, 1313.46it/s]
100%|██████████| 336/336 [00:00<00:00, 1525.83it/s]
100%|██████████| 37/37 [00:00<00:00, 1507.31it/s]

100%|██████████| 17/17 [00:00<00:00, 1124.12it/s]
100%|██████████| 17/17 [00:00<00:00, 1112.77it/s]
100%|██████████| 168/168 [00:00<00:00, 1439.82it/s]
100%|██████████| 63/63 [00:00<00:00, 1218.80it/s]
100%|██████████| 10/10 [00:00<00:00, 897.43it/s]
100%|██████████| 88/88 [00:00<00:00, 1252.71it/s]
100%|██████████| 174/174 [00:00<00:00, 1604.89it/s]
100%|██████████| 14/14 [00:00<00:00, 1168.33it/s]
100%|██████████| 497/497 [00:00<00:00, 1523.27it/s]
100%|██████████| 44/44 [00:00<00:00, 1366.24it/s]
100%|██████████| 244/244 [00:00<00:00, 1461.54it/s]
100%|██████████| 29/29 [00:00<00:00, 1303.32it/s]
100%|██████████| 4/4 [00:00<00:00, 884.36it/s]
100%|██████████| 312/312 [00:00<00:00, 1466.35it/s]
100%|██████████| 62/62 [00:00<00:00, 1370.07it/s]
100%|██████████| 264/264 [00:00<00:00, 1599.47it/s]
100%|██████████| 89/89 [00:00<00:00, 1169.90it/s]
100%|██████████| 24/24 [00:00<00:00, 1345.21it/s]
100%|██████████| 334/334 [00:00<00:00, 1552.90it/s]
100%|██████████| 9/9 [00:00<00:00, 1179.

100%|██████████| 13/13 [00:00<00:00, 1221.49it/s]
100%|██████████| 15/15 [00:00<00:00, 1234.05it/s]
100%|██████████| 64/64 [00:00<00:00, 1291.81it/s]
100%|██████████| 18/18 [00:00<00:00, 1241.71it/s]
100%|██████████| 25/25 [00:00<00:00, 1205.81it/s]
100%|██████████| 88/88 [00:00<00:00, 1409.47it/s]
100%|██████████| 116/116 [00:00<00:00, 1455.02it/s]
100%|██████████| 10/10 [00:00<00:00, 882.84it/s]
100%|██████████| 267/267 [00:00<00:00, 1462.79it/s]
100%|██████████| 3/3 [00:00<00:00, 860.61it/s]
100%|██████████| 11/11 [00:00<00:00, 1068.39it/s]
100%|██████████| 17/17 [00:00<00:00, 1263.19it/s]
100%|██████████| 17/17 [00:00<00:00, 1025.16it/s]
100%|██████████| 168/168 [00:00<00:00, 1484.10it/s]
100%|██████████| 63/63 [00:00<00:00, 1282.08it/s]
100%|██████████| 10/10 [00:00<00:00, 898.46it/s]
100%|██████████| 88/88 [00:00<00:00, 1290.36it/s]
100%|██████████| 174/174 [00:00<00:00, 1510.33it/s]
100%|██████████| 14/14 [00:00<00:00, 1009.06it/s]
100%|██████████| 497/497 [00:00<00:00, 1529.71i

100%|██████████| 8/8 [00:00<00:00, 940.27it/s]
100%|██████████| 15/15 [00:00<00:00, 1135.66it/s]
100%|██████████| 6/6 [00:00<00:00, 1314.35it/s]
100%|██████████| 17/17 [00:00<00:00, 1019.51it/s]
100%|██████████| 32/32 [00:00<00:00, 1043.37it/s]
100%|██████████| 3/3 [00:00<00:00, 877.22it/s]
100%|██████████| 27/27 [00:00<00:00, 1303.85it/s]
100%|██████████| 5/5 [00:00<00:00, 949.11it/s]
100%|██████████| 13/13 [00:00<00:00, 1147.00it/s]
100%|██████████| 508/508 [00:00<00:00, 1430.50it/s]
100%|██████████| 16/16 [00:00<00:00, 1261.33it/s]
100%|██████████| 1/1 [00:00<00:00, 497.54it/s]
100%|██████████| 35/35 [00:00<00:00, 1341.77it/s]
100%|██████████| 13/13 [00:00<00:00, 1130.17it/s]
100%|██████████| 15/15 [00:00<00:00, 1192.53it/s]
100%|██████████| 64/64 [00:00<00:00, 1237.02it/s]
100%|██████████| 18/18 [00:00<00:00, 1146.77it/s]
100%|██████████| 25/25 [00:00<00:00, 1209.30it/s]
100%|██████████| 88/88 [00:00<00:00, 1438.50it/s]
100%|██████████| 116/116 [00:00<00:00, 1361.55it/s]
100%|█████

100%|██████████| 35/35 [00:00<00:00, 1125.63it/s]
100%|██████████| 56/56 [00:00<00:00, 1299.19it/s]
100%|██████████| 29/29 [00:00<00:00, 1174.14it/s]
100%|██████████| 116/116 [00:00<00:00, 1288.49it/s]
100%|██████████| 2/2 [00:00<00:00, 704.75it/s]
100%|██████████| 97/97 [00:00<00:00, 1346.34it/s]
100%|██████████| 73/73 [00:00<00:00, 1336.51it/s]
100%|██████████| 17/17 [00:00<00:00, 1072.21it/s]
100%|██████████| 31/31 [00:00<00:00, 1155.73it/s]
100%|██████████| 21/21 [00:00<00:00, 1167.96it/s]
100%|██████████| 16/16 [00:00<00:00, 1197.97it/s]
100%|██████████| 8/8 [00:00<00:00, 1035.73it/s]
100%|██████████| 15/15 [00:00<00:00, 1174.74it/s]
100%|██████████| 6/6 [00:00<00:00, 877.32it/s]
100%|██████████| 17/17 [00:00<00:00, 886.12it/s]
100%|██████████| 32/32 [00:00<00:00, 1092.49it/s]
100%|██████████| 3/3 [00:00<00:00, 757.05it/s]
100%|██████████| 27/27 [00:00<00:00, 1074.80it/s]
100%|██████████| 5/5 [00:00<00:00, 877.32it/s]
100%|██████████| 13/13 [00:00<00:00, 1402.71it/s]
100%|████████

100%|██████████| 133/133 [00:00<00:00, 1295.08it/s]
100%|██████████| 517/517 [00:00<00:00, 1568.87it/s]
100%|██████████| 18/18 [00:00<00:00, 1122.69it/s]
100%|██████████| 30/30 [00:00<00:00, 950.78it/s]
100%|██████████| 154/154 [00:00<00:00, 1238.75it/s]
100%|██████████| 29/29 [00:00<00:00, 1179.04it/s]
100%|██████████| 6/6 [00:00<00:00, 759.31it/s]
100%|██████████| 5/5 [00:00<00:00, 821.96it/s]
100%|██████████| 286/286 [00:00<00:00, 1567.75it/s]
100%|██████████| 24/24 [00:00<00:00, 1115.33it/s]
100%|██████████| 24/24 [00:00<00:00, 1132.28it/s]
100%|██████████| 35/35 [00:00<00:00, 1135.12it/s]
100%|██████████| 56/56 [00:00<00:00, 1336.51it/s]
100%|██████████| 29/29 [00:00<00:00, 1227.25it/s]
100%|██████████| 116/116 [00:00<00:00, 1237.78it/s]
100%|██████████| 2/2 [00:00<00:00, 726.48it/s]
100%|██████████| 97/97 [00:00<00:00, 1327.59it/s]
100%|██████████| 73/73 [00:00<00:00, 1330.37it/s]
100%|██████████| 17/17 [00:00<00:00, 1066.22it/s]
100%|██████████| 31/31 [00:00<00:00, 1295.23it/s]


100%|██████████| 12/12 [00:00<00:00, 779.86it/s]
100%|██████████| 4/4 [00:00<00:00, 695.92it/s]
100%|██████████| 273/273 [00:00<00:00, 1502.64it/s]
100%|██████████| 48/48 [00:00<00:00, 1031.53it/s]
100%|██████████| 4/4 [00:00<00:00, 735.71it/s]
100%|██████████| 3/3 [00:00<00:00, 938.60it/s]
100%|██████████| 334/334 [00:00<00:00, 1093.51it/s]
100%|██████████| 13/13 [00:00<00:00, 1043.68it/s]
100%|██████████| 10/10 [00:00<00:00, 1051.65it/s]
100%|██████████| 91/91 [00:00<00:00, 1363.32it/s]
100%|██████████| 73/73 [00:00<00:00, 1412.10it/s]
100%|██████████| 133/133 [00:00<00:00, 1500.65it/s]
100%|██████████| 517/517 [00:00<00:00, 1516.41it/s]
100%|██████████| 18/18 [00:00<00:00, 1302.92it/s]
100%|██████████| 30/30 [00:00<00:00, 1339.71it/s]
100%|██████████| 154/154 [00:00<00:00, 1411.37it/s]
100%|██████████| 29/29 [00:00<00:00, 1334.56it/s]
100%|██████████| 6/6 [00:00<00:00, 729.66it/s]
100%|██████████| 5/5 [00:00<00:00, 1020.81it/s]
100%|██████████| 286/286 [00:00<00:00, 1551.55it/s]
100

100%|██████████| 61/61 [00:00<00:00, 1481.95it/s]
100%|██████████| 309/309 [00:00<00:00, 1612.66it/s]
100%|██████████| 476/476 [00:00<00:00, 1617.35it/s]
100%|██████████| 42/42 [00:00<00:00, 1226.85it/s]
100%|██████████| 110/110 [00:00<00:00, 1366.10it/s]
100%|██████████| 66/66 [00:00<00:00, 1446.19it/s]
100%|██████████| 14/14 [00:00<00:00, 1170.96it/s]
100%|██████████| 3/3 [00:00<00:00, 1016.31it/s]
100%|██████████| 1/1 [00:00<00:00, 365.36it/s]
100%|██████████| 296/296 [00:00<00:00, 1139.12it/s]
100%|██████████| 88/88 [00:00<00:00, 1322.44it/s]
100%|██████████| 15/15 [00:00<00:00, 1090.66it/s]
100%|██████████| 12/12 [00:00<00:00, 810.92it/s]
100%|██████████| 4/4 [00:00<00:00, 766.01it/s]
100%|██████████| 273/273 [00:00<00:00, 1515.57it/s]
100%|██████████| 48/48 [00:00<00:00, 1220.51it/s]
100%|██████████| 4/4 [00:00<00:00, 772.40it/s]
100%|██████████| 3/3 [00:00<00:00, 860.78it/s]
100%|██████████| 334/334 [00:00<00:00, 1096.06it/s]
100%|██████████| 13/13 [00:00<00:00, 1060.44it/s]
100

100%|██████████| 70/70 [00:00<00:00, 1330.92it/s]
100%|██████████| 56/56 [00:00<00:00, 1264.57it/s]
100%|██████████| 27/27 [00:00<00:00, 1081.74it/s]
100%|██████████| 267/267 [00:00<00:00, 1464.95it/s]
100%|██████████| 57/57 [00:00<00:00, 1264.03it/s]
100%|██████████| 242/242 [00:00<00:00, 1233.71it/s]
100%|██████████| 5/5 [00:00<00:00, 961.11it/s]
100%|██████████| 5/5 [00:00<00:00, 689.29it/s]
100%|██████████| 20/20 [00:00<00:00, 1083.59it/s]
100%|██████████| 113/113 [00:00<00:00, 1375.73it/s]
100%|██████████| 262/262 [00:00<00:00, 1164.98it/s]
100%|██████████| 2/2 [00:00<00:00, 668.84it/s]
100%|██████████| 61/61 [00:00<00:00, 1283.38it/s]
100%|██████████| 309/309 [00:00<00:00, 1620.21it/s]
100%|██████████| 476/476 [00:00<00:00, 1576.64it/s]
100%|██████████| 42/42 [00:00<00:00, 1201.82it/s]
100%|██████████| 110/110 [00:00<00:00, 1289.86it/s]
100%|██████████| 66/66 [00:00<00:00, 1328.43it/s]
100%|██████████| 14/14 [00:00<00:00, 1079.00it/s]
100%|██████████| 3/3 [00:00<00:00, 678.73it/s

ChunkedEncodingError: ('Connection broken: IncompleteRead(5642 bytes read, 4598 more expected)', IncompleteRead(5642 bytes read, 4598 more expected))