# Data scraping & dataset creation

CS685 Spring 2022 <br />
Feb. 19, 2022<br />
Hongyu Tu <br />

In [1]:
import re
import time
import pickle
import asyncio
import datetime
import requests
import numpy as np
import pandas as pd
import nest_asyncio
from utils import *
from tqdm import tqdm
import os
from os import listdir
from collections import Counter
from bilibili_api import video, sync

# Patch asyncio so event loops can be nested. Jupyter already runs its own
# loop, so this is presumably needed for the bilibili_api / asyncio calls made
# by the scraping helpers -- TODO confirm against utils.
nest_asyncio.apply()

In [2]:
# Build the category lookup once and keep its keys (presumably the category
# ids -- verify against utils.init_category_dic) as a NumPy array to iterate.
cat_dic = init_category_dic()
tid_lst = np.array([*cat_dic.keys()])
status_msg = 'Category list initialized successfully\nWe have {} categories'
print(status_msg.format(len(tid_lst)))

Category list initialized successfully
We have 126 categories


In [3]:
# Scrape every category and persist the two lists returned by
# process_category as one pickle per category:
#   pickle/danmu/part<category id>.pkl and pickle/comment/part<category id>.pkl
# Create both output folders up front so a missing directory fails (or is
# fixed) before any scraping time is spent, not after the first category.
for name in ('danmu', 'comment'):
    os.makedirs('pickle/{}'.format(name), exist_ok=True)

for curr_id in tqdm(tid_lst):
    danmu_lst, comment_lst = process_category(curr_id)
    # Pair each output folder name with the list that belongs in it.
    for name, records in (('danmu', danmu_lst), ('comment', comment_lst)):
        fname = 'pickle/{}/part{}.pkl'.format(name, curr_id)
        # The with-block closes the file on exit; no explicit close() needed.
        with open(fname, 'wb') as f:
            pickle.dump(records, f)

100%|██████████████████████████████████████████████████████████████████████████████| 126/126 [2:34:30<00:00, 73.58s/it]


In [3]:
# Merge the per-category pickles back into two flat lists:
# data[0] collects everything under pickle/danmu, data[1] under pickle/comment.
# NOTE(security): pickle.load can execute arbitrary code -- only load files
# that were written by this notebook's own scraping step.
data = [[], []]
for idx, name in enumerate(['danmu', 'comment']):
    folder = 'pickle/{}'.format(name)
    # listdir returns entries in arbitrary order; sort so the merged list
    # (and anything derived from it) is reproducible across runs/machines.
    for pkl_name in sorted(listdir(folder)):
        # Match the real extension instead of any name containing "pkl".
        if not pkl_name.endswith('.pkl'):
            continue
        # Context manager closes the handle; extend() appends in place
        # instead of re-copying the accumulated list for every file.
        with open('{}/{}'.format(folder, pkl_name), 'rb') as f:
            data[idx].extend(pickle.load(f))

In [4]:
# Turn the merged danmu and comment lists into their tables via list_to_csv,
# in the same order as before: danmu first (df), then comment (df2).
df, df2 = (
    list_to_csv(records, label)
    for records, label in zip(data, ('danmu', 'comment'))
)

In [5]:
# Display the last 20 rows of the danmu table.
df.tail(20)

Unnamed: 0,danmu,Frequency,BVID,Source Video Title,Category ID,Channel ID,Source Video View Count
69352,您,670,BV1gL4y1g7EP,Phigros单指收歌世界第一DeadSoulinlv14单指AllPerfect,136,DL之星,194299
874590,好耶,673,BV1BL4y137Mo,咩栗x呜米哈索尔原创曲,28,呜米,335708
640083,落魄了,695,BV1Fa411C7FK,全上海最便宜58元酱大骨自助直呼过瘾,212,哇塞几张,714439
73428,新年快乐,705,BV1wb4y1t7Yo,时代少年团光环中的少年辞旧,137,TF家族,3586061
589895,助力每一个梦想,717,BV1Dm4y197on,吃鱼不见鱼如何靠整活把一条鱼卖到1600,212,哇塞几张,664217
287651,好评如潮,775,BV1vL411K7UN,STN快报第六季21为了拯救工作室白金竟说自己可以出来卖,17,STN工作室,1431281
231629,注入灵魂,798,BV1mu411X7TB,才浅手工猛男还原魔法少女水晶魔法棒什么是魔法看完你就知道了,161,才疏学浅的才浅,767224
170892,念头通达,810,BV1r44y1J7JE,独家凡人修仙传之魔道争锋篇第17集总第38集,153,哔哩哔哩国创,3185205
50053,新婚快乐,1079,BV1Ui4y1173z,结婚就该这么来上才艺吧哈哈哈哈舞台差点干塌,198,OB山下一散人,3177494
350126,狼人归来,1352,BV14S4y1C7im,猎人情报爷青回时空猎人3狼叔归来霸气裂爪绝杀,172,时空猎人3,789324


In [6]:
# Display the last 20 rows of the comment table.
df2.tail(20)

Unnamed: 0,comment,Frequency,BVID,Source Video Title,Category ID,Channel ID,Source Video View Count
137397,中中中,66,BV1XR4y1L7mS,抽奖预告史无前例总价值3万ROG纯白全家桶顶级游戏主机仅限B站,95,动力评测,413203
88282,已三连求资料,67,BV13Z4y1k7Ci,清华教授68小时讲完的Java教程整整300集现在拿出来分享给大家从入门到精通手把手教学学完...,231,神兽保护站,25723
87169,关注翻滚吧阿辉点赞助力每一个梦想,73,BV11S4y1r7Uk,疯狂的iPhone设置新iPhone13一定要开启的13大优先设置你都开启了吗iPhone使...,230,翻滚吧阿辉,66190
47358,已经三连求资料,73,BV1US4y1F7R9,申论大作文不会写试试背几篇这个轻松上75,208,-迪宝儿,154230
47392,领到了,74,BV1US4y1F7R9,申论大作文不会写试试背几篇这个轻松上75,208,-迪宝儿,154230
137393,1,103,BV1XR4y1L7mS,抽奖预告史无前例总价值3万ROG纯白全家桶顶级游戏主机仅限B站,95,动力评测,413203
47327,已三连关注求资料,111,BV1US4y1F7R9,申论大作文不会写试试背几篇这个轻松上75,208,-迪宝儿,154230
47405,已三连求资料谢谢,131,BV1US4y1F7R9,申论大作文不会写试试背几篇这个轻松上75,208,-迪宝儿,154230
47369,已收到,141,BV1US4y1F7R9,申论大作文不会写试试背几篇这个轻松上75,208,-迪宝儿,154230
87424,已三连求分享,155,BV1cm4y1o75z,华为大佬72小时讲完的python2022最新版学完即可就业拿走不谢学不会我退出IT界,231,图灵学院教程,102749
