# 設定読み込み・クラス定義

In [2]:
from ruamel import yaml

# Load the settings. When config.yml does not exist yet it is created as a
# copy of config.yml.sample so the notebook can run on a fresh checkout.
# NOTE(review): os and shutil are not imported in the visible part of this
# cell; importing them here keeps the cell runnable on its own.
import os
import shutil

sampleConfigPath = "config.yml.sample"
configPath = "config.yml"
if not os.path.exists(configPath):
    shutil.copyfile(sampleConfigPath, configPath)
    print(f"{configPath} was created from {sampleConfigPath}")
# The config holds non-ASCII (Japanese) organisation names, so read it as
# UTF-8 explicitly instead of relying on the platform's default encoding.
with open(configPath, 'r', encoding="utf-8") as f:
    # The module-level yaml.safe_load() is deprecated in recent ruamel.yaml
    # releases; the YAML(typ="safe") loader is the supported equivalent.
    config = yaml.YAML(typ="safe").load(f)

# クラス定義
import mysql.connector as my

class DB:
    """Small wrapper around one MySQL connection for the ``contents`` table.

    Meant to be used as a context manager::

        with DB(dbConfig=config["db"], init=False) as db:
            db.insert(...)

    Leaving the ``with`` block commits the pending statements when the block
    finished normally and rolls them back when it raised, then closes the
    connection in either case.
    """

    def __init__(self, dbConfig, init=True):
        """Open the connection and, optionally, empty the ``contents`` table.

        Args:
            dbConfig: mapping providing the keys ``host``, ``name`` (database
                name), ``user`` and ``pass``.
            init: when true, every row of ``contents`` is deleted and the
                table's AUTO_INCREMENT counter is reset to 1.
        """
        self.con = my.connect(host=dbConfig["host"], database=dbConfig["name"],
                              user=dbConfig["user"], password=dbConfig["pass"])
        self.cur = self.con.cursor()
        if init:
            self.cur.execute("delete from contents;")
            self.cur.execute("ALTER TABLE contents AUTO_INCREMENT=1;")

    def __enter__(self):
        """Return this object so it can be bound by ``with ... as db``."""
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        """Finish the transaction and release the connection.

        Commits only when the ``with`` body completed without an exception;
        otherwise rolls back so a half-finished import (for example a bulk
        delete that was not followed by the re-inserts) is not persisted.
        Returns ``None``, so an exception from the body keeps propagating.
        """
        try:
            self.cur.close()
            if exception_type is None:
                self.con.commit()
            else:
                self.con.rollback()
        finally:
            # Always release the connection, even if closing the cursor or
            # finishing the transaction failed.
            self.con.close()

    def insert(self, org, title, description, w, h, indexTexts, type, url, embeddable, filename,
               tag, thumbUrl, viewCount, likeCount, dislikeCount, favoriteCount, commentCount,
               publishedAt):
        """Insert one row into ``contents``.

        The arguments are bound, in order, to the columns org, title,
        description, width, height, indexText, type, url, embeddable,
        filename, tag, thumbUrl, viewCount, likeCount, dislikeCount,
        favoriteCount, commentCount and publishedAt. Values are passed as
        query parameters, never interpolated into the SQL text.
        """
        self.cur.execute(
            "insert into contents (org, title, description, width, height, indexText, type, url, "
            "embeddable, filename, tag, thumbUrl, "
            "viewCount, likeCount, dislikeCount, favoriteCount, commentCount, "
            "publishedAt) "
            "values (%s, %s, %s, %s, %s, %s, %s, %s, "
            "%s, %s, %s, %s, "
            "%s, %s, %s, %s, %s, "
            "%s)",
            (org, title, description, w, h, indexTexts, type, url,
             embeddable, filename, tag, thumbUrl,
             viewCount, likeCount, dislikeCount, favoriteCount, commentCount,
             publishedAt))

    def deleteByType(self, type):
        """Delete every row of ``contents`` whose ``type`` column equals *type*."""
        self.cur.execute(
            "delete from contents where type=%s",
            (type,))

    def deleteByTypeAndOrg(self, type, org):
        """Delete the rows of ``contents`` matching both *type* and *org*."""
        self.cur.execute(
            "delete from contents where type=%s and org=%s",
            (type, org))

import re

def stripSpaces(str):
    """Normalise the whitespace of *str*.

    Every run of ASCII spaces, ideographic (full-width, U+3000) spaces and
    newlines - in any mixture - is replaced by one ASCII space, and leading
    and trailing whitespace is removed. Treating the three characters as a
    single class keeps blank lines, or a newline next to spaces, from leaving
    several consecutive spaces in the result.
    """
    return re.sub(r'[ \u3000\n]+', ' ', str).strip()


class IEICEWebinarSummary:
    """Fields read from the summary table of an IEICE webinar page.

    The table is consumed as a flat sequence of ``<tr>`` rows: the first row
    holds the category, and after it a row whose text is one of the known
    labels is followed by the row holding that label's value.

    Attributes (each stays ``None`` when its label is not present):
        category, eventName, publishedAt, title, author, abstract, keywords
    """

    # Label text as it appears in the table -> attribute that receives the
    # text of the following row. Both date labels feed ``publishedAt``.
    _LABEL_TO_ATTR = {
        "イベント名": "eventName",
        "発表年月日": "publishedAt",
        "受賞年月日": "publishedAt",
        "タイトル": "title",
        "著者": "author",
        "抄録": "abstract",
        "キーワード": "keywords",
    }

    def __init__(self, summaryDiv):
        """Parse *summaryDiv*.

        Args:
            summaryDiv: element exposing ``find_all("tr")`` whose results
                expose ``get_text()`` (a BeautifulSoup tag in this notebook).
        """
        rows = summaryDiv.find_all("tr")
        # A table without any row has no category; leave it unset instead of
        # failing with an IndexError.
        self.category = stripSpaces(rows[0].get_text()) if rows else None

        self.eventName = None
        self.publishedAt = None
        self.title = None
        self.author = None
        self.abstract = None
        self.keywords = None

        # Look at every row after the category together with its successor;
        # the last row has no successor and therefore can never be a label.
        for labelRow, valueRow in zip(rows[1:], rows[2:]):
            attr = self._LABEL_TO_ATTR.get(stripSpaces(labelRow.get_text()))
            if attr is not None:
                setattr(self, attr, stripSpaces(valueRow.get_text()))


# 電子情報通信学会ウェビナーサイトをクロールする

In [None]:
# Crawl https://webinar.ieice.org/ recursively (-r) without progress output
# (-q), rejecting (-R) files whose names end in the listed suffixes (documents,
# archives, images, stylesheets, scripts, video). The import cell below reads
# the result from ./webinar.ieice.org/.
!wget -q \
  -R doc,zip,jpg,gif,ico,png,JPG,GIF,PNG,jpeg,JPEG,pdf,PDF,svg,SVG,css,CSS,js,JS,mp4 \
  -r https://webinar.ieice.org/
# Print the number of regular files found by a recursive listing of the
# current directory.
# NOTE(review): this also counts files that were already here before the crawl.
!echo `ls -laR | grep -c '^-'` files downloaded

# 電子情報通信学会ウェビナーサイトからクロールしたデータをDBに取り込む

In [100]:
from bs4 import BeautifulSoup
from glob import iglob
import os
import re
import mysql.connector as my

# Load the title keywords of webinar pages that must not be imported: one
# keyword per line, lines starting with "#" are comments.
# Blank lines are dropped as well - an empty keyword is a substring of every
# title and would make the import below skip every webinar page.
# The keywords are Japanese text, so the file is read as UTF-8 explicitly.
with open('duplicatedTitleOnIEICEWebinar.txt', 'r', encoding='utf-8') as f:
    ignoreTitleKeywords = {
        line.strip()
        for line in f
        if line.strip() and not line.startswith("#")
    }
print(f"ignore: {str(ignoreTitleKeywords)} on IEICE webinar.")


# Import the crawled IEICE webinar pages into the contents table.
with DB(dbConfig=config["db"], init=False) as db:
    # Start from a clean slate for this content type; everything is re-inserted below.
    db.deleteByType("ieicewebinar")
    c = 0
    for filename in iglob("./webinar.ieice.org/**/*", recursive=True):
        if os.path.isdir(filename):
            continue
        # Parse inside a with-block so the file handle is closed right away
        # instead of staying open for every crawled page.
        with open(filename, encoding="utf8", errors='ignore') as page:
            soup = BeautifulSoup(page, 'html.parser')
        # Only pages that embed a player (an <iframe>) are webinar entries.
        iframe = soup.iframe
        if not iframe:
            continue
        title = soup.title.get_text()
        # Skip webinar pages whose title contains one of the ignore keywords.
        if "IEICE Ondemand Webinar - " in title and any(
                kw in title for kw in ignoreTitleKeywords):
            continue
        summary = soup.find(id='summary')
        if not summary:
            continue
        w = iframe["width"]
        h = iframe["height"]
        # No thumbnail is derived from the embedded player URL
        # (e.g. https://player.vimeo.com/video/489189921); a placeholder is stored.
        thumb = "nothumb.png"
        s = IEICEWebinarSummary(summary)
        desc = stripSpaces(summary.get_text())
        indexTexts = stripSpaces(title) + ' ' + desc
        # filename starts with "./webinar.ieice.org/", so dropping the leading
        # "." and prefixing "https:/" rebuilds the page's original URL.
        db.insert('電子情報通信学会', title, desc, w, h, indexTexts,
                  "ieicewebinar", "https:/" + filename[1:], False,
                  filename, str(iframe), thumb, 0, 0, 0, 0, 0,
                  s.publishedAt)
        c += 1
    print(f"{c} entries inserted.")


ignore: {'情報ネットワークの周辺で画像と共に半世紀', '数理工学から見たICT', 'BigData, Social Miningを通した新たなサービスモデルの可能性', 'スーパーコンピュータ「富岳」の開発とコデザイン', 'IoT技術を活用した製造業におけるDXの取り組み', '移動無線通信技術の発展と将来展望', '会長就任にあたって', '情報の時代を勝手に俯瞰する', 'データの時代', '電波科学の100年と持続可能な発展への取り組みの道すがら、想うこと', '新・半導体戦略', 'EDFA 長い冒険の旅', '社会情報基盤を構築するための工学とは？', '移動体通信の未来', '本当の感覚通信を求めて', 'Smartcityによる自立分散社会の実現へ'} on IEICE webinar.
3 entries inserted.


# YouTubeから動画リストを取得する

In [None]:
from apiclient.discovery import build
from datetime import datetime

# Organisations listed here are configured as channels but must not be imported.
ignoreOrgs = set(config["youtube"]["ignoreOrgs"])

with DB(dbConfig=config["db"], init=False) as db:
    youtube = build('youtube', 'v3', developerKey=config["google"]["apiKey"])
    for channelId, orgName in config["youtube"]["channels"].items():
        if orgName in ignoreOrgs:
            continue
        # Replace this organisation's previously imported videos.
        db.deleteByTypeAndOrg("youtube", orgName)
        c = 0
        channel = youtube.channels().list(part="contentDetails", id=channelId).execute()
        # Every channel has an "uploads" playlist that lists all of its videos.
        uploadsPlaylistId = channel["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
        nextPageToken = None
        while True:
            # Ask for the largest page the API allows (50) to keep the number
            # of requests, and therefore the quota usage, low.
            videos = youtube.playlistItems().list(
                part="snippet", playlistId=uploadsPlaylistId,
                maxResults=50, pageToken=nextPageToken).execute()
            nextPageToken = videos.get("nextPageToken", None)

            # Fetch the statistics of the whole page with one request;
            # videos.list accepts a comma-separated list of video ids.
            videoIds = [item["snippet"]["resourceId"]["videoId"] for item in videos["items"]]
            statsById = {}
            if videoIds:
                statsResponse = youtube.videos().list(
                    part="statistics", id=",".join(videoIds)).execute()
                statsById = {video["id"]: video.get("statistics", {})
                             for video in statsResponse.get("items", [])}

            for item in videos["items"]:
                snippet = item["snippet"]
                videoId = snippet["resourceId"]["videoId"]
                stats = statsById.get(videoId)
                if stats is None:
                    # The playlist can still list private or deleted videos for
                    # which videos.list returns nothing; they cannot be shown, so skip them.
                    continue
                title = stripSpaces(snippet["title"])
                desc = stripSpaces(snippet["description"])
                url = f"https://www.youtube.com/embed/{videoId}"
                thumbUrl = f"https://img.youtube.com/vi/{videoId}/mqdefault.jpg"
                # A counter missing from the response is stored as -1.
                # The trailing "Z" is rewritten to "+00:00" because
                # datetime.fromisoformat() rejects "Z" before Python 3.11.
                db.insert(orgName, title, desc, 120, 90, title + " " + desc,
                          "youtube", url, True, "", "", thumbUrl,
                          stats.get("viewCount", -1),
                          stats.get("likeCount", -1),
                          stats.get("dislikeCount", -1),
                          stats.get("favoriteCount", -1),
                          stats.get("commentCount", -1),
                          datetime.fromisoformat(snippet["publishedAt"].replace('Z', '+00:00')))
                c += 1
            if not nextPageToken:
                break
        print(f"{c} movies found on {orgName} channel.")


# configに記述されているYouTubeチャンネルの一覧を出力する

In [4]:
# Print one bullet per configured channel: the organisation name followed by
# the URL of its YouTube channel page.
channelMap = config["youtube"]["channels"]
for channelId in channelMap:
    orgName = channelMap[channelId]
    print(f"* {orgName} https://youtube.com/channel/{channelId}")

* 電子情報通信学会 https://youtube.com/channel/UCDMkjDeK_8WafkMvIfbtTKQ
* 情報処理学会 https://youtube.com/channel/UC8uVGnC5MTi4KnbJGNFUXog
* 情報処理学会インタラクションシンポジウム https://youtube.com/channel/UCVzAsIAtO8aVCV43QNF48XA
* sigec https://youtube.com/channel/UCg6i7YNuslvlN0_RlZQ-ggQ
* Dr. AyumiとDr. MoeのUBIチャンネル https://youtube.com/channel/UCtqftmHFhN6DsHGjfVqneAA
* DBSJ最強データベース講義 https://youtube.com/channel/UCaOkRhbjsqviiDQdKn-p0HA
* signl ipsj https://youtube.com/channel/UClzd97uGh4I_nwRM4hbgZqw
* AAC IPSJ https://youtube.com/channel/UCXWW29NBsGlvgsj9bNo_CoA
