# 定数・クラス定義

In [23]:
# Constants
# The connection settings and the API key are read from environment variables
# of the same name so that real secrets never have to be saved in the notebook;
# the literals below are only the fallbacks used when a variable is not set.
import os

DB_HOST = os.environ.get("DB_HOST", "db")
DB_NAME = os.environ.get("DB_NAME", "viblioteca")
DB_USER = os.environ.get("DB_USER", "viblioteca")
DB_PASS = os.environ.get("DB_PASS", "viblioteca")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "YOUR_API_KEY_FOR_YOUTUBE")
# Organisation name (stored in the `org` column) -> YouTube channel ID.
YOUTUBE_CHANNELS = {
    "電子情報通信学会": "UCDMkjDeK_8WafkMvIfbtTKQ",
    "情報処理学会": "UC8uVGnC5MTi4KnbJGNFUXog"}

# クラス定義
import mysql.connector as my

class DB:
    """Context manager around one MySQL connection and cursor for the `contents` table.

    Typical use is ``with DB() as db: db.insert(...)``. On leaving the ``with``
    block the work is committed when the body finished normally and rolled back
    when it raised; the cursor and connection are closed in both cases.
    """

    def __init__(self, init=True):
        """Connect using DB_HOST / DB_NAME / DB_USER / DB_PASS and open a cursor.

        Args:
            init: when true (the default), delete every row of `contents` and
                reset its AUTO_INCREMENT counter to 1 before any insert.
        """
        self.con = my.connect(host=DB_HOST, database=DB_NAME, user=DB_USER, password=DB_PASS)
        try:
            self.cur = self.con.cursor()
            if init:
                self.cur.execute("delete from contents;")
                # NOTE(review): MySQL DDL such as ALTER TABLE normally commits
                # implicitly, so this reset is probably not undone by a later
                # rollback -- confirm against the server in use.
                self.cur.execute("ALTER TABLE contents AUTO_INCREMENT=1;")
        except Exception:
            # __exit__ never runs when __init__ fails, so release the
            # connection here instead of leaking it.
            self.con.close()
            raise

    def __enter__(self):
        """Return this object so it can be bound by ``with DB() as db``."""
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        """Finish the transaction and release the cursor and connection.

        Commits when the ``with`` body completed without an exception and rolls
        back otherwise, so a failed run does not leave a half-imported table.
        Returns None, which lets any exception from the body propagate.
        """
        try:
            self.cur.close()
            if exception_type is None:
                self.con.commit()
            else:
                self.con.rollback()
        finally:
            # Always close the connection, even if close/commit/rollback raised.
            self.con.close()

    def insert(self, org, title, description, w, h, indexTexts, type, url, embeddable, filename,
               tag, thumbUrl, viewCount, likeCount, dislikeCount, favoriteCount, commentCount):
        """Insert one row into `contents` using a parameterised statement.

        The arguments map positionally onto the columns org, title,
        description, width, height, indexText, type, url, embeddable, filename,
        tag, thumbUrl, viewCount, likeCount, dislikeCount, favoriteCount and
        commentCount. The row is only persisted once the connection commits.
        (`type` shadows the builtin but is kept for caller compatibility.)
        """
        self.cur.execute(
            "insert into contents (org, title, description, width, height, indexText, type, url, "
            "embeddable, filename, tag, thumbUrl, viewCount, likeCount, dislikeCount, favoriteCount, commentCount) "
            "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
            (org, title, description, w, h, indexTexts, type, url, embeddable, filename, tag, thumbUrl,
             viewCount, likeCount, dislikeCount, favoriteCount, commentCount))

import re

def stripSpaces(str):
    """Collapse whitespace runs in `str` to single spaces and trim both ends.

    Any run made of ASCII spaces, ideographic (full-width, U+3000) spaces and
    newlines becomes exactly one space, so text such as "a\\n\\nb" or "a \\n b"
    yields "a b" instead of keeping several consecutive spaces. Other
    characters (including tabs) are left untouched.

    Args:
        str: the text to normalise (the name shadows the builtin but is kept
            for caller compatibility).

    Returns:
        The normalised text.
    """
    return re.sub(r'[ \u3000\n]+', ' ', str).strip()

# 電子情報通信学会ウェビナーサイトをクロールする

In [None]:
# Recursively (-r) crawl https://webinar.ieice.org/ with wget. -q silences
# wget's output and -R rejects files with the listed suffixes (documents,
# archives, images, stylesheets, scripts, video) so they are not kept.
!wget -q \
  -R doc,zip,jpg,gif,ico,png,JPG,GIF,PNG,jpeg,JPEG,pdf,PDF,svg,SVG,css,CSS,js,JS,mp4 \
  -r https://webinar.ieice.org/
# Report a file count: grep -c '^-' counts the `ls -laR` lines that start with
# '-', i.e. regular files anywhere under the current directory. This includes
# files that were already there, not only the ones just downloaded.
!echo `ls -laR | grep -c '^-'` files downloaded

# 電子情報通信学会ウェビナーサイトからクロールしたデータをDBに取り込む

In [25]:
from bs4 import BeautifulSoup
from glob import iglob
import os
import re
import mysql.connector as my

# Import the crawled IEICE webinar pages into the contents table.
# DB() is created with its default init=True, which empties the table first.
with DB() as db:
    c = 0  # number of rows inserted
    for filename in iglob("./webinar.ieice.org/**/*", recursive=True):
        if os.path.isdir(filename):
            continue
        # Close every file as soon as it is parsed; leaving the handles open
        # leaks one descriptor per crawled file.
        with open(filename, encoding="utf8", errors='ignore') as page:
            soup = BeautifulSoup(page, 'html.parser')
        # Only pages that embed a player (<iframe>) and carry an element with
        # id="summary" are registered; everything else is skipped.
        iframe = soup.iframe
        if not iframe:
            continue
        summary = soup.find(id='summary')
        if not summary:
            continue
        # A page without <title> gets an empty title instead of raising.
        title = soup.title.get_text() if soup.title else ""
        w = iframe["width"]
        h = iframe["height"]
        # No thumbnail is fetched for these pages. The iframe src looks like
        # "https://player.vimeo.com/video/489189921"; a thumbnail URL could be
        # derived from its last path segment, e.g.
        # f"https://i.vimeocdn.com/video/{video_id}.webp?mw=900&mh=507"
        thumb = "nothumb.png"
        desc = stripSpaces(summary.get_text())
        indexTexts = stripSpaces(title) + ' ' + desc
        # filename starts with "./webinar.ieice.org/...", so dropping the
        # leading "." and prefixing "https:/" rebuilds the page's original URL.
        db.insert('電子情報通信学会', title, desc, w, h, indexTexts,
                  "custom", "https:/" + filename[1:], False,
                  filename, str(iframe), thumb, 0, 0, 0, 0, 0)
        c += 1
    print(f"{c} entries inserted.")


19 entries inserted.


# YouTubeから電子情報通信学会と情報処理学会の動画リストを取得する

In [26]:
from apiclient.discovery import build


# Register every uploaded video of the channels in YOUTUBE_CHANNELS.
# init=False keeps the rows that are already in the contents table.
with DB(init=False) as db:
    youtube = build('youtube', 'v3', developerKey=GOOGLE_API_KEY)
    for org, channelId in YOUTUBE_CHANNELS.items():
        c = 0  # number of videos inserted for this channel
        channel = youtube.channels().list(part="contentDetails", id=channelId).execute()
        # The channel's "uploads" playlist lists all of its videos.
        pageId = channel["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
        nextPageToken = None
        while True:
            # Ask for the largest page size (50) to keep the number of API calls low.
            videos = youtube.playlistItems().list(part="snippet",
                playlistId=pageId, maxResults=50, pageToken=nextPageToken).execute()
            nextPageToken = videos.get("nextPageToken", None)

            # Fetch the statistics of the whole page with one videos.list call
            # (it accepts a comma-separated id list) instead of one call per video.
            videoIds = [item["snippet"]["resourceId"]["videoId"] for item in videos["items"]]
            statsById = {}
            if videoIds:
                details = youtube.videos().list(part="statistics", id=",".join(videoIds)).execute()
                statsById = {video["id"]: video.get("statistics", {})
                             for video in details.get("items", [])}

            for item in videos["items"]:
                videoId = item["snippet"]["resourceId"]["videoId"]
                title = stripSpaces(item["snippet"]["title"])
                desc = stripSpaces(item["snippet"]["description"])
                url = f"https://www.youtube.com/embed/{videoId}"
                thumbUrl = f"https://img.youtube.com/vi/{videoId}/mqdefault.jpg"
                # Individual counters (e.g. dislikeCount, or likes/comments when
                # hidden or disabled) and even the whole entry (videos that
                # return no statistics) may be missing; fall back to 0 instead
                # of raising KeyError/IndexError in the middle of the import.
                stats = statsById.get(videoId, {})
                db.insert(org, title, desc, 120, 90, title + " " + desc,
                          "youtube", url, True, "", "", thumbUrl,
                          stats.get("viewCount", 0), stats.get("likeCount", 0),
                          stats.get("dislikeCount", 0), stats.get("favoriteCount", 0),
                          stats.get("commentCount", 0))
                c += 1
            if not nextPageToken:
                break
        print(f"{c} movies found on {org} channel.")


39 movies found on 電子情報通信学会 channel.
76 movies found on 情報処理学会 channel.
