* [pytube/pytube: A lightweight, dependency-free Python library (and command-line utility) for downloading YouTube Videos.](https://github.com/pytube/pytube)
* [Using Playlists — pytube documentation](https://pytube.io/en/latest/user/playlist.html)

In [None]:
import sqlite3
from pytube import Playlist, YouTube
import time
import logging

# Configure logging: every record carries a timestamp, level, and message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Open the SQLite database file and obtain a cursor on it.
conn = sqlite3.connect('youtube_videos.db')
cur = conn.cursor()

# Table definitions. Each statement uses IF NOT EXISTS, so executing them
# against a database that already has the tables leaves it untouched.
schema_statements = (
    # One row per playlist (year, title, URL).
    '''
    CREATE TABLE IF NOT EXISTS playlist (
        year INTEGER,
        title TEXT,
        url TEXT
    )
''',
    # One row per video; detailed_info_updated defaults to 0 and is set to 1
    # by update_video_info() once the metadata columns have been filled in.
    '''
    CREATE TABLE IF NOT EXISTS video (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        playlist_year INTEGER,
        playlist_title TEXT,
        video_url TEXT,
        title TEXT,
        length INTEGER,
        description TEXT,
        views INTEGER,
        rating REAL,
        publish_date TEXT,
        thumbnail_url TEXT,
        author TEXT,
        keywords TEXT,
        detailed_info_updated INTEGER DEFAULT 0
    )
''',
)
for statement in schema_statements:
    cur.execute(statement)

conn.commit()

# Collect video URLs from a playlist
def get_playlist_videos(url, max_retries=3, retry_delay=5):
    """Return one ``{"title", "url"}`` dict per video in a YouTube playlist.

    Args:
        url: Playlist URL. A falsy value (``""`` or ``None``) yields ``[]``
            without creating a ``Playlist``.
        max_retries: Maximum number of fetch attempts.
        retry_delay: Seconds to wait between a failed attempt and the next.

    Returns:
        A list of dicts holding the playlist title under ``"title"`` and a
        video URL under ``"url"``. Returns ``[]`` when the URL is empty, the
        playlist has no videos, or every attempt failed (failures are logged,
        never raised).
    """
    if not url:
        return []

    for attempt in range(1, max_retries + 1):
        try:
            playlist = Playlist(url)
            # Materialise the URLs inside this attempt so that a failure
            # partway through cannot leak partial results into a retry
            # (which would produce duplicate entries).
            video_urls = list(playlist.video_urls)
            if not video_urls:
                return []
            # Read the title once instead of once per video.
            title = playlist.title
            return [{"title": title, "url": video_url} for video_url in video_urls]
        except Exception as e:
            logging.warning(f"Error fetching playlist {url}: {str(e)}")
            # Only wait when another attempt will actually follow.
            if attempt < max_retries:
                time.sleep(retry_delay)

    logging.error(f"Failed to fetch playlist {url} after {max_retries} attempts")
    return []

# Save playlists and their videos
def save_playlist_and_videos(pyconkr_playlist):
    """Fetch each playlist and store it, with its video URLs, in the database.

    Args:
        pyconkr_playlist: Mapping of year -> playlist URL.

    For every year the playlist is fetched with ``get_playlist_videos``; years
    that yield no videos are logged and skipped. The playlist row and all of
    its video rows are written in one transaction on the module-level
    ``conn``/``cur``: either everything for that year is committed or, on
    error, everything for that year is rolled back. Errors are logged and the
    loop moves on to the next year; nothing is raised to the caller.
    """
    for year, url in pyconkr_playlist.items():
        logging.info(f"Collecting videos for {year}...")
        videos = get_playlist_videos(url)

        # Nothing to store for this year.
        if not videos:
            logging.warning(f"No videos found for the playlist of {year}. Skipping.")
            continue

        try:
            playlist_title = videos[0]["title"]
            video_rows = [(year, playlist_title, video["url"]) for video in videos]

            # "with conn" commits on success and rolls back on an exception,
            # so a failure halfway through cannot leave partial rows pending
            # to be committed together with a later year.
            with conn:
                cur.execute("INSERT INTO playlist (year, title, url) VALUES (?, ?, ?)", (year, playlist_title, url))
                cur.executemany('''
                    INSERT INTO video (playlist_year, playlist_title, video_url)
                    VALUES (?, ?, ?)
                ''', video_rows)
        except Exception as e:
            logging.error(f"Error occurred while saving videos for {year}: {str(e)}")


# Update detailed video information
def update_video_info():
    """Fill in detailed metadata for every video row not yet processed.

    Reads the ``id`` and ``video_url`` of all ``video`` rows whose
    ``detailed_info_updated`` flag is 0, looks each URL up with
    ``get_video_info`` and, when that returns data, writes the fields back,
    sets the flag to 1 and commits immediately. Rows whose lookup returns
    nothing are left unchanged. Uses the module-level ``conn``/``cur``.
    """
    update_sql = '''
                UPDATE video
                SET title = ?, length = ?, description = ?, views = ?, rating = ?, publish_date = ?, 
                    thumbnail_url = ?, author = ?, keywords = ?, detailed_info_updated = 1
                WHERE id = ?
            '''

    cur.execute("SELECT id, video_url FROM video WHERE detailed_info_updated = 0")
    pending = cur.fetchall()

    for row_id, video_url in pending:
        logging.info(f"Processing: {video_url}")
        details = get_video_info(video_url)

        if details:
            # Keywords are stored as a single comma-separated string.
            params = (
                details["title"],
                details["length"],
                details["description"],
                details["views"],
                details["rating"],
                details["publish_date"],
                details["thumbnail_url"],
                details["author"],
                ",".join(details["keywords"]),
                row_id,
            )
            cur.execute(update_sql, params)
            # Commit per row so completed rows are saved as the loop proceeds.
            conn.commit()

        # Pause between requests to avoid putting load on YouTube's servers.
        time.sleep(1)

# Fetch video information
def get_video_info(url):
    """Look up the metadata of a single YouTube video.

    Args:
        url: URL of the video.

    Returns:
        A dict with the keys ``title``, ``length``, ``description``,
        ``views``, ``rating``, ``publish_date``, ``thumbnail_url``,
        ``author`` and ``keywords``. ``publish_date`` is formatted as
        ``YYYY-MM-DD``, or is ``None`` when the video has no publish date.
        Returns ``None`` if any step raises; the error is logged.
    """
    try:
        yt = YouTube(url)

        # Attributes copied over unchanged, in the order they are stored.
        video_info = {
            name: getattr(yt, name)
            for name in ("title", "length", "description", "views", "rating")
        }

        published = yt.publish_date
        if published:
            video_info["publish_date"] = published.strftime('%Y-%m-%d')
        else:
            video_info["publish_date"] = None

        for name in ("thumbnail_url", "author", "keywords"):
            video_info[name] = getattr(yt, name)

        return video_info
    except Exception as e:
        logging.error(f"Error processing {url}: {str(e)}")
        return None

# PyCon Korea playlist URLs, keyed by conference year.
# The 2021 entry is an empty string: get_playlist_videos() returns [] for an
# empty URL, so save_playlist_and_videos() logs a warning and skips that year.
pyconkr_playlist = {
    2023: "https://www.youtube.com/watch?v=dJlX0i-q4ck&list=PLZPhyNeJvHRllQiXsJAryqWmqWrwFxY8I",
    2022: "https://www.youtube.com/watch?v=5NjMaxYQuIc&list=PLZPhyNeJvHRnlqQwMj-WNlrsac7yTiVhk",
    2021: "",
    2020: "https://www.youtube.com/watch?v=xs66jubM88k&list=PLZPhyNeJvHRk9wIL9rZekFLIfT3aVcHT7",
    2019: "https://www.youtube.com/watch?v=b2BFxbkXkKY&list=PLZPhyNeJvHRlECdmkJ7M8konKB0NhBfve",
    2018: "https://www.youtube.com/watch?v=9lNN9wMD-60&list=PLZPhyNeJvHRmnMr5yucZ9Eu-yVhjRRsOM",
    2017: "https://www.youtube.com/watch?v=MmIxahj9vnY&list=PLZPhyNeJvHRmvCnWMBZJiFXu9kDUcn5FG",
    2016: "https://www.youtube.com/watch?v=UWDRX4z4-k0&list=PLZPhyNeJvHRnSJ2sAnqCGFnVRKo98EgCp",
    2015: "https://www.youtube.com/watch?v=0abmVNlkxRo&list=PLZPhyNeJvHRnoO_m1hH78j0JRj8LgUICN",
    2014: "https://www.youtube.com/watch?v=JGkfzWhVvKk&list=PLZPhyNeJvHRnchPDpnFV1uUmLhR_JG3A8",
}

# Run the pipeline. Both steps are currently commented out, so this cell only
# defines the functions and data above; uncomment the calls to execute them.
# save_playlist_and_videos(pyconkr_playlist)
# update_video_info()

# Close the database connection and log completion
# (the message reads "The program has finished.").
conn.close()
logging.info("프로그램이 완료되었습니다.")


In [None]:
# Follow-up collection run: fetch details for the videos that are still
# flagged as not updated.

# Configure logging.
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers, so if the earlier basicConfig() call ran in this session the
# format below will not take effect.
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(message)s')

# Open the SQLite database. update_video_info() works on the module-level
# conn/cur, so rebinding them here points it at this database file.
conn = sqlite3.connect('db_pycon_youtube_videos.db')
cur = conn.cursor()

try:
    update_video_info()
finally:
    # Close the connection even when the update raises (for example when the
    # database has no "video" table), so it is not leaked.
    conn.close()

In [None]:
# This cell uses both sqlite3 and pandas, so import them here; previously the
# imports were commented out and "pd" was undefined.
import sqlite3

import pandas as pd

# Connect to the SQLite database.
conn = sqlite3.connect('db_pycon_youtube_videos.db')

# Load every table into a DataFrame, keyed by table name.
dataframes = {}

try:
    # Fetch the names of all tables in the database.
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()

    for (table_name,) in tables:
        # Quote the identifier (doubling any embedded quote) so that table
        # names containing spaces, keywords or quotes still form valid SQL.
        quoted_name = '"' + table_name.replace('"', '""') + '"'
        df = pd.read_sql_query(f"SELECT * FROM {quoted_name}", conn)
        dataframes[table_name] = df
        print(f"Table '{table_name}' loaded into DataFrame. Shape: {df.shape}")
finally:
    # Close the connection even if a query fails.
    conn.close()

# Preview each DataFrame.
for table_name, df in dataframes.items():
    print(f"\nPreview of '{table_name}' DataFrame:")
    print(df.head())
    print(f"Columns: {df.columns.tolist()}")

In [None]:
# Pick the DataFrame loaded from the 'video' table and display its shape.
df_video = dataframes['video']
df_video.shape

In [None]:
# Print a summary of the DataFrame's columns, non-null counts and dtypes.
df_video.info()

In [None]:
# Display descriptive statistics for the DataFrame's columns.
df_video.describe()

In [None]:
# Display the 10 rows with the largest values in the 'views' column.
df_video.nlargest(10, 'views')

In [None]:
# Display the 'title' column.
df_video['title']