* 모델 목록 : https://platform.openai.com/docs/models
* 플레이그라운드에서 미리 사용해 보기 :
    * https://platform.openai.com/playground/chat?models=gpt-4o-mini 
    * https://platform.openai.com/playground/chat?models=gpt-4o
* API 키 발급 : https://platform.openai.com/api-keys
* 과금 확인 : https://platform.openai.com/usage

* [PyCon US 2024 - YouTube](https://www.youtube.com/playlist?list=PL2Uw4_HvXqvYhjub9bw4uDAmNtprgAvlJ)

In [None]:
# !pip install -Uq pytube

In [None]:
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import logging
from openai import OpenAI
import os
from tqdm import tqdm
import time

In [None]:
# Create the OpenAI client; the API key is read from the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Configure logging: INFO level, each record shows timestamp, level name and message
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Open the SQLite database file and create the cursor used by the functions below
conn = sqlite3.connect('youtube_playlist.db')
cursor = conn.cursor()

# Helper that adds a column to a table only when the column is missing
def add_column_if_not_exists(table_name, column_name, column_type):
    """Add ``column_name`` to ``table_name`` unless the table already has it.

    The existing columns are read with ``PRAGMA table_info``; the second field
    of each returned row is compared against ``column_name``.  The statements
    are executed on the module-level ``cursor``.

    Args:
        table_name: Name of the table to inspect and alter.
        column_name: Name of the column to add.
        column_type: SQL type used in the ``ADD COLUMN`` clause.
    """
    table_info = cursor.execute(f"PRAGMA table_info({table_name})").fetchall()
    if any(info[1] == column_name for info in table_info):
        return
    cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN {column_name} {column_type}")

# Make sure the `videos` table has the columns that the processing steps below write to
add_column_if_not_exists('videos', 'subtitle_length', 'INTEGER')
add_column_if_not_exists('videos', 'improved_subtitles', 'TEXT')
add_column_if_not_exists('videos', 'korean_subtitles', 'TEXT')
add_column_if_not_exists('videos', 'korean_subtitle_length', 'INTEGER')

# Compute and store the subtitle length for rows that do not have one yet
def update_subtitle_length():
    """Fill ``subtitle_length`` with ``LENGTH(subtitles)`` where it is still NULL.

    Runs a single UPDATE on the module-level ``cursor`` and commits it on the
    module-level ``conn``.
    """
    fill_missing_lengths = 'UPDATE videos SET subtitle_length = LENGTH(subtitles) WHERE subtitle_length IS NULL'
    cursor.execute(fill_missing_lengths)
    conn.commit()

# Split text into fixed-size chunks
def chunk_text(text, max_length=1000):
    """Split ``text`` into consecutive pieces of at most ``max_length`` characters.

    The split is purely positional (every ``max_length`` characters), so a
    chunk may end in the middle of a word or sentence.

    Args:
        text: String to split.  ``None`` or an empty string yields ``[]``.
        max_length: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of substrings that, concatenated in order, reproduce ``text``.

    Raises:
        ValueError: If ``max_length`` is zero or negative.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")
    if not text:
        # Covers both None (e.g. a NULL database value) and the empty string.
        return []
    return [text[i:i + max_length] for i in range(0, len(text), max_length)]

# Split one model reply into its improved-English part and its Korean part
def _split_improved_and_korean(content):
    """Return ``(improved, translated)`` parsed from one model reply.

    The reply is expected to contain the labels ``Improved English:`` and
    ``Korean translation:``.  Everything before the Korean label is treated as
    the improved text, so an improved text that spans several paragraphs is
    kept whole.  If the Korean label is missing, the reply is split at the
    first blank line instead.

    Raises:
        ValueError: If the reply is empty or one of the two parts is missing.
    """
    if not content:
        raise ValueError("model returned an empty reply")

    english_label = "Improved English:"
    korean_label = "Korean translation:"

    head, found, tail = content.partition(korean_label)
    if not found:
        # No label in the reply: fall back to "first paragraph / remainder".
        head, found, tail = content.partition('\n\n')
        if not found:
            raise ValueError("model reply does not contain a translation part")

    improved = head.replace(english_label, "").strip()
    translated = tail.replace(korean_label, "").strip()
    if not improved or not translated:
        raise ValueError("model reply is missing the improved text or the translation")
    return improved, translated


# Improve subtitles and translate them to Korean using the OpenAI API
def improve_and_translate(text):
    """Improve English ``text`` and translate it to Korean, chunk by chunk.

    The text is split with ``chunk_text`` and each chunk is sent to the
    ``gpt-4o-mini`` chat model through the module-level ``client``.  The system
    prompt states the reply format that the parser relies on.

    Args:
        text: English subtitle text.  ``None`` is treated as "nothing to do".

    Returns:
        A tuple ``(improved, translated)`` with the per-chunk results joined
        by single spaces, or ``(None, None)`` if ``text`` is ``None`` or if any
        chunk fails (API error or unparsable reply).
    """
    if text is None:
        return None, None

    improved_chunks = []
    translated_chunks = []

    for chunk in chunk_text(text):
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "system",
                        "content": (
                            "Improve the English text and translate it to Korean. "
                            "Reply in exactly this format:\n"
                            "Improved English: <improved text>\n\n"
                            "Korean translation: <Korean translation>"
                        ),
                    },
                    {"role": "user", "content": f"Improve and translate: {chunk}"},
                ],
            )
            improved, translated = _split_improved_and_korean(
                response.choices[0].message.content
            )
        except Exception as e:
            # Best effort: one failed chunk abandons the whole text so that no
            # partial result is stored for it.
            logging.error("Error in API call: %s", e)
            return None, None

        improved_chunks.append(improved)
        translated_chunks.append(translated)
        time.sleep(1)  # delay between API requests

    return " ".join(improved_chunks), " ".join(translated_chunks)

# Run subtitle improvement and translation for every video that still needs it
def process_subtitles():
    """Improve and translate subtitles for rows lacking either result.

    Selects the rows of ``videos`` whose ``improved_subtitles`` or
    ``korean_subtitles`` is NULL, runs ``improve_and_translate`` on each and
    stores the improved text, the Korean text and the Korean text length.
    Every successful row is committed immediately, so progress is kept if a
    later row fails.  Rows without subtitles, and rows for which
    ``improve_and_translate`` returns an empty result, are skipped.
    """
    cursor.execute('SELECT id, subtitles FROM videos WHERE improved_subtitles IS NULL OR korean_subtitles IS NULL')
    rows = cursor.fetchall()

    for video_id, subtitles in tqdm(rows, desc="Processing subtitles"):
        if not subtitles:
            # NULL or empty subtitles: there is nothing to send to the API.
            continue

        improved, translated = improve_and_translate(subtitles)
        if not (improved and translated):
            continue

        cursor.execute('''
            UPDATE videos
            SET improved_subtitles = ?,
                korean_subtitles = ?,
                korean_subtitle_length = LENGTH(?)
            WHERE id = ?
        ''', (improved, translated, translated, video_id))
        conn.commit()

# Main entry point
if __name__ == "__main__":
    try:
        update_subtitle_length()
        logging.info("Subtitle lengths updated.")

        process_subtitles()
        logging.info("Subtitles improved and translated.")

        # Show the resulting table
        df = pd.read_sql_query("SELECT * FROM videos", conn)
        print(df)
    finally:
        # Close the connection even when one of the steps above raises.
        conn.close()

In [None]:
# Open a fresh database connection and load the whole `videos` table
conn = sqlite3.connect('youtube_playlist.db')
cursor = conn.cursor()

try:
    df = pd.read_sql_query("SELECT * FROM videos", conn)
finally:
    # Release the connection even if the query fails.
    conn.close()
df

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes)
df.info()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Strip hour:minute:second markers of the form [00:00:00]
def remove_timestamps(text):
    """Return ``text`` with every ``[dd:dd:dd]`` marker removed.

    Only markers made of exactly three two-digit groups inside square
    brackets are removed; surrounding whitespace is left untouched.
    """
    timestamp = re.compile(r'\[\d{2}:\d{2}:\d{2}\]')
    return timestamp.sub('', text)
    
# Combine title and subtitles.  Missing values are replaced by '' first:
# otherwise the concatenation yields NaN for that row and remove_timestamps
# fails because it receives a float instead of a string.
df['text'] = df['title'].fillna('') + ' ' + df['subtitles'].fillna('')
df['text'] = df['text'].map(remove_timestamps)

# TF-IDF vectorization (English stop words removed, at most 50 features)
vectorizer = TfidfVectorizer(stop_words='english', max_features=50)
tfidf_matrix = vectorizer.fit_transform(df['text'])

# Turn the matrix into a DataFrame with one column per term
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Average TF-IDF score of each term over all documents, highest first
mean_tfidf = tfidf_df.mean().sort_values(ascending=False)

# Visualization
plt.figure(figsize=(12, 4))
sns.barplot(x=mean_tfidf.index, y=mean_tfidf.values)
plt.title('Top Keywords by TF-IDF Score')
plt.xlabel('Keywords')
plt.ylabel('Average TF-IDF Score')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Show the top keywords with their scores
mean_tfidf.head(50)

In [None]:
from sklearn.cluster import KMeans

# K-means clustering of the TF-IDF document vectors
num_clusters = 3  # desired number of clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=42)  # fixed seed for repeatable results
cluster_labels = kmeans.fit_predict(tfidf_matrix)  # one cluster label per document

# Keyword frequency per cluster
def get_top_keywords(cluster_id, n=30):
    """Return the ``n`` most frequent keywords of one cluster.

    The documents labelled ``cluster_id`` are selected from the module-level
    ``tfidf_matrix``, mapped back to their terms with
    ``vectorizer.inverse_transform`` and the terms are counted across those
    documents.

    Args:
        cluster_id: Cluster label to select documents by.
        n: Maximum number of keywords to return.

    Returns:
        A dict mapping keyword to its count, limited to the ``n`` largest counts.
    """
    in_cluster = cluster_labels == cluster_id
    terms_per_doc = vectorizer.inverse_transform(tfidf_matrix[in_cluster])

    flat_terms = []
    for terms in terms_per_doc:
        flat_terms.extend(terms)

    counts = pd.Series(flat_terms).value_counts()
    return counts.nlargest(n).to_dict()

# Extract the top keywords of every cluster
cluster_keywords = {i: get_top_keywords(i) for i in range(num_clusters)}

# Build a DataFrame: one row per keyword, one column per cluster (0 where absent)
df_keywords = pd.DataFrame(cluster_keywords).fillna(0)
df_keywords = df_keywords.sort_values(by=df_keywords.columns.tolist(), ascending=False)

# Stacked bar chart.  DataFrame.plot creates its own figure, so the size is
# passed to plot() directly; a preceding plt.figure(...) would only leave an
# empty figure behind and its figsize would not apply to the chart.
df_keywords.plot(kind='bar', stacked=True, figsize=(20, 5))
plt.title('Top Keywords by Cluster')
plt.xlabel('Keywords')
plt.ylabel('Frequency')
plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# Print the keywords of each cluster
for cluster, keywords in cluster_keywords.items():
    print(f"Cluster {cluster}:")
    for word, freq in keywords.items():
        print(f"  {word}: {freq}")
    print()

In [None]:
# Preview the first 300 characters of the subtitles of the fifth row from the end
df.iloc[-5]["subtitles"][:300]

In [None]:
# Show all columns of the fifth row from the end
df.iloc[-5]

In [None]:
# Show the last row that has a non-null Korean translation
df[df["korean_subtitles"].notnull()].iloc[-1]

In [None]:
# Rows whose Korean subtitle text is longer than 20 characters
df[df["korean_subtitle_length"] > 20]

In [None]:
# Histogram of subtitle lengths; the trailing semicolon keeps the notebook from echoing the return value
df["subtitle_length"].hist();

In [None]:
# The 10 rows with the largest subtitle_length
df.nlargest(10, "subtitle_length")