### API 사용하기 

* https://www.inflearn.com/courses

* https://www.inflearn.com/courses/client/api/v1/course/search?isDiscounted=false&isNew=false&pageNumber=3&pageSize=60&types=ONLINE

In [None]:
import requests
import sqlite3
import time
from datetime import datetime
import pandas as pd

# Date-stamp the SQLite file name with the day this code runs.
today = f"{datetime.today():%Y-%m-%d}"
db_file = "inflearn_courses-" + today + ".db"
db_file  # bare expression: echoes the file name when run as a notebook cell

In [None]:
def create_database(path=None):
    """Open the SQLite database and make sure the ``courses`` table exists.

    Args:
        path: Database to open. When omitted, the module-level ``db_file``
            is used; pass ``":memory:"`` for a throwaway in-memory database.

    Returns:
        An open ``sqlite3.Connection``. The caller is responsible for
        closing it.

    Raises:
        sqlite3.Error: If the database cannot be opened or the table cannot
            be created. When table creation fails, the connection is closed
            before the error propagates.
    """
    conn = sqlite3.connect(db_file if path is None else path)
    try:
        conn.execute('''CREATE TABLE IF NOT EXISTS courses
                     (id INTEGER PRIMARY KEY,
                      title TEXT,
                      instructor TEXT,
                      payPrice INTEGER,
                      regularPrice INTEGER,
                      discountRate REAL,
                      studentCount INTEGER,
                      reviewCount INTEGER,
                      star REAL,
                      skills TEXT,
                      created_at TEXT)''')
        conn.commit()
    except sqlite3.Error:
        # Do not leak the handle when the schema cannot be set up.
        conn.close()
        raise
    return conn

def fetch_courses(page_number, page_size=60, timeout=10):
    """Fetch one page of online courses from the Inflearn search API.

    Args:
        page_number: Page to request (sent as ``pageNumber``).
        page_size: Number of items per page (sent as ``pageSize``).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On timeouts and other transport errors.
    """
    url = "https://www.inflearn.com/courses/client/api/v1/course/search"
    params = {
        "isDiscounted": "false",
        "isNew": "false",
        "pageNumber": page_number,
        "pageSize": page_size,
        "types": "ONLINE",
    }
    response = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on an error status instead of trying to parse an error body.
    response.raise_for_status()
    return response.json()

In [None]:
# Fetch the first page and look at a single item to see the response layout.
data = fetch_courses(page_number=1, page_size=60)
# An item can also be reached directly, e.g.
# data["data"]["items"][0]["instructor"]["name"]
course_inst = data["data"]["items"][0]

# The fields printed below are spread over three nested objects of the item.
course = course_inst["course"]
instructor = course_inst["instructor"]
listPrice = course_inst["listPrice"]

# Print the values in the same order as the columns of the ``courses`` table.
print(
    *(course.get(key) for key in ('id', 'title')),
    instructor.get('name'),
    *(listPrice.get(key) for key in ('payPrice', 'regularPrice', 'discountRate')),
    *(course.get(key) for key in ('studentCount', 'reviewCount', 'star')),
    course.get('metadata')["skills"],
    datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
)

In [None]:
def _course_row(course_inst, collected_at):
    """Flatten one API item into a tuple in ``courses`` column order."""
    course = course_inst["course"]
    instructor = course_inst["instructor"]
    listPrice = course_inst["listPrice"]

    # Guard against a missing or null ``metadata`` / ``skills`` so one odd
    # item cannot abort the whole run; such items get an empty skills string.
    skills = (course.get('metadata') or {}).get('skills') or []

    return (course.get('id'),
            course.get('title'),
            instructor.get('name'),
            listPrice.get('payPrice'),
            listPrice.get('regularPrice'),
            listPrice.get('discountRate'),
            course.get('studentCount'),
            course.get('reviewCount'),
            course.get('star'),
            ','.join(skills),
            collected_at)


def scrape_inflearn():
    """Download every page of courses and store them in the SQLite database.

    Pages are requested starting at page 1 until a page comes back with no
    items. Rows are written with ``INSERT OR REPLACE`` and committed once per
    page, and the loop sleeps one second between pages. Progress and the
    final total are printed to standard output.

    Returns:
        None.

    Raises:
        Any exception raised by ``create_database``, ``fetch_courses`` or
        SQLite propagates to the caller; the database connection is closed
        first.
    """
    conn = create_database()
    total_courses = 0

    try:
        c = conn.cursor()
        page_number = 1

        while True:
            data = fetch_courses(page_number)
            courses = data["data"].get('items', [])

            # An empty page marks the end of the listing.
            if not courses:
                break

            # Each row carries the timestamp of the moment it was built.
            rows = [
                _course_row(course_inst,
                            datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
                for course_inst in courses
            ]
            c.executemany(
                '''INSERT OR REPLACE INTO courses VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                rows,
            )

            conn.commit()
            total_courses += len(courses)
            print(f"Collected {len(courses)} courses from page {page_number}. Total: {total_courses}")

            page_number += 1
            time.sleep(1)  # delay so the server is not put under load
    finally:
        # Release the database handle even when a request or insert fails.
        conn.close()

    print(f"Scraping completed. Total courses collected: {total_courses}")

# Start the scrape only when this code is executed directly, not when it is
# imported as a module.
if __name__ == '__main__':
    scrape_inflearn()

In [None]:
def test_database():
    """Print a sanity report for the ``courses`` table stored in ``db_file``.

    The report covers, in order: the total row count, up to five random
    rows, MIN/MAX/AVG of each numeric column, the NULL count of every
    column, and the rows with the latest and earliest ``created_at``.

    Returns:
        None. Everything is written to standard output.

    Raises:
        sqlite3.Error: If the database cannot be read (for example, the
            ``courses`` table does not exist). The connection is closed
            before the error propagates.
    """
    # Numeric columns of the ``courses`` table that get MIN/MAX/AVG statistics.
    numeric_columns = ['payPrice', 'regularPrice', 'discountRate',
                       'studentCount', 'reviewCount', 'star']

    def quote(identifier):
        # Identifiers cannot be bound as SQL parameters, so quote them instead.
        return '"' + identifier.replace('"', '""') + '"'

    # Connect to the database.
    conn = sqlite3.connect(db_file)
    try:
        cursor = conn.cursor()

        # Total number of records.
        cursor.execute("SELECT COUNT(*) FROM courses")
        total_records = cursor.fetchone()[0]
        print(f"총 레코드 수: {total_records}")

        # Up to five random records (all of them when fewer than five exist).
        sample_size = min(5, total_records)
        cursor.execute("SELECT * FROM courses ORDER BY RANDOM() LIMIT ?",
                       (sample_size,))
        sample_records = cursor.fetchall()

        print("\n무작위 샘플 레코드:")
        for record in sample_records:
            print(record)

        # Column names as they actually exist in the table.
        cursor.execute("PRAGMA table_info(courses)")
        columns = [column[1] for column in cursor.fetchall()]

        # Basic statistics for the numeric columns that are present.
        print("\n수치형 컬럼 기본 통계:")
        for column in numeric_columns:
            if column not in columns:
                continue
            name = quote(column)
            cursor.execute(
                f"SELECT MIN({name}), MAX({name}), AVG({name}) FROM courses")
            minimum, maximum, average = cursor.fetchone()
            print(f"{column}: min={minimum}, max={maximum}, avg={average}")

        # Number of NULL values in each column.
        print("\n각 컬럼의 null 값 개수:")
        for column in columns:
            cursor.execute(
                f"SELECT COUNT(*) FROM courses WHERE {quote(column)} IS NULL")
            null_count = cursor.fetchone()[0]
            print(f"{column}: {null_count}")

        # Most recently and least recently written records, by created_at.
        cursor.execute("SELECT * FROM courses ORDER BY created_at DESC LIMIT 1")
        print("\n마지막 레코드:")
        print(cursor.fetchone())

        cursor.execute("SELECT * FROM courses ORDER BY created_at ASC LIMIT 1")
        print("\n처음 레코드:")
        print(cursor.fetchone())
    finally:
        # Close the connection even when one of the queries fails.
        conn.close()

# Print the database report only when this code is executed directly, not
# when it is imported as a module.
if __name__ == '__main__':
    test_database()

In [None]:
# Connect to the database and load the whole ``courses`` table into a DataFrame.
conn = sqlite3.connect(db_file)
try:
    df = pd.read_sql_query("SELECT * FROM courses", conn)
finally:
    # Close the connection even if the query fails (e.g. the table is missing).
    conn.close()

# Show the DataFrame (``display`` is expected to come from IPython/Jupyter).
display(df)

# Report how many rows were loaded.
print(f"총 {len(df)}개의 강의가 저장되어 있습니다.")