In [None]:
import requests
import sqlite3
import pandas as pd
import time

# NOTE: no "ALTER TABLE movies ADD COLUMN tag" migration is run here.
# At this point in the script `cursor` does not exist yet, so executing the
# statement here raised NameError. It is also unnecessary: the `movies` table
# is dropped and recreated further down with the `tag` column already in its
# schema, so adding the column again would fail with "duplicate column name".


# Function to fetch data from the API for a specific tag
def fetch_data(tag, page_start, page_limit=50, timeout=10):
    """Fetch one page of subjects for *tag* from Douban's search endpoint.

    Args:
        tag: Tag to search for; sent as the ``tag`` query parameter.
        page_start: Offset of the first result; sent as ``page_start``.
        page_limit: Maximum number of results requested per page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list stored under the ``subjects`` key of the JSON response, or
        an empty list when that key is missing or null.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    url = "https://movie.douban.com/j/search_subjects"
    # Let requests build the query string so non-ASCII tags are percent-encoded
    # correctly instead of being interpolated into the URL by hand.
    params = {
        "type": "movie",
        "tag": tag,
        "page_limit": page_limit,
        "page_start": page_start,
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    # Without a timeout a stalled connection would block the whole scrape forever.
    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    # Fail loudly on an error status rather than trying to parse an error page.
    response.raise_for_status()
    # `or []` also covers a response that carries "subjects": null.
    return response.json().get('subjects') or []

# Open the SQLite file that stores the scraped rows and get a cursor for the
# statements that follow
conn = sqlite3.connect('douban_movies.db')
cursor = conn.cursor()

# Rebuild the movies table from scratch on every run: discard any previous
# copy, then create it again with the current set of columns
for ddl in (
    "DROP TABLE IF EXISTS movies",
    '''
CREATE TABLE movies (
    id TEXT PRIMARY KEY,
    title TEXT,
    rate TEXT,
    url TEXT,
    cover TEXT,
    is_new INTEGER,
    tag TEXT
)
''',
):
    cursor.execute(ddl)

# Tags whose result pages are walked one after another
tags = ["热门", "国产剧", "综艺", "美剧", "日剧", "韩剧", "日本动画", "纪录片"]

# Running count of every subject the API returned, summed over all tags
total_movies = 0

for tag in tags:
    print(f"Collecting movies for tag: {tag}")

    # Prime the loop with the first page, then keep requesting the next
    # offset until the API hands back an empty result
    page_start = 0
    movies = fetch_data(tag, page_start)
    while movies:
        for movie in movies:
            # One row per subject; is_new is stored as 0/1 and the tag being
            # processed is recorded alongside it
            row = (
                movie['id'],
                movie['title'],
                movie['rate'],
                movie['url'],
                movie['cover'],
                int(bool(movie['is_new'])),
                tag,
            )
            cursor.execute('''
            INSERT OR REPLACE INTO movies (id, title, rate, url, cover, is_new, tag)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            ''', row)

        # Persist the page before reporting progress on it
        conn.commit()
        total_movies += len(movies)
        print(f"Collected {len(movies)} movies for tag '{tag}'. Total: {total_movies}")

        # Advance to the next page and pause so requests are spaced out
        page_start += 50
        time.sleep(1)
        movies = fetch_data(tag, page_start)

print(f"Total movies collected: {total_movies}")

# Summarise what was stored, then release the connection. The try/finally
# guarantees the connection is closed even if one of the queries fails.
try:
    # Verify data in the database
    cursor.execute("SELECT COUNT(*) FROM movies")
    db_count = cursor.fetchone()[0]
    print(f"Total movies in database: {db_count}")

    # Read data into DataFrame
    df = pd.read_sql_query("SELECT * FROM movies", conn)
    print("\nDataFrame head:")
    print(df.head())

    print("\nDataFrame info:")
    # DataFrame.info() writes its summary to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    df.info()
finally:
    # Close the database connection
    conn.close()