## This notebook is used to clean the database. It drops and recreates the `unique_youtube_channel_urls` table, so do not run it unless you know what you are doing.

In [12]:
import sqlite3
import pandas as pd

# Open a connection to the SQLite file at db_path
db_path = '/Users/yuanlu/Code/test/youtube-top-10000-channels/data/output-raw-1129.db'
conn = sqlite3.connect(db_path)

# Ask the schema catalog (sqlite_master) for the name of every table and list them
cursor = conn.cursor()
tables = cursor.execute(
    "SELECT name FROM sqlite_master WHERE type='table';"
).fetchall()
print("Tables in database:")
for table in tables:
    # each row is a 1-tuple holding the table name
    print(f"- {table[0]}")


Tables in database:
- unique_youtube_channel_urls


In [2]:
# List the column names of the unique_youtube_channel_urls table.
# PRAGMA table_info yields one row per column; index 1 of each row is the column name.
cursor.execute("PRAGMA table_info(unique_youtube_channel_urls)")
column_names = [row[1] for row in cursor.fetchall()]
# The label names the table that is actually queried above.
print(f"Column names in unique_youtube_channel_urls table: {column_names}")


Column names in combined_data table: ['Rank', 'Channel_Handle', 'Channel_ID', 'Subscribers', 'YouTube_Channel_URL', 'source_file']


In [3]:
# Count the rows in the unique_youtube_channel_urls table.
cursor.execute("SELECT COUNT(*) FROM unique_youtube_channel_urls")
# COUNT(*) returns a single row with a single value.
row_count = cursor.fetchone()[0]
# The label names the table that is actually queried above.
print(f"Size of unique_youtube_channel_urls table: {row_count:,} rows")


Size of combined_data table: 121,980 rows


In [4]:
# Count how many distinct youtube_channel_url values the table contains
(unique_count,) = cursor.execute(
    "SELECT COUNT(DISTINCT youtube_channel_url) FROM unique_youtube_channel_urls"
).fetchone()
print(f"Number of unique youtube_channel_urls: {unique_count}")

Number of unique youtube_channel_urls: 121980


In [5]:
# Share of rows with a distinct youtube_channel_url, as a percentage of all rows.
# An empty table would make this 0 / 0, so report that case explicitly instead of
# failing with ZeroDivisionError.
if row_count:
    print(f"Percentage of unique youtube_channel_urls: {unique_count / row_count * 100:.2f}%")
else:
    print("Percentage of unique youtube_channel_urls: n/a (table is empty)")


Percentage of unique youtube_channel_urls: 100.00%


In [50]:
# Drop the unique_youtube_channel_urls table if it exists.
#
# Reuses the notebook's shared connection `conn`: sqlite3's `with connection:` block
# only commits or rolls back -- it never closes the connection -- so opening a fresh
# connection here would leave one more open handle behind on every run.
#
# Safety check: the only statement in this notebook that creates
# unique_youtube_channel_urls selects from combined_data. If combined_data is missing,
# the dropped table could not be recreated, so refuse to drop it.
source_exists = conn.execute(
    "SELECT 1 FROM sqlite_master "
    "WHERE type IN ('table', 'view') AND name = 'combined_data' COLLATE NOCASE"
).fetchone()
if source_exists is None:
    raise RuntimeError(
        "combined_data not found in the database; refusing to drop "
        "unique_youtube_channel_urls because it could not be rebuilt"
    )

with conn:  # commits on success, rolls back if the statement raises
    conn.execute("DROP TABLE IF EXISTS unique_youtube_channel_urls")

In [51]:
# Create unique_youtube_channel_urls from combined_data with one row per
# youtube_channel_url. SELECT * carries over every combined_data column.
#
# NOTE(review): with GROUP BY and no aggregate function, SQLite fills the remaining
# columns from an arbitrary row of each group, so which duplicate row survives is not
# defined -- confirm that is acceptable for this dataset.
#
# Reuses the notebook's shared connection `conn`: `with conn:` commits on success and
# rolls back on error but does not close the connection, so no extra connection is
# opened (and leaked) here.
with conn:
    conn.execute("""
        CREATE TABLE unique_youtube_channel_urls AS
        SELECT *
        FROM combined_data
        GROUP BY youtube_channel_url
    """)

In [13]:
# Count the rows whose subscribers value is greater than 100,000
(more_than_100k_subscribers,) = cursor.execute(
    "SELECT COUNT(*) FROM unique_youtube_channel_urls WHERE subscribers > 100000"
).fetchone()
print(f"Number of channels with more than 100,000 subscribers: {more_than_100k_subscribers}")


Number of channels with more than 100,000 subscribers: 53148
