In [4]:
import csv
import gzip
import os
import shutil
import sqlite3

import pandas as pd
import requests

In [1]:


# Base URL of the dataset dumps; each file name below is appended to it
url = "https://datasets.imdbws.com/"

# List of gzipped TSV files to download
files = [
    "title.akas.tsv.gz",
    "title.basics.tsv.gz",
    "title.crew.tsv.gz",
    "title.episode.tsv.gz",
    "title.principals.tsv.gz",
    "title.ratings.tsv.gz",
    "name.basics.tsv.gz"
]

# Directory to save files.
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
directory = "imdb_cache"
os.makedirs(directory, exist_ok=True)

# Function to download and extract a file
def download_extract_file(file_name, timeout=60):
    """Download one gzipped file from ``url`` and leave it decompressed in ``directory``.

    The archive is streamed to ``directory/file_name``, decompressed to the
    same path without its last three characters (the ``.gz`` suffix), and the
    archive is then deleted.

    Args:
        file_name: Name of the remote file; appended to the module-level ``url``.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection fails instead of hanging.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    gz_path = os.path.join(directory, file_name)
    out_path = os.path.join(directory, file_name[:-3])

    # The context manager releases the streamed connection even on failure.
    with requests.get(url + file_name, stream=True, timeout=timeout) as response:
        # Fail here rather than saving an error page and hitting a confusing
        # gzip error during decompression.
        response.raise_for_status()
        with open(gz_path, 'wb') as file:
            # 1 MiB chunks: far fewer Python-level iterations than 1 KiB
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    file.write(chunk)

    with gzip.open(gz_path, 'rb') as f_in, open(out_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.remove(gz_path)  # remove the gzipped file

# Download and extract all files
for file in files:
    print(f"Downloading and extracting {file}...")
    download_extract_file(file)

# Connect to SQLite database
conn = sqlite3.connect('imdb.db')

# Read each TSV file into a pandas DataFrame and write to the database.
# try/finally guarantees the connection is closed even if a load fails.
try:
    for file in files:
        print(f"Loading {file} into database...")
        df = pd.read_csv(
            os.path.join(directory, file[:-3]),
            delimiter='\t',
            low_memory=False,
            # The dumps are plain tab-separated text whose fields are not
            # quoted; titles may contain literal double quotes. Without
            # QUOTE_NONE pandas treats '"' as a quote character and can merge
            # or mangle rows.
            quoting=csv.QUOTE_NONE,
        )
        # Replace '.' with '_' in table names and remove '_tsv_gz' from the end
        table_name = file.replace('.', '_')[:-7]
        df.to_sql(table_name, conn, if_exists='replace')
finally:
    # Close the connection
    conn.close()


Downloading and extracting title.akas.tsv.gz...
Downloading and extracting title.basics.tsv.gz...
Downloading and extracting title.crew.tsv.gz...
Downloading and extracting title.episode.tsv.gz...
Downloading and extracting title.principals.tsv.gz...
Downloading and extracting title.ratings.tsv.gz...
Downloading and extracting name.basics.tsv.gz...
Loading title.akas.tsv.gz into database...
Loading title.basics.tsv.gz into database...
Loading title.crew.tsv.gz into database...
Loading title.episode.tsv.gz into database...
Loading title.principals.tsv.gz into database...
Loading title.ratings.tsv.gz into database...
Loading name.basics.tsv.gz into database...


In [2]:
import sqlite3

# Open the SQLite database file and obtain a cursor on it
conn = sqlite3.connect('imdb.db')
cur = conn.cursor()

# Ask SQLite's catalog for the name of every table and collect all rows
catalog_query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = cur.execute(catalog_query).fetchall()

# Each fetched row is a tuple; its first element is the table name
for table in tables:
    print(table[0])

# Release the database handle
conn.close()


title_akas
title_basics
title_crew
title_episode
title_principals
title_ratings
name_basics


In [5]:
import sqlite3
import pandas as pd

# Open the SQLite database file
conn = sqlite3.connect('imdb.db')

# Tables to preview, in display order
tables = [
    "title_akas",
    "title_basics",
    "title_crew",
    "title_episode",
    "title_principals",
    "title_ratings",
    "name_basics",
]

# For every table: print a header, fetch up to five rows, print them,
# then leave two blank lines before the next table
for table in tables:
    print(f"Showing first few rows of table {table}:")
    preview_query = f"SELECT * from {table} LIMIT 5"
    df = pd.read_sql_query(preview_query, conn)
    print(df, end="\n\n\n")

# Release the database handle
conn.close()


Showing first few rows of table title_akas:
   index    titleId  ordering                      title region language  \
0      0  tt0000001         1                 Карменсіта     UA       \N   
1      1  tt0000001         2                 Carmencita     DE       \N   
2      2  tt0000001         3  Carmencita - spanyol tánc     HU       \N   
3      3  tt0000001         4                 Καρμενσίτα     GR       \N   
4      4  tt0000001         5                 Карменсита     RU       \N   

         types     attributes isOriginalTitle  
0  imdbDisplay             \N               0  
1           \N  literal title               0  
2  imdbDisplay             \N               0  
3  imdbDisplay             \N               0  
4  imdbDisplay             \N               0  


Showing first few rows of table title_basics:
   index     tconst titleType            primaryTitle           originalTitle  \
0      0  tt0000001     short              Carmencita              Carmencita   
1

In [6]:
import sqlite3
import pandas as pd

# Open the SQLite database file
conn = sqlite3.connect('imdb.db')

# Tables whose column layout should be inspected
tables = [
    "title_akas",
    "title_basics",
    "title_crew",
    "title_episode",
    "title_principals",
    "title_ratings",
    "name_basics",
]

# Map each table name to the DataFrame returned by PRAGMA table_info
schemas = {
    table: pd.read_sql_query(f"PRAGMA table_info({table})", conn)
    for table in tables
}

# All queries are done; release the database handle before printing
conn.close()

# Print every collected schema, followed by two blank lines
for table, schema in schemas.items():
    print(f"Schema of {table}:")
    print(schema, end="\n\n\n")


Schema of title_akas:
   cid             name     type  notnull dflt_value  pk
0    0            index  INTEGER        0       None   0
1    1          titleId     TEXT        0       None   0
2    2         ordering  INTEGER        0       None   0
3    3            title     TEXT        0       None   0
4    4           region     TEXT        0       None   0
5    5         language     TEXT        0       None   0
6    6            types     TEXT        0       None   0
7    7       attributes     TEXT        0       None   0
8    8  isOriginalTitle     TEXT        0       None   0


Schema of title_basics:
   cid            name     type  notnull dflt_value  pk
0    0           index  INTEGER        0       None   0
1    1          tconst     TEXT        0       None   0
2    2       titleType     TEXT        0       None   0
3    3    primaryTitle     TEXT        0       None   0
4    4   originalTitle     TEXT        0       None   0
5    5         isAdult     TEXT        0      

In [9]:
import sqlite3
import pandas as pd

# Connect to SQLite database
conn = sqlite3.connect('imdb.db')

# List of tables to search
tables = [
    "name_basics"
]

# Name to look up. It is bound as a query parameter instead of being spliced
# into the SQL text, so names containing quotes (e.g. O'Toole) still work.
primary_name = 'Fred Astaire'

# Query and print every row whose primaryName equals primary_name
try:
    for table in tables:
        print(f"Showing first few rows of table {table}:")
        df = pd.read_sql_query(
            f"SELECT * from {table} WHERE primaryName = ?",
            conn,
            params=(primary_name,),
        )
        print(df)
        print("\n")
finally:
    # Close the connection even if the query raises
    conn.close()


Showing first few rows of table name_basics:
     index      nconst   primaryName birthYear deathYear  \
0        0   nm0000001  Fred Astaire      1899      1987   
1  3381711  nm12584561  Fred Astaire        \N        \N   

                primaryProfession                           knownForTitles  
0  soundtrack,actor,miscellaneous  tt0053137,tt0050419,tt0031983,tt0072308  
1                            None                                       \N  




In [10]:
import sqlite3
import pandas as pd

# Connect to SQLite database
conn = sqlite3.connect('imdb.db')

# List of tables to search
tables = [
    "name_basics"
]

# Identifier to look up. It is bound as a query parameter instead of being
# spliced into the SQL text, so the value never needs manual quoting.
nconst = 'nm0680983'

# Query and print every row whose nconst equals the identifier above
try:
    for table in tables:
        print(f"Showing first few rows of table {table}:")
        df = pd.read_sql_query(
            f"SELECT * from {table} WHERE nconst = ?",
            conn,
            params=(nconst,),
        )
        print(df)
        print("\n")
finally:
    # Close the connection even if the query raises
    conn.close()


Showing first few rows of table name_basics:
    index     nconst  primaryName birthYear deathYear  \
0  642186  nm0680983  Elliot Page      1987        \N   

           primaryProfession                           knownForTitles  
0  actor,producer,soundtrack  tt1375666,tt1877832,tt0424136,tt0467406  


