# DataFlix

In [1]:
import os
from urllib.parse import quote

import altair as alt
import pandas as pd
import plotly.express as px
from dotenv import load_dotenv
from sqlalchemy import create_engine
# Load the Snowflake connection settings from the project-local .env file.
load_dotenv(dotenv_path='./private/.env')

user = os.getenv("SNOWFLAKE_USER")
password = os.getenv("SNOWFLAKE_PASSWORD")
account = os.getenv("SNOWFLAKE_ACCOUNT")
warehouse = os.getenv("SNOWFLAKE_WAREHOUSE")
database = os.getenv("SNOWFLAKE_DATABASE")
schema = os.getenv("SNOWFLAKE_SCHEMA")

# Fail fast: os.getenv returns None for an unset variable, and the f-string
# below would otherwise embed the literal text "None" into the URL.
_missing_settings = [
    name
    for name, value in (
        ("SNOWFLAKE_USER", user),
        ("SNOWFLAKE_PASSWORD", password),
        ("SNOWFLAKE_ACCOUNT", account),
        ("SNOWFLAKE_WAREHOUSE", warehouse),
        ("SNOWFLAKE_DATABASE", database),
        ("SNOWFLAKE_SCHEMA", schema),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing Snowflake settings in environment/.env: "
        + ", ".join(_missing_settings)
    )

# Percent-encode the credentials so characters such as "@", ":", "/" or "%"
# in a user name or password are not parsed as URL delimiters.
# safe="" makes quote() encode "/" as well.
engine = create_engine(
    f'snowflake://{quote(user, safe="")}:{quote(password, safe="")}'
    f'@{account}/{database}/{schema}?warehouse={warehouse}'
)

def run_query(_query: str) -> pd.DataFrame:
    """Run a SQL statement through the module-level engine.

    Args:
        _query: SQL text, handed unchanged to ``pd.read_sql``.

    Returns:
        The query result as a DataFrame.

    Raises:
        RuntimeError: If the module-level ``engine`` is ``None``.
    """
    if engine is not None:
        return pd.read_sql(_query, engine)
    raise RuntimeError("Failed to create SQLAlchemy engine.")

In [2]:
# Check how average rating relates to vote count on a random sample of
# staging.fact_ratings, with vote count on a log-scaled x-axis.
# `num_votes > 0` also excludes NULLs; zero cannot be placed on a log axis.
query = """
SELECT average_rating, num_votes
FROM staging.fact_ratings SAMPLE (.3)
WHERE average_rating IS NOT NULL
AND num_votes > 0
"""
df = run_query(query)

chart = alt.Chart(df).mark_circle(opacity=0.4).encode(
    x=alt.X(
        'num_votes:Q',
        scale=alt.Scale(type='log'),
        title='Number of votes (log scale)',
    ),
    y=alt.Y('average_rating:Q', title='Average rating'),
).properties(
    width=600,
    height=400,
    title='Average rating vs. number of votes (sampled rows)',
)

# In a notebook the chart renders when it is the cell's final expression.
chart


In [3]:
# Preview up to 100 rows of the titles table; the bare `df` on the last line
# is what renders the table below the cell.
table_name = "staging.fact_titles"
preview_sql = f"select * from {table_name} limit 100"
df = run_query(preview_sql)
df

Unnamed: 0,title_id,primary_title,original_title,title_type,is_adult,start_year,end_year,runtime_minutes,genres,average_rating,num_votes,director_names,writer_names,ingested_at,load_ts
0,tt0661671,Episode dated 16 February 2003,Episode dated 16 February 2003,tvEpisode,False,2003,,,"Comedy,Talk-Show",,,,"Jaime Bauzá, David Navas",2025-07-18 02:01:51.213000+00:00,2025-07-18 02:49:18.302000+00:00
1,tt0661672,Episode dated 23 February 2003,Episode dated 23 February 2003,tvEpisode,False,2003,,,"Comedy,Talk-Show",,,,"Jaime Bauzá, David Navas",2025-07-18 02:01:51.213000+00:00,2025-07-18 02:49:18.302000+00:00
2,tt0661673,Episode dated 9 March 2003,Episode dated 9 March 2003,tvEpisode,False,2003,,,"Comedy,Talk-Show",,,,"Jaime Bauzá, David Navas",2025-07-18 02:01:51.213000+00:00,2025-07-18 02:49:18.302000+00:00
3,tt0661674,Episode dated 16 March 2003,Episode dated 16 March 2003,tvEpisode,False,2003,,,"Comedy,Talk-Show",,,,"Jaime Bauzá, David Navas",2025-07-18 02:01:51.213000+00:00,2025-07-18 02:49:18.302000+00:00
4,tt0661675,Episode dated 13 April 2003,Episode dated 13 April 2003,tvEpisode,False,2003,,,"Comedy,Talk-Show",,,,"Jaime Bauzá, David Navas",2025-07-18 02:01:51.213000+00:00,2025-07-18 02:49:18.302000+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,tt0661766,Episode dated 16 February 2002,Episode dated 16 February 2002,tvEpisode,False,2002,,,"Comedy,Music",,,José Luis Moreno,,2025-07-18 02:01:51.213000+00:00,2025-07-18 02:49:18.302000+00:00
96,tt0661767,Episode dated 23 February 2002,Episode dated 23 February 2002,tvEpisode,False,2002,,,"Comedy,Music",,,José Luis Moreno,,2025-07-18 02:01:51.213000+00:00,2025-07-18 02:49:18.302000+00:00
97,tt0661768,Episode dated 9 March 2002,Episode dated 9 March 2002,tvEpisode,False,2002,,,"Comedy,Music",,,José Luis Moreno,,2025-07-18 02:01:51.213000+00:00,2025-07-18 02:49:18.302000+00:00
98,tt0661769,Episode dated 16 March 2002,Episode dated 16 March 2002,tvEpisode,False,2002,,,"Comedy,Music",,,José Luis Moreno,,2025-07-18 02:01:51.213000+00:00,2025-07-18 02:49:18.302000+00:00
