# **Import Dependencies**

In [42]:
# Setup
from dotenv import load_dotenv
import os
import sys
from urllib.parse import quote_plus

load_dotenv()
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

# Dataframe
import numpy as np
import pandas as pd

# SQL
from sqlalchemy import text

# Visualizations
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from src.plot_utils import *
from ipywidgets import IntSlider, SelectMultiple, ToggleButtons, Dropdown

# Utilities
from src.db import get_engine, run_sql, execute_sql

# Constants
MY_SQL_PASSWORD = os.getenv("MYSQL_DB_PASSWORD")

# **Creating The Engine**

In [2]:
# Fail fast with a clear message instead of building a URL that contains the
# literal password "None" when the environment variable is missing.
if MY_SQL_PASSWORD is None:
    raise RuntimeError(
        "MYSQL_DB_PASSWORD is not set; define it in the environment or in the .env file."
    )

# URL-encode the password so reserved characters such as '@', ':' or '/'
# cannot corrupt the connection URL.
engine = get_engine(
    f"mysql+mysqlconnector://root:{quote_plus(MY_SQL_PASSWORD)}@localhost:3306/mco1_imdb"
)

# **Dimensional Model**
Shows the fact table and its corresponding dimension tables. Each preview is limited to 10 rows.

In [5]:
ROW_LIMIT = 10

In [6]:
# Load a ROW_LIMIT-row preview of the fact table and of each dimension table.
# The generator is consumed left to right, so the queries run in the listed order.
facttitle_df, dimtitle_df, dimgenre_df, dimepisode_df = (
    run_sql(f"SELECT * FROM {table_name} LIMIT {ROW_LIMIT}", engine)
    for table_name in ("facttitle", "dimtitle", "dimgenre", "dimepisode")
)


In [7]:
facttitle_df

Unnamed: 0,tconst,averageRating,numVotes,isAdult,titleKey,episodeKey,releaseYearKey
0,tt0000001,5.7,2181,0,870789,,1894
1,tt0000002,5.5,301,0,2637154,,1892
2,tt0000003,6.4,2253,0,3508174,,1892
3,tt0000004,5.2,194,0,4898427,,1892
4,tt0000005,6.2,2991,0,713055,,1893
5,tt0000006,5.0,220,0,945546,,1894
6,tt0000007,5.3,930,0,1044636,,1894
7,tt0000008,5.3,2342,0,1400419,,1894
8,tt0000009,5.3,229,0,3002067,,1894
9,tt0000010,6.8,8096,0,2658594,,1895


In [8]:
dimtitle_df

Unnamed: 0,titleKey,primaryTitle,originalTitle,titleType
0,1,_ _ _ _ Love,_ _ _ _ Love,short
1,2,_ _ _' _ Peephole,_ _ _' _ Peephole,short
2,3,_____,_____,tvMovie
3,4,________,________,short
4,5,__thebowlinalley,__thebowlinalley,short
5,6,_.,_.,short
6,7,_(¯ ~ ¯)_/,_(¯ ~ ¯)_/,tvEpisode
7,8,_+ Space Positive,_+ Space Positive,short
8,9,_ALONE,_ALONE,tvEpisode
9,10,"_ash Aria_, Insergent, Bradley","_ash Aria_, Insergent, Bradley",tvEpisode


In [9]:
dimgenre_df

Unnamed: 0,genreKey,genre
0,18,Action
1,28,Adult
2,17,Adventure
3,3,Animation
4,11,Biography
5,4,Comedy
6,14,Crime
7,1,Documentary
8,8,Drama
9,16,Family


In [10]:
dimepisode_df

Unnamed: 0,episodeKey,parentTconst,seasonNumber,episodeNumber
0,1,tt0035599,1,1
1,2,tt0035803,6,1
2,3,tt0035803,6,2
3,4,tt0035803,6,3
4,5,tt0035803,6,4
5,6,tt0035803,6,5
6,7,tt0035803,6,6
7,8,tt0038276,1,1
8,9,tt0039120,1,1
9,10,tt0039122,1,1


# **Unoptimized Queries**

### **Analytical Report 1: *Overall Titles Year-Centered Report***

#### **Query Result**

In [6]:
params_1 = {
    "topN": 5,
    "titleTypes": "short,tvSeries",
    "isAdult": 0,
    "genreList": "Adventure,Action",
    "votes": 1000,
    "yearDimension": "year",
    "year1": 2000,
    "year2": 2010,
}

query_1 = """
WITH baseTable AS (
	SELECT t.primaryTitle, t.titleType, f.isAdult, f.averageRating, f.numVotes,
		CASE
		  WHEN :yearDimension = 'year' THEN ry.releaseYear
		  WHEN :yearDimension = 'decade' THEN ry.decade
		  ELSE ry.century
		END AS period,
		(
		  SELECT GROUP_CONCAT(DISTINCT g.genre ORDER BY g.genre SEPARATOR ', ')
		  FROM BridgeTitleGenre btg
		  JOIN DimGenre g ON btg.genreKey = g.genreKey
		  WHERE btg.tconst = f.tconst
		) AS genres
    FROM FactTitle f
    JOIN DimTitle t ON t.titleKey = f.titleKey
    JOIN DimReleaseYear ry ON ry.releaseYearKey = f.releaseYearKey
    WHERE 
			(:votes IS NULL OR f.numVotes >= :votes)
		AND (:isAdult IS NULL OR f.isAdult = :isAdult)
		AND (:titleTypes IS NULL OR FIND_IN_SET(t.titleType, :titleTypes) > 0)
        AND EXISTS (
			SELECT 1
            FROM BridgeTitleGenre btg
            JOIN DimGenre g on g.genreKey = btg.genreKey
            WHERE btg.tconst = f.tconst
				AND (:genreList IS NULL OR FIND_IN_SET(g.genre, :genreList) > 0)
		)
		AND (:year1 IS NULL OR (
			CASE
				WHEN :yearDimension = 'year' THEN ry.releaseYear
                WHEN :yearDimension = 'decade' THEN ry.decade
                ELSE ry.century
			END) >= :year1)
		AND (:year2 IS NULL OR (
			CASE
				WHEN :yearDimension = 'year' THEN ry.releaseYear
                WHEN :yearDimension = 'decade' THEN ry.decade
                ELSE ry.century
			END) <= :year2)
),
ranked AS (
	SELECT period, primaryTitle, titleType, isAdult, genres, averageRating, numVotes,
		ROW_NUMBER() OVER (
		  PARTITION BY period
		  ORDER BY averageRating DESC, numVotes DESC
		) AS rn
	FROM baseTable
)
SELECT period, primaryTitle, titleType, genres, isAdult, averageRating, numVotes
FROM ranked
WHERE rn <= :topN 
ORDER BY period DESC, rn;
"""


In [13]:

result_1 = run_sql(query_1, engine, params=params_1)

result_1.head()

Unnamed: 0,period,primaryTitle,titleType,genres,isAdult,averageRating,numVotes
0,2010,Dragonslayer Doppelgänger,short,"Action, Drama, Short",0,8.9,1444
1,2010,Cow Hard,short,"Action, Animation, Comedy",0,8.8,1913
2,2010,Le Morbite,short,"Adventure, Short",0,8.8,1690
3,2010,Justified,tvSeries,"Action, Crime, Drama",0,8.6,127536
4,2010,Adventure Time,tvSeries,"Action, Adventure, Animation",0,8.6,127261


#### **Interactive Visualization**

In [7]:
def plot_top_titles_overall(df):
    """
    Build an animated bar chart of the top-rated titles, one frame per period.

    Parameters
    ----------
    df : pandas.DataFrame
        Query result providing the columns ``period``, ``primaryTitle``,
        ``averageRating``, ``titleType``, ``genres`` and ``numVotes``.
        The frame is not modified.

    Returns
    -------
    plotly.graph_objects.Figure
        Bar chart of ``averageRating`` per title, coloured by ``titleType``
        and animated over ``period``.
    """
    # Derive a new frame rather than assigning into the caller's DataFrame,
    # so re-running the plot never changes the query result it was given.
    # The period is rendered as text (any literal "period=" is stripped).
    plot_df = df.assign(
        period=df['period'].astype(str).str.replace('period=', '')
    ).sort_values(by=["period", "averageRating"], ascending=[True, False])

    fig = px.bar(
        plot_df,
        x="primaryTitle",
        y="averageRating",
        color="titleType",
        animation_frame="period",
        hover_data=["genres", "numVotes"],
        title="Top Titles by Rating (Animated by Period)"
    )

    # Large bottom margin leaves room for the rotated title labels.
    fig.update_layout(
        height=750,
        margin=dict(l=50, r=50, t=80, b=220),
        transition={'duration': 500},
        showlegend=True
    )

    # Single place where the tick angle and font are configured.
    fig.update_xaxes(tickangle=-45, tickfont=dict(size=11))

    return fig


# Widget specification for report 1: each key is a bind parameter of query_1.
# NOTE(review): "transform" is presumably applied by make_interactive_query to
# the widget value before binding — confirm in src.plot_utils.
param_config_1 = {
    "topN": {"widget": IntSlider(value=5, min=1, max=20, description="Top N:")},
    "titleTypes": {
        "widget": SelectMultiple(
            options=["movie", "tvSeries", "short"],
            value=["tvSeries"],
            description="Title Types:"
        ),
        # An empty selection maps to None so the query's `IS NULL` guard
        # disables the filter instead of FIND_IN_SET(..., '') matching nothing.
        "transform": lambda v: ",".join(v) or None
    },
    "isAdult": {
        "widget": ToggleButtons(
            options=[("All", None), ("Non-Adult", 0), ("Adult", 1)],
            description="Adult:"
        )
    },
    "genreList": {
        "widget": SelectMultiple(
            options=["Action", "Adventure", "Comedy", "Drama", "Horror"],
            value=["Action"],
            description="Genres:"
        ),
        # Same empty-selection handling as titleTypes.
        "transform": lambda v: ",".join(v) or None
    },
    "votes": {"widget": IntSlider(value=1000, min=0, max=100000, step=500, description="Min Votes:")},
    "yearDimension": {
        "widget": Dropdown(options=["year", "decade", "century"], value="year", description="Year Dim:")
    },
    "year1": {"widget": IntSlider(value=2000, min=1900, max=2025, description="From:")},
    "year2": {"widget": IntSlider(value=2010, min=1900, max=2025, description="To:")},
}

make_interactive_query(engine, text(query_1), param_config_1, plot_top_titles_overall, title="Top Titles by Rating")


VBox(children=(HBox(children=(IntSlider(value=5, description='Top N:', max=20, min=1), SelectMultiple(descript…

### **Analytical Report 2: *Genre-Centered Report***

#### **Query Result**

In [37]:
params_2 = {
    "topN": 10,
    "titleTypes": "videoGame",
    "isAdult": 0,
    "genreList": "Animation,Action,Crime",
    "votes": 500,
    "yearDimension": "decade",
	"yearList": "1980,2020",
    "year1": None,
    "year2": None
}

query_2 = """
WITH baseTable AS (
	SELECT f.tconst, t.primaryTitle, t.titleType, f.isAdult, f.averageRating, f.numVotes,
		(
		  SELECT GROUP_CONCAT(DISTINCT g.genre ORDER BY g.genre SEPARATOR ', ')
		  FROM BridgeTitleGenre btg
		  JOIN DimGenre g ON btg.genreKey = g.genreKey
		  WHERE btg.tconst = f.tconst
		) AS genres,
		CASE
		  WHEN :yearDimension = 'year' THEN ry.releaseYear
		  WHEN :yearDimension = 'decade' THEN ry.decade
			ELSE ry.century
		END AS period
	FROM FactTitle f
    JOIN DimTitle t ON t.titleKey = f.titleKey
    JOIN DimReleaseYear ry ON ry.releaseYearKey = f.releaseYearKey
    WHERE 
			(:votes IS NULL OR f.numVotes >= :votes)
		AND (:isAdult IS NULL OR f.isAdult = :isAdult)
		AND (:titleTypes IS NULL OR FIND_IN_SET(t.titleType, :titleTypes) > 0)
"""

year_range_query_2 = """
		-- If range of years, retain this block
		AND (:year1 IS NULL OR (
			CASE
				WHEN :yearDimension = 'year' THEN ry.releaseYear
                WHEN :yearDimension = 'decade' THEN ry.decade
                ELSE ry.century
			END) >= :year1)
			AND (:year2 IS NULL OR (
			CASE
				WHEN :yearDimension = 'year' THEN ry.releaseYear
                WHEN :yearDimension = 'decade' THEN ry.decade
                ELSE ry.century
			END) <= :year2)
"""
year_select_query_2 = """
		-- If selection of years, retain this block
        AND (:yearList IS NULL OR FIND_IN_SET(CAST(
			CASE
				WHEN :yearDimension = 'year' THEN ry.releaseYear
                WHEN :yearDimension = 'decade' THEN ry.decade
                ELSE ry.century
			END AS CHAR), :yearList) > 0)
"""
remaining_query_2 = """
), ranked AS (
	SELECT g.genre, b.primaryTitle, b.titleType, b.isAdult, b.genres, b.averageRating, b.numVotes, b.period,
		ROW_NUMBER() OVER (
		  PARTITION BY g.genre
		  ORDER BY b.averageRating DESC, b.numVotes DESC
		) AS rn
	FROM baseTable b
    JOIN BridgeTitleGenre btg ON btg.tconst = b.tconst
    JOIN DimGenre g ON btg.genreKey = g.genreKey
    WHERE (:genreList IS NULL OR FIND_IN_SET(g.genre, :genreList) > 0)
)
SELECT genre, primaryTitle, titleType, genres, isAdult, averageRating, numVotes, period
FROM ranked
WHERE rn <= :topN 
ORDER BY genre, rn;
"""
# Assemble the final statement: base query + optional year filter + ranking tail.
merged_query_2 = query_2

# The range fragment tolerates a NULL bound on either side, so a single bound
# is enough to select it; the explicit year list applies only when no bound is set.
if params_2["year1"] is not None or params_2["year2"] is not None:
    merged_query_2 += year_range_query_2
elif params_2["yearList"] is not None:
    merged_query_2 += year_select_query_2

merged_query_2 += remaining_query_2


In [None]:

result_2 = run_sql(merged_query_2, engine, params=params_2)

result_2.head()

Unnamed: 0,genre,primaryTitle,titleType,genres,isAdult,averageRating,numVotes,period
0,Action,The Last of Us: Part I,videoGame,"Action, Adventure, Drama",0,9.7,12519,2020
1,Action,Baldur's Gate III,videoGame,"Action, Adventure, Drama",0,9.6,8599,2020
2,Action,Clair Obscur: Expedition 33,videoGame,"Action, Adventure, Fantasy",0,9.6,5673,2020
3,Action,Mass Effect: Legendary Edition,videoGame,"Action, Adventure, Drama",0,9.6,3677,2020
4,Action,God of War Ragnarök: Valhalla,videoGame,"Action, Adventure, Drama",0,9.5,23496,2020
5,Action,The Legend of Zelda: Tears of the Kingdom,videoGame,"Action, Adventure, Fantasy",0,9.4,3894,2020
6,Action,Death Stranding 2: On the Beach,videoGame,"Action, Adventure, Drama",0,9.4,2070,2020
7,Action,Kingdom Come: Deliverance II,videoGame,"Action, Adventure, Drama",0,9.4,1624,2020
8,Action,Elden Ring,videoGame,"Action, Adventure, Fantasy",0,9.3,14727,2020
9,Action,Resident Evil 4,videoGame,"Action, Adventure, Drama",0,9.3,8782,2020


#### **Interactive Visualization**

In [None]:
def plot_top_titles_by_genre(df, top_n=10):
    """
    Build a faceted bar chart of the top-rated titles, one facet per genre.

    Parameters
    ----------
    df : pandas.DataFrame
        Query result providing the columns ``genre``, ``primaryTitle``,
        ``averageRating``, ``numVotes``, ``titleType`` and ``period``.
    top_n : int, default 10
        Maximum number of titles kept per genre.
        NOTE(review): the "Top N" slider allows up to 30 — confirm whether
        make_interactive_query forwards that value here, otherwise rows
        beyond the tenth are dropped per genre.

    Returns
    -------
    plotly.graph_objects.Figure
        The faceted chart, or a placeholder figure with a "No data found"
        annotation when ``df`` is empty.
    """
    # An empty result would make facet_wrap 0 and the height formula divide by
    # zero, so return the same placeholder used by the other report plots.
    if df.empty:
        fig = go.Figure()
        fig.add_annotation(
            text="No data found for the current selection.",
            showarrow=False, x=0.5, y=0.5, font=dict(size=14)
        )
        return fig

    # Keep only top N titles per genre (based on rating and votes)
    df_top = (
        df.sort_values(["genre", "averageRating", "numVotes"], ascending=[True, False, False])
        .groupby("genre")
        .head(top_n)
        .reset_index(drop=True)
    )

    # Determine reasonable number of facet columns based on genre count
    num_genres = df_top["genre"].nunique()
    facet_wrap = min(num_genres, 3)  # max 3 per row for readability

    # Create the bar chart with facets
    fig = px.bar(
        df_top,
        x="primaryTitle",
        y="averageRating",
        color="genre",
        facet_col="genre",
        facet_col_wrap=facet_wrap,
        hover_data=["titleType", "numVotes", "period"],
        height=max(500, 250 * ((num_genres // facet_wrap) + 1)),  # dynamic height
    )

    # Layout improvements
    fig.update_layout(
        title="Top Titles by Genre and Rating",
        margin=dict(l=40, r=40, t=80, b=200),
        yaxis_title="Average Rating",
        xaxis_title="Title",
        showlegend=True,
        autosize=True,
    )

    # Rotate x-axis labels for readability
    fig.update_xaxes(tickangle=45, tickfont=dict(size=9))

    # Start the y-axis just below the lowest rating so small differences
    # between top titles stay visible
    fig.update_yaxes(
        range=[df_top["averageRating"].min() - 0.2, 10],
        dtick=0.2,
        title_standoff=10,
    )

    return fig


# Widget specification for report 2: each key is a bind parameter of merged_query_2.
# NOTE(review): "transform" is presumably applied by make_interactive_query to
# the widget value before binding — confirm in src.plot_utils.
param_config_2 = {
    "topN": {"widget": IntSlider(value=10, min=1, max=30, description="Top N:")},
    "titleTypes": {
        "widget": SelectMultiple(
            options=["movie", "tvSeries", "short", "videoGame"],
            value=["videoGame"],
            description="Title Types:"
        ),
        # An empty selection maps to None so the query's `IS NULL` guard
        # disables the filter instead of FIND_IN_SET(..., '') matching nothing.
        "transform": lambda v: ",".join(v) or None
    },
    "isAdult": {
        "widget": ToggleButtons(
            options=[("All", None), ("Non-Adult", 0), ("Adult", 1)],
            description="Adult:"
        )
    },
    "genreList": {
        "widget": SelectMultiple(
            options=["Action", "Adventure", "Animation", "Crime", "Drama", "Comedy"],
            value=["Animation", "Action", "Crime"],
            description="Genres:"
        ),
        # Same empty-selection handling as titleTypes.
        "transform": lambda v: ",".join(v) or None
    },
    "votes": {"widget": IntSlider(value=500, min=0, max=100000, step=500, description="Min Votes:")},
    "yearDimension": {
        "widget": Dropdown(options=["year", "decade", "century"], value="decade", description="Year Dim:")
    },
    "yearList": {
        "widget": SelectMultiple(
            options=[str(y) for y in range(1920, 2030, 10)],
            value=["1980", "2020"],
            description="Decades:"
        ),
        # Same empty-selection handling as titleTypes.
        "transform": lambda v: ",".join(v) or None
    },
    "year1": {"widget": Dropdown(options=[None], value=None, description="From:")},  # inactive in this query
    "year2": {"widget": Dropdown(options=[None], value=None, description="To:")}
}

make_interactive_query(engine, text(merged_query_2), param_config_2, plot_top_titles_by_genre, title="Top Titles by Genre")

VBox(children=(HBox(children=(IntSlider(value=10, description='Top N:', max=30, min=1), SelectMultiple(descrip…

### **Analytical Report 3: *Film Type-Centered Report***

#### **Query Result**

In [9]:
# Bind parameters for report 3 (top titles per title type).
params_3 = {
    "topN": 3,
    "genreList": 'Horror',
    "isAdult": 0,
    "yearDimension": 'year',
    "yearList": '2000,2005,2010,2015,2020',
    "year1": None,
    "year2": None,
    "votes": 800,
    # Title types are stored in lowercase camel case (e.g. "short", "tvEpisode"),
    # so use the same spelling as the data and the other reports instead of
    # relying on a case-insensitive collation.
    "titleTypes": 'movie,tvSeries,short'
}

query_3 = """
WITH baseTable AS (
	SELECT t.primaryTitle, t.titleType, f.isAdult, f.averageRating, f.numVotes,
		(
		  SELECT GROUP_CONCAT(DISTINCT g.genre ORDER BY g.genre SEPARATOR ', ')
		  FROM BridgeTitleGenre btg
		  JOIN DimGenre g ON btg.genreKey = g.genreKey
		  WHERE btg.tconst = f.tconst
		) AS genres,
		CASE
		  WHEN :yearDimension = 'year' THEN ry.releaseYear
		  WHEN :yearDimension = 'decade' THEN ry.decade
			ELSE ry.century
		END AS period
	FROM FactTitle f
    JOIN DimTitle t ON t.titleKey = f.titleKey
    JOIN DimReleaseYear ry ON ry.releaseYearKey = f.releaseYearKey
    WHERE 
			(:votes IS NULL OR f.numVotes >= :votes)
		AND (:isAdult IS NULL OR f.isAdult = :isAdult)
		AND (:titleTypes IS NULL OR FIND_IN_SET(t.titleType, :titleTypes) > 0)
		AND EXISTS (
			SELECT 1
            FROM BridgeTitleGenre btg
            JOIN DimGenre g on g.genreKey = btg.genreKey
            WHERE btg.tconst = f.tconst
				AND (:genreList IS NULL OR FIND_IN_SET(g.genre, :genreList) > 0)
		)
"""
year_range_query_3 = """
		-- If range of years, retain this block
		AND (:year1 IS NULL OR (
			CASE
				WHEN :yearDimension = 'year' THEN ry.releaseYear
                WHEN :yearDimension = 'decade' THEN ry.decade
                ELSE ry.century
			END) >= :year1)
			AND (:year2 IS NULL OR (
			CASE
				WHEN :yearDimension = 'year' THEN ry.releaseYear
                WHEN :yearDimension = 'decade' THEN ry.decade
                ELSE ry.century
			END) <= :year2)
"""
year_select_query_3 = """
		-- If selection of years, retain this block
        AND (:yearList IS NULL OR FIND_IN_SET(CAST(
			CASE
				WHEN :yearDimension = 'year' THEN ry.releaseYear
                WHEN :yearDimension = 'decade' THEN ry.decade
                ELSE ry.century
			END AS CHAR), :yearList) > 0)
"""
remaining_query_3 = """
), ranked AS (
	SELECT b.titleType, b.primaryTitle, b.isAdult, b.genres, b.averageRating, b.numVotes, b.period,
		ROW_NUMBER() OVER (
		  PARTITION BY b.titleType
		  ORDER BY b.averageRating DESC, b.numVotes DESC
		) AS rn
	FROM baseTable b
)
SELECT titleType, primaryTitle, genres, isAdult, averageRating, numVotes, period
FROM ranked
WHERE rn <= :topN 
ORDER BY titleType, rn;
"""

# Assemble the final statement: base query + optional year filter + ranking tail.
merged_query_3 = query_3

# The range fragment tolerates a NULL bound on either side, so a single bound
# is enough to select it; the explicit year list applies only when no bound is set.
if params_3["year1"] is not None or params_3["year2"] is not None:
    merged_query_3 += year_range_query_3
elif params_3["yearList"] is not None:
    merged_query_3 += year_select_query_3

merged_query_3 += remaining_query_3


In [None]:

result_3 = run_sql(merged_query_3, engine, params=params_3)

result_3

#### **Interactive Visualization**

In [None]:
def plot_top_titles_by_type(df):
    """
    Plot top titles as bars faceted by titleType and coloured by genre list.

    Parameters
    ----------
    df : pandas.DataFrame
        Query result providing the columns ``titleType``, ``primaryTitle``,
        ``averageRating``, ``numVotes``, ``genres``, ``period`` and ``isAdult``.

    Returns
    -------
    plotly.graph_objects.Figure
        Bar chart of ``averageRating`` per title with one facet column per
        title type.
    """
    # Order bars like the SQL ranking: rating first, vote count as tie-breaker,
    # so equally rated titles keep a deterministic order.
    df = df.sort_values(
        ["titleType", "averageRating", "numVotes"],
        ascending=[True, False, False],
    )

    fig = px.bar(
        df,
        x="primaryTitle",
        y="averageRating",
        color="genres",  # Use concatenated genres
        facet_col="titleType",
        hover_data=["numVotes", "genres", "period", "isAdult"],
        height=600
    )

    fig.update_layout(
        autosize=True,
        margin=dict(l=40, r=40, t=60, b=160),
        title="Top Titles by Type and Rating",
        xaxis_title="Title",
        yaxis_title="Average Rating"
    )

    # Rotate x-axis labels for readability
    fig.update_xaxes(tickangle=45)

    return fig

# Widget specification for report 3: each key is a bind parameter of merged_query_3.
# NOTE(review): "transform" is presumably applied by make_interactive_query to
# the widget value before binding — confirm in src.plot_utils.
param_config_3 = {
    "topN": {"widget": IntSlider(value=3, min=1, max=20, description="Top N:")},
    "titleTypes": {
        # Values use the lowercase spelling stored in the title dimension
        # (matching the other reports) rather than "Movie"/"Short".
        "widget": SelectMultiple(
            options=["movie", "tvSeries", "short"],
            value=["movie", "tvSeries", "short"],
            description="Title Types:"
        ),
        # An empty selection maps to None so the query's `IS NULL` guard
        # disables the filter instead of FIND_IN_SET(..., '') matching nothing.
        "transform": lambda v: ",".join(v) or None
    },
    "isAdult": {
        "widget": ToggleButtons(
            options=[("All", None), ("Non-Adult", 0), ("Adult", 1)],
            description="Adult:"
        )
    },
    "genreList": {
        "widget": SelectMultiple(
            options=["Action", "Adventure", "Animation", "Crime", "Drama", "Comedy", "Horror"],
            value=["Horror"],
            description="Genres:"
        ),
        # Same empty-selection handling as titleTypes.
        "transform": lambda v: ",".join(v) or None
    },
    "votes": {"widget": IntSlider(value=800, min=0, max=50000, step=100, description="Min Votes:")},
    "yearDimension": {
        "widget": Dropdown(options=["year", "decade", "century"], value="year", description="Year Dim:")
    },
    "yearList": {
        "widget": SelectMultiple(
            options=[str(y) for y in range(2000, 2025, 5)],
            value=["2000", "2005", "2010", "2015", "2020"],
            description="Years:"
        ),
        # Same empty-selection handling as titleTypes.
        "transform": lambda v: ",".join(v) or None
    },
    # Range bounds are inactive here: merged_query_3 is built with the year-list filter.
    "year1": {"widget": Dropdown(options=[None], value=None, description="From:")},
    "year2": {"widget": Dropdown(options=[None], value=None, description="To:")}
}

# --- 3. Make the plot interactive ---
make_interactive_query( engine, text(merged_query_3), param_config_3, plot_top_titles_by_type, title="Top Titles by Type and Rating")


VBox(children=(HBox(children=(IntSlider(value=3, description='Top N:', max=20, min=1), SelectMultiple(descript…

### **Analytical Report 4: *Genre & Release Year Cross-Dimensional Report***

#### **Query Result**

In [25]:
params_4 = {
    "genreList": "Action,Adventure",
    "year1": 2000,
    "year2": 2020,
    "yearDimension": "year",
    "yearList": None 
}

query_4 = """
SELECT
    CASE
        WHEN :yearDimension = 'year' THEN ry.releaseYear
        WHEN :yearDimension = 'decade' THEN ry.decade
        ELSE ry.century
    END AS period,
    g.genre,
    ROUND(AVG(f.averageRating), 2) AS avgRating
FROM FactTitle f
JOIN BridgeTitleGenre btg 
  ON btg.tconst = f.tconst
JOIN DimGenre g 
  ON g.genreKey = btg.genreKey
JOIN DimReleaseYear ry 
  ON ry.releaseYearKey = f.releaseYearKey
JOIN DimTitle t 
  ON t.titleKey = f.titleKey
WHERE
  (:genreList IS NULL OR FIND_IN_SET(g.genre, :genreList) > 0)
"""

year_range_query_4 = """
  AND (:year1 IS NULL OR (
    CASE
      WHEN :yearDimension = 'year' THEN ry.releaseYear
      WHEN :yearDimension = 'decade' THEN ry.decade
      ELSE ry.century
    END) >= :year1)
  AND (:year2 IS NULL OR (
    CASE
      WHEN :yearDimension = 'year' THEN ry.releaseYear
      WHEN :yearDimension = 'decade' THEN ry.decade
      ELSE ry.century
    END) <= :year2)
"""

year_select_query_4 = """
  AND (:yearList IS NULL OR FIND_IN_SET(CAST(
    CASE
      WHEN :yearDimension = 'year' THEN ry.releaseYear
      WHEN :yearDimension = 'decade' THEN ry.decade
      ELSE ry.century
    END AS CHAR), :yearList) > 0)
"""

remaining_query_4 = """
    GROUP BY period, g.genre
    ORDER BY period, g.genre;
"""

# Assemble the final statement: base query + optional year filter + grouping tail.
merged_query_4 = query_4

# The range fragment tolerates a NULL bound on either side, so a single bound
# is enough to select it; the explicit year list applies only when no bound is set.
if params_4["year1"] is not None or params_4["year2"] is not None:
    merged_query_4 += year_range_query_4
elif params_4["yearList"] is not None:
    merged_query_4 += year_select_query_4

merged_query_4 += remaining_query_4


In [13]:
result_4 = run_sql(merged_query_4, engine, params=params_4)

result_4.head()

Unnamed: 0,period,genre,avgRating
0,2000,Action,6.91
1,2000,Adventure,6.99
2,2001,Action,6.96
3,2001,Adventure,7.1
4,2002,Action,6.96


#### **Interactive Visualizations**

In [None]:
# Widget specification for report 4: each key is a bind parameter of merged_query_4.
# NOTE(review): "transform" is presumably applied by make_interactive_query to
# the widget value before binding — confirm in src.plot_utils.
param_config_4 = {
    "genreList": {
        "widget": SelectMultiple(
            options=["Action", "Adventure", "Animation", "Crime", "Drama", "Comedy", "Horror"],
            value=["Action", "Adventure"],
            description="Genres:"
        ),
        # An empty selection maps to None so the query's `IS NULL` guard
        # disables the filter instead of FIND_IN_SET(..., '') matching nothing.
        "transform": lambda v: ",".join(v) or None
    },
    "yearDimension": {
        "widget": Dropdown(
            options=["year", "decade", "century"],
            value="year",
            description="Year Dim:"
        )
    },
    "yearList": {
        "widget": SelectMultiple(
            options=[str(y) for y in range(2000, 2025)],
            value=[],
            description="Years:"
        ),
        # Same empty-selection handling as genreList (the default selection is empty).
        "transform": lambda v: ",".join(v) or None
    },
    "year1": {
        "widget": Dropdown(
            options=[None] + list(range(2000, 2025)),
            value=2000,
            description="From:"
        )
    },
    "year2": {
        "widget": Dropdown(
            options=[None] + list(range(2000, 2025)),
            value=2020,
            description="To:"
        )
    }
}

def plot_avg_rating_trend_with_pivot(df):
    """
    Build the Analytical Report #4 figure: three stacked panels showing
      - a line chart of the average rating per genre over the periods,
      - a heatmap of the period x genre pivot,
      - a table with the first rows of that pivot.

    Parameters
    ----------
    df : pandas.DataFrame
        Query result providing the columns ``period``, ``genre`` and
        ``avgRating``, with at most one row per (period, genre) pair
        (``DataFrame.pivot`` raises on duplicates). The frame is not modified.

    Returns
    -------
    plotly.graph_objects.Figure
        A single figure (for use with make_interactive_query), or a
        placeholder figure with a "No data found" annotation when ``df``
        is empty.
    """
    if df.empty:
        fig = go.Figure()
        fig.add_annotation(
            text="No data found for the current selection.",
            showarrow=False, x=0.5, y=0.5, font=dict(size=14)
        )
        return fig

    # Derive a new frame instead of assigning into the caller's DataFrame;
    # periods are rendered as text.
    df = df.assign(period=df['period'].astype(str))
    # NOTE(review): missing (period, genre) pairs are filled with 0, so they
    # appear as a rating of 0 in the heatmap and `hoverongaps` never triggers —
    # confirm this is intended.
    df_pivot = df.pivot(index='period', columns='genre', values='avgRating').fillna(0)

    # Create subplots with proper height balance
    fig = make_subplots(
        rows=3, cols=1,
        row_heights=[0.45, 0.35, 0.20],
        vertical_spacing=0.08,
        subplot_titles=(
            "Average Rating Trend by Genre",
            "Average Rating Heatmap (Period × Genre)",
            "Pivot Table Summary"
        ),
        specs=[[{"type": "xy"}],
               [{"type": "heatmap"}],
               [{"type": "domain"}]]  # table uses domain for flexible sizing
    )

    # --- 1. Line chart: one trace per genre ---
    for genre, d in df.groupby("genre"):
        fig.add_trace(
            go.Scatter(
                x=d["period"], y=d["avgRating"],
                mode="lines+markers",
                name=genre,
                line=dict(width=2),
                hovertemplate=f"Genre: {genre}<br>Period: %{{x}}<br>Avg Rating: %{{y}}"
            ),
            row=1, col=1
        )

    # --- 2. Heatmap of the pivot ---
    fig.add_trace(
        go.Heatmap(
            z=df_pivot.values,
            x=df_pivot.columns,
            y=df_pivot.index,
            colorscale="Viridis",
            colorbar=dict(title="Avg Rating", len=0.5, y=0.65),
            hoverongaps=False
        ),
        row=2, col=1
    )

    # --- 3. Pivot table, truncated to the first rows ---
    max_rows_display = 15
    num_rows = len(df_pivot)
    shown_rows = df_pivot.iloc[:max_rows_display]

    # Add a note if truncated
    caption = ""
    if num_rows > max_rows_display:
        caption = f"Showing first {max_rows_display} of {num_rows} periods"

    fig.add_trace(
        go.Table(
            header=dict(
                values=["Period"] + list(df_pivot.columns),
                fill_color="#B2EBF2",
                align="center",
                font=dict(color="black", size=12)
            ),
            cells=dict(
                values=[shown_rows.index] + [shown_rows[col] for col in shown_rows.columns],
                fill_color="#E0F7FA",
                align="center",
                font=dict(size=11)
            ),
            domain=dict(x=[0, 1], y=[0, 0.15])  # restrict table height
        ),
        row=3, col=1
    )

    # --- Layout and titles ---
    fig.update_layout(
        height=1300,
        title="Analytical Report #4: Genre & Release Year Trend",
        title_x=0.5,
        title_font=dict(size=20),
        margin=dict(l=50, r=50, t=100, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=1.05, xanchor="center", x=0.5),
        plot_bgcolor="white"
    )

    # The subplot titles are stored as layout annotations, so the caption must
    # be appended with add_annotation; passing `annotations=[...]` to
    # update_layout would overwrite those titles.
    if caption:
        fig.add_annotation(
            text=caption,
            x=0.5, y=0.02,
            xref="paper", yref="paper",
            showarrow=False,
            font=dict(size=12, color="gray")
        )

    fig.update_yaxes(title_text="Average Rating", row=1, col=1)
    fig.update_xaxes(title_text="Period", row=1, col=1)
    fig.update_xaxes(title_text="Genre", row=2, col=1)
    fig.update_yaxes(title_text="Period", row=2, col=1)

    return fig


make_interactive_query(
    engine,
    text(merged_query_4),
    param_config_4,
    plot_avg_rating_trend_with_pivot,
    title="Analytical Report #4: Genre & Release Year Trend"
)

VBox(children=(HBox(children=(SelectMultiple(description='Genres:', index=(0, 1), options=('Action', 'Adventur…

### **Analytical Report 5: *Title Type & Release Year Cross-Dimensional Report***

#### **Query Result**

In [30]:
params_5 = {
    "titleTypes": "movie,tvSeries",
    "year1": 1980,
    "year2": 2020,
    "yearList": None,       
    "yearDimension": "decade"
}

query_5 = """
SELECT
  CASE
    WHEN :yearDimension = 'year' THEN ry.releaseYear
    WHEN :yearDimension = 'decade' THEN ry.decade
    ELSE ry.century
  END AS period, 
  t.titleType, 
  ROUND(AVG(f.averageRating), 2) AS avgRating
FROM FactTitle f
JOIN DimReleaseYear ry 
  ON ry.releaseYearKey = f.releaseYearKey
JOIN DimTitle t 
  ON t.titleKey = f.titleKey
WHERE
  (:titleTypes IS NULL OR FIND_IN_SET(t.titleType, :titleTypes) > 0)
"""

year_range_query_5 = """
  AND (:year1 IS NULL OR (
    CASE
      WHEN :yearDimension = 'year' THEN ry.releaseYear
      WHEN :yearDimension = 'decade' THEN ry.decade
      ELSE ry.century
    END) >= :year1)
  AND (:year2 IS NULL OR (
    CASE
      WHEN :yearDimension = 'year' THEN ry.releaseYear
      WHEN :yearDimension = 'decade' THEN ry.decade
      ELSE ry.century
    END) <= :year2)
"""

year_select_query_5 = """
  AND (:yearList IS NULL OR FIND_IN_SET(CAST(
    CASE
      WHEN :yearDimension = 'year' THEN ry.releaseYear
      WHEN :yearDimension = 'decade' THEN ry.decade
      ELSE ry.century
    END AS CHAR), :yearList) > 0)
"""

remaining_query_5 = """
GROUP BY period, t.titleType
ORDER BY period, t.titleType;
"""

# Assemble the final statement: base query + optional year filter + grouping tail.
merged_query_5 = query_5

# The range fragment tolerates a NULL bound on either side, so a single bound
# is enough to select it; the explicit year list applies only when no bound is set.
if params_5["year1"] is not None or params_5["year2"] is not None:
    merged_query_5 += year_range_query_5
elif params_5["yearList"] is not None:
    merged_query_5 += year_select_query_5

merged_query_5 += remaining_query_5


In [28]:

result_5 = run_sql(merged_query_5, engine, params=params_5)

result_5.head()


Unnamed: 0,period,titleType,avgRating
0,1980,movie,5.88
1,1980,tvSeries,6.99
2,1990,movie,6.01
3,1990,tvSeries,6.78
4,2000,movie,6.19


#### **Interactive Visualization**

In [None]:
# Widget specification for report 5: each key is a bind parameter of merged_query_5.
# NOTE(review): "transform" is presumably applied by make_interactive_query to
# the widget value before binding — confirm in src.plot_utils.
param_config_5 = {
    "titleTypes": {
        "widget": SelectMultiple(
            options=["movie", "tvSeries", "short", "videoGame"],
            value=["movie", "tvSeries"],
            description="Title Types:"
        ),
        # An empty selection maps to None so the query's `IS NULL` guard
        # disables the filter instead of FIND_IN_SET(..., '') matching nothing.
        "transform": lambda v: ",".join(v) or None
    },
    "yearDimension": {
        "widget": Dropdown(
            options=["year", "decade", "century"],
            value="decade",
            description="Year Dim:"
        )
    },
    "year1": {"widget": IntSlider(value=1980, min=1900, max=2020, step=10, description="From:")},
    "year2": {"widget": IntSlider(value=2020, min=1900, max=2020, step=10, description="To:")},
    # Year list is inactive here: merged_query_5 is built with the range filter.
    "yearList": {"widget": Dropdown(options=[None], value=None, description="Years (optional):")},
}

def plot_avg_rating_by_title_type(df):
    """
    Build the Analytical Report #5 figure: average rating by period and title type.

    The figure stacks three panels:
      - Line chart of the average-rating trend, one line per title type
      - Heatmap of the period × titleType pivot
      - Summary table of the pivot, truncated to the first 15 periods
        (a caption is added when rows were cut off)

    Parameters
    ----------
    df : pandas.DataFrame
        Query result with ``period``, ``titleType`` and ``avgRating`` columns.
        The frame is not modified.

    Returns
    -------
    plotly.graph_objects.Figure
        A single Plotly Figure (for make_interactive_query). When ``df`` is
        empty, a blank figure carrying a "no data" message is returned.
    """
    if df.empty:
        fig = go.Figure()
        fig.add_annotation(
            text="No data found for the current selection.",
            showarrow=False, x=0.5, y=0.5, font=dict(size=14)
        )
        return fig

    # Work on a derived frame so the caller's DataFrame is left untouched.
    # Periods become strings so the axes treat them as categories.
    data = df.assign(period=df["period"].astype(str))

    # NOTE(review): missing period/type combinations are filled with 0, so they
    # render as a rating of 0 rather than as gaps (hoverongaps has nothing to skip).
    df_pivot = data.pivot(index="period", columns="titleType", values="avgRating").fillna(0)

    fig = make_subplots(
        rows=3, cols=1,
        row_heights=[0.45, 0.35, 0.20],
        vertical_spacing=0.08,
        subplot_titles=(
            "Average Rating Trend by Title Type",
            "Average Rating Heatmap (Period × Title Type)",
            "Pivot Table Summary"
        ),
        specs=[[{"type": "xy"}],
               [{"type": "heatmap"}],
               [{"type": "domain"}]]
    )

    # --- Line Chart: one trace per title type ---
    for ttype, d in data.groupby("titleType"):
        fig.add_trace(
            go.Scatter(
                x=d["period"],
                y=d["avgRating"],
                mode="lines+markers",
                name=ttype,
                line=dict(width=2),
                hovertemplate=f"Type: {ttype}<br>Period: %{{x}}<br>Avg Rating: %{{y}}"
            ),
            row=1, col=1
        )

    # --- Heatmap ---
    fig.add_trace(
        go.Heatmap(
            z=df_pivot.values,
            x=df_pivot.columns,
            y=df_pivot.index,
            colorscale="Viridis",
            colorbar=dict(title="Avg Rating", len=0.5, y=0.65),
            hoverongaps=False
        ),
        row=2, col=1
    )

    # --- Summary table (truncated) ---
    max_rows_display = 15
    num_rows = len(df_pivot)
    shown_rows = df_pivot.iloc[:max_rows_display]
    caption = ""
    if num_rows > max_rows_display:
        caption = f"Showing first {max_rows_display} of {num_rows} periods"

    fig.add_trace(
        go.Table(
            header=dict(
                values=["Period"] + list(df_pivot.columns),
                fill_color="#B2EBF2",
                align="center",
                font=dict(color="black", size=12)
            ),
            cells=dict(
                values=[shown_rows.index] + [shown_rows[col] for col in shown_rows.columns],
                fill_color="#E0F7FA",
                align="center",
                font=dict(size=11)
            ),
            domain=dict(x=[0, 1], y=[0, 0.15])
        ),
        row=3, col=1
    )

    fig.update_layout(
        height=1300,
        title="Analytical Report #5: Title Type & Time Trend Analysis",
        title_x=0.5,
        title_font=dict(size=20),
        margin=dict(l=50, r=50, t=100, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=1.05, xanchor="center", x=0.5),
        plot_bgcolor="white"
    )

    # The subplot titles are stored as layout annotations. Passing
    # `annotations=[...]` to update_layout would be applied to those existing
    # entries and overwrite the titles, so the caption is appended instead.
    if caption:
        fig.add_annotation(
            text=caption,
            x=0.5, y=0.02,
            xref="paper", yref="paper",
            showarrow=False,
            font=dict(size=12, color="gray")
        )

    fig.update_yaxes(title_text="Average Rating", row=1, col=1)
    fig.update_xaxes(title_text="Period", row=1, col=1)
    fig.update_xaxes(title_text="Title Type", row=2, col=1)
    fig.update_yaxes(title_text="Period", row=2, col=1)

    return fig

# Render the interactive Report #5 view (widget controls plus the figure built
# by plot_avg_rating_by_title_type). make_interactive_query is presumably
# provided by src.plot_utils — confirm its contract there.
# NOTE: merged_query_5 was assembled once from params_5, so which year filter
# (range vs. list) is part of the SQL is fixed here and does not follow the widgets.
make_interactive_query(
    engine,
    text(merged_query_5),
    param_config_5,
    plot_avg_rating_by_title_type,
    title="Analytical Report #5: Title Type & Time Trend"
)

VBox(children=(HBox(children=(SelectMultiple(description='Title Types:', index=(0, 1), options=('movie', 'tvSe…

### **Analytical Report 6: *Title Type & Genre Cross-Dimensional Report***

#### **Query Result**

In [33]:
# Static bindings for the Report #6 preview. Both values are comma-separated
# lists consumed by FIND_IN_SET in query_6; None disables the respective filter.
# Title types are spelled exactly as stored ("movie", not "Movie") so the
# filter does not depend on a case-insensitive collation.
params_6 = {
    "titleTypes": "movie,tvSeries",
    "genreList": "Action,Adventure"
}

# Average rating per (titleType, genre), optionally restricted to the given
# title types and genres.
# NOTE(review): DimReleaseYear is joined but no ry column is referenced; as an
# inner join it may still drop titles without a matching release year —
# confirm before removing it.
query_6 = """
SELECT 
  t.titleType, 
  g.genre, 
  ROUND(AVG(f.averageRating), 2) AS avgRating
FROM FactTitle f
JOIN BridgeTitleGenre btg 
  ON btg.tconst = f.tconst
JOIN DimGenre g 
  ON g.genreKey = btg.genreKey
JOIN DimReleaseYear ry 
  ON ry.releaseYearKey = f.releaseYearKey
JOIN DimTitle t 
  ON t.titleKey = f.titleKey
WHERE
  (:titleTypes IS NULL OR FIND_IN_SET(t.titleType, :titleTypes) > 0)
  AND (:genreList IS NULL OR FIND_IN_SET(g.genre, :genreList) > 0)
GROUP BY t.titleType, g.genre
ORDER BY t.titleType, g.genre;
"""


In [None]:

# Run the Report #6 query with the static params_6 bindings and preview the
# first rows of the result.
result_6 = run_sql(query_6, engine, params=params_6)

result_6.head()


#### **Interactive Visualization**

In [None]:
# Widget configuration for Report #6. Keys match the bind parameters of
# query_6; each multi-select is collapsed by its "transform" into the
# comma-separated string that FIND_IN_SET receives.
param_config_6 = dict(
    titleTypes=dict(
        widget=SelectMultiple(
            options=["movie", "tvSeries", "short", "videoGame"],
            value=["movie", "tvSeries"],
            description="Title Types:",
        ),
        transform=lambda v: ",".join(v),
    ),
    genreList=dict(
        widget=SelectMultiple(
            options=["Action", "Adventure", "Comedy", "Drama", "Crime", "Horror"],
            value=["Action", "Adventure"],
            description="Genres:",
        ),
        transform=lambda v: ",".join(v),
    ),
)

def plot_avg_rating_by_type_and_genre(df):
    """
    Build the Analytical Report #6 figure: average rating by genre and title type.

    The figure stacks three panels:
      - Line chart comparing average ratings per genre, one line per title type
      - Heatmap of the genre × titleType pivot
      - Summary table of the pivot, truncated to the first 15 genres
        (a caption is added when rows were cut off)

    Parameters
    ----------
    df : pandas.DataFrame
        Query result with ``titleType``, ``genre`` and ``avgRating`` columns.

    Returns
    -------
    plotly.graph_objects.Figure
        A single Plotly Figure for make_interactive_query(). When ``df`` is
        empty, a blank figure carrying a "no data" message is returned.
    """
    if df.empty:
        fig = go.Figure()
        fig.add_annotation(
            text="No data found for the current selection.",
            showarrow=False, x=0.5, y=0.5, font=dict(size=14)
        )
        return fig

    # NOTE(review): missing genre/type combinations are filled with 0, so they
    # render as a rating of 0 rather than as gaps (hoverongaps has nothing to skip).
    df_pivot = df.pivot(index="genre", columns="titleType", values="avgRating").fillna(0)

    # --- Set up subplot layout
    fig = make_subplots(
        rows=3, cols=1,
        row_heights=[0.45, 0.35, 0.20],
        vertical_spacing=0.08,
        subplot_titles=(
            "Average Rating by Genre and Title Type",
            "Average Rating Heatmap (Genre × Title Type)",
            "Pivot Table Summary"
        ),
        specs=[[{"type": "xy"}],
               [{"type": "heatmap"}],
               [{"type": "domain"}]]
    )

    # --- Line Chart ---
    # Each titleType gets its own line across genres
    for ttype, d in df.groupby("titleType"):
        fig.add_trace(
            go.Scatter(
                x=d["genre"],
                y=d["avgRating"],
                mode="lines+markers",
                name=ttype,
                line=dict(width=2),
                hovertemplate=f"Type: {ttype}<br>Genre: %{{x}}<br>Avg Rating: %{{y}}"
            ),
            row=1, col=1
        )

    # --- Heatmap ---
    fig.add_trace(
        go.Heatmap(
            z=df_pivot.values,
            x=df_pivot.columns,
            y=df_pivot.index,
            colorscale="Viridis",
            colorbar=dict(title="Avg Rating", len=0.5, y=0.65),
            hoverongaps=False
        ),
        row=2, col=1
    )

    # --- Summary table (truncated) ---
    max_rows_display = 15
    num_rows = len(df_pivot)
    shown_rows = df_pivot.iloc[:max_rows_display]
    caption = ""
    if num_rows > max_rows_display:
        caption = f"Showing first {max_rows_display} of {num_rows} genres"

    fig.add_trace(
        go.Table(
            header=dict(
                values=["Genre"] + list(df_pivot.columns),
                fill_color="#B2EBF2",
                align="center",
                font=dict(color="black", size=12)
            ),
            cells=dict(
                values=[shown_rows.index] + [shown_rows[col] for col in shown_rows.columns],
                fill_color="#E0F7FA",
                align="center",
                font=dict(size=11)
            ),
            domain=dict(x=[0, 1], y=[0, 0.15])
        ),
        row=3, col=1
    )

    # --- Layout & Titles ---
    fig.update_layout(
        height=1200,
        title="Analytical Report #6: Title Type × Genre Average Rating Analysis",
        title_x=0.5,
        title_font=dict(size=20),
        margin=dict(l=50, r=50, t=100, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=1.05, xanchor="center", x=0.5),
        plot_bgcolor="white"
    )

    # The subplot titles are stored as layout annotations. Passing
    # `annotations=[...]` to update_layout would be applied to those existing
    # entries and overwrite the titles, so the caption is appended instead.
    if caption:
        fig.add_annotation(
            text=caption,
            x=0.5, y=0.02,
            xref="paper", yref="paper",
            showarrow=False,
            font=dict(size=12, color="gray")
        )

    # --- Axis Labels ---
    fig.update_yaxes(title_text="Average Rating", row=1, col=1)
    fig.update_xaxes(title_text="Genre", row=1, col=1)
    fig.update_xaxes(title_text="Title Type", row=2, col=1)
    fig.update_yaxes(title_text="Genre", row=2, col=1)

    return fig


# Render the interactive Report #6 view (widget controls plus the figure built
# by plot_avg_rating_by_type_and_genre). make_interactive_query is presumably
# provided by src.plot_utils — confirm its contract there.
make_interactive_query(
    engine,
    text(query_6),
    param_config_6,
    plot_avg_rating_by_type_and_genre,
    title="Analytical Report #6: Title Type & Genre Rating Comparison"
)


VBox(children=(HBox(children=(SelectMultiple(description='Title Types:', index=(0, 1), options=('movie', 'tvSe…

### **Analytical Report 7: *Series & Season Cross-Dimensional Report***

#### **Query Result**

In [39]:
# Static bindings for the Report #7 preview: two series and seasons 1 through 10,
# each passed as a comma-separated string for FIND_IN_SET.
params_7 = dict(
    seriesTitles=",".join(["Breaking Bad", "Friends"]),
    seasonNumbers=",".join(str(season) for season in range(1, 11)),
)

# The inner query averages episode ratings per (parent series, season); the
# outer query resolves the parent's title and applies the optional series and
# season filters (a NULL parameter disables its filter).
query_7 = """
SELECT 
  t.primaryTitle, 
  s.seasonNumber, 
  ROUND(s.seasonAvgRating, 2) AS avgRatings
FROM (
  SELECT 
    e.parentTconst, 
    e.seasonNumber, 
    AVG(f.averageRating) AS seasonAvgRating
  FROM FactTitle f
  JOIN DimEpisode e
    ON e.episodeKey = f.episodeKey
  GROUP BY e.parentTconst, e.seasonNumber
) AS s
JOIN FactTitle f2
  ON f2.tconst = s.parentTconst
JOIN DimTitle t 
  ON t.titleKey = f2.titleKey
WHERE 
  (:seriesTitles IS NULL OR FIND_IN_SET(t.primaryTitle, :seriesTitles) > 0)
  AND (:seasonNumbers IS NULL OR FIND_IN_SET(CAST(s.seasonNumber AS CHAR), :seasonNumbers) > 0)
ORDER BY t.primaryTitle, s.seasonNumber;
"""


In [None]:

# Run the Report #7 query with the static params_7 bindings and preview the
# first rows of the result.
result_7 = run_sql(query_7, engine, params=params_7)

result_7.head()

#### **Interactive Visualization**

In [50]:
# Widget configuration for Report #7. Keys match the bind parameters of
# query_7; each multi-select is collapsed by its "transform" into the
# comma-separated string that FIND_IN_SET receives.
param_config_7 = dict(
    seriesTitles=dict(
        widget=SelectMultiple(
            options=["Breaking Bad", "Friends", "The Office", "Game of Thrones"],
            value=["Breaking Bad", "Friends"],
            description="Series:",
        ),
        transform=lambda v: ",".join(v),
    ),
    # Seasons are offered as strings "1".."10"; the first five are preselected.
    seasonNumbers=dict(
        widget=SelectMultiple(
            options=list(map(str, range(1, 11))),
            value=list(map(str, range(1, 6))),
            description="Seasons:",
        ),
        transform=lambda v: ",".join(v),
    ),
)


def plot_series_season_trends(df):
    """
    Build the Analytical Report #7 figure: average rating per season and series.

    The figure stacks three panels:
      - Line chart: average rating trend per season, one line per series
      - Heatmap: season × series matrix of average ratings
      - Summary table of the pivot, truncated to the first 12 seasons
        (a caption is added when rows were cut off)

    Parameters
    ----------
    df : pandas.DataFrame
        Query result with ``primaryTitle``, ``seasonNumber`` and ``avgRatings``
        columns. The frame is not modified. Rows without a season number are
        ignored, and rows sharing the same (primaryTitle, seasonNumber) pair
        are averaged into one.

    Returns
    -------
    plotly.graph_objects.Figure
        A Plotly Figure (for make_interactive_query). When no usable rows
        remain, a blank figure carrying a "no data" message is returned.
    """
    keys = ["primaryTitle", "seasonNumber"]

    data = df
    if not data.empty:
        # astype returns a new frame, so the caller's DataFrame stays untouched.
        # Seasons must be integers for numeric ordering; NULL seasons cannot be
        # cast and are dropped first.
        data = data.dropna(subset=["seasonNumber"]).astype({"seasonNumber": int})

        # Distinct series can share one primaryTitle, which yields duplicate
        # (title, season) pairs and would make DataFrame.pivot raise. Collapse
        # them so the figure still renders.
        # NOTE(review): this averages unrelated same-named series together; the
        # query would need to expose a series identifier to keep them apart.
        if data.duplicated(subset=keys).any():
            data = (
                data.groupby(keys, as_index=False)["avgRatings"]
                .mean()
                .round({"avgRatings": 2})
            )

        data = data.sort_values(keys)

    if data.empty:
        fig = go.Figure()
        fig.add_annotation(
            text="No data found for the selected series/seasons.",
            showarrow=False, x=0.5, y=0.5, font=dict(size=14)
        )
        return fig

    # Pivot table
    # NOTE(review): seasons a series does not have are filled with 0, so they
    # render as a rating of 0 rather than as gaps (hoverongaps has nothing to skip).
    df_pivot = data.pivot(index="seasonNumber", columns="primaryTitle", values="avgRatings").fillna(0)

    # --- Subplots layout
    fig = make_subplots(
        rows=3, cols=1,
        row_heights=[0.45, 0.35, 0.20],
        vertical_spacing=0.08,
        subplot_titles=(
            "Average Rating Trend per Season (by Series)",
            "Average Rating Heatmap (Series × Season)",
            "Pivot Table Summary"
        ),
        specs=[[{"type": "xy"}],
               [{"type": "heatmap"}],
               [{"type": "domain"}]]
    )

    # --- Line Chart: one trace per series ---
    for series, d in data.groupby("primaryTitle"):
        fig.add_trace(
            go.Scatter(
                x=d["seasonNumber"],
                y=d["avgRatings"],
                mode="lines+markers",
                name=series,
                line=dict(width=2),
                hovertemplate=f"Series: {series}<br>Season %{{x}}<br>Avg Rating: %{{y}}"
            ),
            row=1, col=1
        )

    # --- Heatmap ---
    fig.add_trace(
        go.Heatmap(
            z=df_pivot.values,
            x=df_pivot.columns,
            y=df_pivot.index,
            colorscale="Viridis",
            colorbar=dict(title="Avg Rating", len=0.5, y=0.65),
            hoverongaps=False
        ),
        row=2, col=1
    )

    # --- Summary table (truncated) ---
    max_rows_display = 12
    num_rows = len(df_pivot)
    shown_rows = df_pivot.iloc[:max_rows_display]
    caption = ""
    if num_rows > max_rows_display:
        caption = f"Showing first {max_rows_display} of {num_rows} seasons"

    fig.add_trace(
        go.Table(
            header=dict(
                values=["Season"] + list(df_pivot.columns),
                fill_color="#B2DFDB",
                align="center",
                font=dict(color="black", size=12)
            ),
            cells=dict(
                values=[shown_rows.index] + [shown_rows[col] for col in shown_rows.columns],
                fill_color="#E0F2F1",
                align="center",
                font=dict(size=11)
            ),
            domain=dict(x=[0, 1], y=[0, 0.15])
        ),
        row=3, col=1
    )

    # --- Layout ---
    fig.update_layout(
        height=1100,
        title="Analytical Report #7: Series & Season Average Rating Trends",
        title_x=0.5,
        title_font=dict(size=20),
        margin=dict(l=50, r=50, t=100, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=1.05, xanchor="center", x=0.5),
        plot_bgcolor="white"
    )

    # The subplot titles are stored as layout annotations. Passing
    # `annotations=[...]` to update_layout would be applied to those existing
    # entries and overwrite the titles, so the caption is appended instead.
    if caption:
        fig.add_annotation(
            text=caption,
            x=0.5, y=0.02,
            xref="paper", yref="paper",
            showarrow=False,
            font=dict(size=12, color="gray")
        )

    # --- Axis labels ---
    fig.update_yaxes(title_text="Average Rating", row=1, col=1)
    fig.update_xaxes(title_text="Season Number", row=1, col=1)
    fig.update_yaxes(title_text="Season Number", row=2, col=1)
    fig.update_xaxes(title_text="Series Title", row=2, col=1)

    return fig


# Render the interactive Report #7 view (widget controls plus the figure built
# by plot_series_season_trends). make_interactive_query is presumably provided
# by src.plot_utils — confirm its contract there.
make_interactive_query(
    engine,
    text(query_7),
    param_config_7,
    plot_series_season_trends,
    title="Analytical Report #7: Series & Season Average Rating Trends"
)



VBox(children=(HBox(children=(SelectMultiple(description='Series:', index=(0, 1), options=('Breaking Bad', 'Fr…

# Optimized Queries

## Adding Indices

In [None]:
# Secondary indexes on the columns the analytical reports join and filter on.
# FactTitle has no genreKey column (genres are linked through
# BridgeTitleGenre), so the genre index is created on the bridge table;
# indexing FactTitle(genreKey) would fail with an unknown-column error.
# NOTE(review): this is a multi-statement script — confirm that the helper
# executing it splits or supports multiple statements per call.
index_query = """
CREATE INDEX idx_bridge_genreKey ON BridgeTitleGenre(genreKey);
CREATE INDEX idx_fact_titleKey ON FactTitle(titleKey);
CREATE INDEX idx_fact_avgrating ON FactTitle(averageRating);
CREATE INDEX idx_fact_tconst ON FactTitle(tconst);
CREATE INDEX idx_fact_episodeKey ON FactTitle(episodeKey);

CREATE INDEX idx_fact_releaseYearKey ON FactTitle(releaseYearKey);
"""

