<span style="font-size: 36px;">W4111_Spring_2025_002 - Introduction to Databases:<br>Non-Programming Track Project<br>Sprint 1</span>

# Overview

# Initialization

## General Python Packages

In [None]:
import copy

In [None]:
import json

In [None]:
import pandas

In [None]:
import numpy

In [None]:
import pymysql
import sqlalchemy

In [None]:
import matplotlib

In [None]:
%load_ext sql

## ipython-sql

In [None]:
# This is really annoying. An update of SQLAlchemy, ipython-sql or pymysql seems to have broken the SQL Magic.
# This is a temporary fix/hack.
#
%config SqlMagic.style = '_DEPRECATED_DEFAULT'

In [None]:
# Make sure that these values match your installation and configuration of MySQL.
#
# The credentials are taken from the W4111_DB_USER / W4111_DB_PASSWORD environment
# variables when they are set, so a real password does not have to be saved in the
# notebook. When a variable is not set, the default written below is used; either
# export the variables or edit the defaults to match your setup.
#
import os

db_user = os.environ.get("W4111_DB_USER", "root")
db_password = os.environ.get("W4111_DB_PASSWORD", "dbuserdbuser")

In [None]:
# Create the URL for connecting to the database.
#
# The user name and password are percent-encoded so that characters with a special
# meaning inside a URL (for example "@", ":", "/" or "%") cannot corrupt it.
# Do not worry about local_infile=1; it was added for advanced reasons that you
# should not have to use.
#
from urllib.parse import quote

db_url = (
    f"mysql+pymysql://{quote(db_user, safe='')}:{quote(db_password, safe='')}"
    "@localhost?local_infile=1"
)

In [None]:
# Initialize ipython-sql
#
%sql $db_url

In [None]:
# Your answer will be different based on the databases that you have created on your local MySQL instance.
#
%sql use db_book;
%sql show tables;

## PyMySQL

In [None]:
# Open a direct PyMySQL connection to the MySQL server on localhost with the
# db_user / db_password credentials. No default database is passed here, so
# queries on this connection use schema-qualified table names.
default_mysql_conn = pymysql.connect(
    user=db_user,
    password=db_password,
    host="localhost",
    port=3306,
    # Cursors created from this connection use the dictionary cursor class.
    cursorclass=pymysql.cursors.DictCursor,
    # Autocommit is switched on for this connection.
    autocommit=True
)

In [None]:
# Run a sample query through the raw PyMySQL connection and show the rows as a DataFrame.
#
cur = default_mysql_conn.cursor()

# execute() only runs the statement; the rows themselves are read with fetchall().
cur.execute("select * from db_book.student where dept_name='Comp. Sci.';")
result = cur.fetchall()
result_df = pandas.DataFrame(result)
result_df

In [None]:
from sqlalchemy import create_engine
# SQLAlchemy engine built from db_url; passed as `con` to the pandas SQL calls in this notebook.
default_engine = create_engine(db_url)

In [None]:
# Run the sample student query through pandas, using the SQLAlchemy engine as the connection.
comp_sci_students_sql = "select * from db_book.student where dept_name='Comp. Sci.'"
result_df = pandas.read_sql(comp_sci_students_sql, con=default_engine)
result_df

# Load the Data

## Setup Project Database/Schema

<span style="color: red; font-size: 24px;"><b>Warning:<br>This will delete any previous work you have done in the project database!<br>Be sure you want to do this.</b></span>

In [None]:
%%sql

drop schema if exists s25_project;
create schema s25_project;

use s25_project;

## Initial Data Files

### Verify the Path and Files

In [None]:
%ls ./data

In [None]:
%ls ./data/IMDB

In [None]:
%ls ./data/GoT

### Load and Save the Data

In [None]:
base_data_directory = "./data/"

In [None]:
# The files to load and the tables to create from them.
#
# Each entry names the folder that holds the file, the CSV file itself and the
# table it is saved to. Every CSV file is named <table_name>.csv, so the entries
# are generated from (folder, table_name) pairs.
#
files_to_load = [
    {"folder": folder, "file_name": f"{table_name}.csv", "table_name": table_name}
    for folder, table_name in [
        ("GoT", "got_episodes"),
        ("IMDB", "name_basics"),
        ("IMDB", "name_basics_professions"),
        ("IMDB", "professions"),
        ("IMDB", "title_basics"),
        ("IMDB", "title_principals"),
        ("IMDB", "title_ratings"),
    ]
]

In [None]:
# Load a data file in CSV format and save it to a database table.
#
def load_and_save_csv(data_directory, file_name, schema, table_name):
    """
    Read one CSV file into a DataFrame and write it to a database table.

    The table is written through the notebook-level SQLAlchemy engine
    (default_engine). An existing table with the same name is replaced, and
    the DataFrame index is not written as a column.

    :param data_directory: Directory that contains the CSV file.
    :param file_name: Name of the CSV file inside data_directory.
    :param schema: Database schema that holds the target table.
    :param table_name: Name of the table to create or replace.
    :return: None. Progress messages are printed.
    """
    csv_path = "/".join((data_directory, file_name))
    print(f"Saving {csv_path} to table {schema}.{table_name}")

    frame = pandas.read_csv(csv_path)
    frame.to_sql(
        name=table_name,
        con=default_engine,
        schema=schema,
        if_exists="replace",
        index=False,
    )

    print("Saved.")

In [None]:
# Load and save the files.
#
# For every entry in files_to_load, the CSV is read from
# <base_data_directory><folder>/<file_name> and saved as table <table_name>
# in the s25_project schema.
#
for f in files_to_load:
    data_directory = base_data_directory + f["folder"]
    file_name = f["file_name"]
    schema = "s25_project"
    table_name = f["table_name"]
    load_and_save_csv(data_directory, file_name, schema, table_name)

### Verify Data Loading

In [None]:
%sql use s25_project

In [None]:
%sql select count(*) from got_episodes

In [None]:
%sql select count(*) from name_basics

In [None]:
%sql select count(*) from title_basics

In [None]:
%sql select count(*) from title_principals

In [None]:
%sql select count(*) from professions

In [None]:
%sql select count(*) from name_basics_professions

In [None]:
%sql select count(*) from title_ratings

# Sample Project Tasks

## Non-Programming Track

In [None]:
%%sql

actors_episodes_ratings << 
with one as (
    select * from title_principals where tconst in (select imdb_tconst from got_episodes)
),
    two as (
        select * from one join title_ratings using(tconst)
    ),
    three as (
        select * from name_basics join two using(nconst)
    ),
    four as (
        select nconst, primaryName, count(*) as no_of_episodes, avg(averageRating) as average_rating
            from three group by nconst, primaryName
    )
select * from four
    where no_of_episodes >= 10
    order by average_rating desc;

In [None]:
actors_episodes_ratings_df = actors_episodes_ratings.DataFrame()

In [None]:
actors_episodes_ratings_df

In [None]:
import matplotlib.pyplot as plt

# Bar chart of average_rating per primaryName from actors_episodes_ratings_df,
# drawn with the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(12, 6))

ax.bar(
    actors_episodes_ratings_df['primaryName'],
    actors_episodes_ratings_df['average_rating'],
    color='skyblue',
)

# Vertical x tick labels keep the actor names readable.
ax.tick_params(axis='x', labelrotation=90)

# Axis labels and title.
ax.set_xlabel('Actor Name')
ax.set_ylabel('IMDb Rating')
ax.set_title('IMDb Ratings by Actor')

# Keep the rotated labels from being cut off.
fig.tight_layout()

plt.show()


## Sample Programming Track Task

In [None]:
#
# Implement the "model" for the student data in the "db_book" database.
# Pydantic, FastAPI and OpenAPI documentation explains the concept.
#

# Import Pydantic types.
#
from typing import List, Union
from pydantic import BaseModel


# Import the definition of a Links section for a HATEOAS resource.
#
from interactive_app.application.resources.base_application_resource import Link


class Student(BaseModel):
    """
    The model/data transfer object for a single entry from the database book student table.
    """

    # Primary key.
    ID: str
    # NOTE(review): these attribute names differ from the dept_name column used in the
    # db_book.student queries in this notebook — presumably mapped elsewhere; confirm.
    last_name: str
    department_name: str
    total_credits: int

    class Config:

        # The sample response for OpenAPI docs.
        #
        json_schema_extra = {
            "example": {
                "ID": "00128",
                "last_name": "Zhang",
                "department_name": "Comp. Sci.",
                "total_credits": 102
            }
        }


class StudentRsp(BaseModel):
    """
    A class implementing a HATEOAS pattern for the response to GET /student/{ID}
    """

    # A data object with the Student information.
    data: Student

    # Links associated with the response.
    links: List[Link]

    class Config:
        # The sample response for OpenAPI docs: one student plus its links.
        json_schema_extra = {
            "example": {
                "data": {
                    "ID": "00128",
                    "last_name": "Zhang",
                    "department_name": "Comp. Sci.",
                    "total_credits": 102
                },
                "links": [
                    {"rel": "self", "href": "/api/students/00128"},

                    # TODO Change this over time to a department ID instead of name.
                    #
                    {"rel": "department", "href": "/api/departments/Comp. Sci."}
                ]
            }
        }


class StudentsRsp(BaseModel):
    """
    A list of students matching a query, plus links.
    """
    # One StudentRsp (student data plus its own links) per matching student.
    data: List[StudentRsp]

    # TODO Add links for pagination, etc.
    #
    # NOTE(review): no default is given, so this field may be required even though
    # None is an allowed value — confirm that is intended.
    links: Union[List, None]



In [None]:
from interactive_app.application.services.mysql_data_service import MySQLDataService, MySQLDataServiceConfig
from interactive_app.application.resources.imdb_resources.artist_resource import ArtistResource
import json
from interactive_app.application.service_factory import ServiceFactory

# Look up the artist resource by name from the application's service factory.
# The t1/t2 functions in this cell read artist_resource as a notebook-level name.
service_factory = ServiceFactory()
artist_resource = service_factory.get_resource("ArtistResource")


def t1():
    """
    Fetch the artist with key "nm0000293" from artist_resource and print it as
    indented JSON.
    """
    artist = artist_resource.get_by_key("nm0000293")
    artist_json = json.dumps(artist.model_dump(), indent=2)
    print("t1: result = \n", artist_json)


def t2():
    """
    Create an artist through artist_resource from a small dictionary and print
    whatever create() returns.
    """
    a_resource = artist_resource
    new_d = {"nconst": "xyzaa", "last_name": "Zaphod"}
    result = a_resource.create(new_d)
    # The label names this function so its output can be told apart from t1's.
    print("t2: result = \n", result)



In [None]:
t1()