In [1]:
# Alyssia Marshall
# Assignment 9: Data Gathering and Warehousing
# Date: 04/22/2025

# Import required libraries
import os  # For reading database settings from environment variables
from getpass import getpass  # For prompting for the password without echoing it
from urllib.parse import quote_plus  # For escaping credentials embedded in the connection URL

import mysql.connector  # For direct MySQL connection
import pandas as pd  # For handling and displaying data tables
from sqlalchemy import create_engine, text  # Used to run SQL queries from Python

# Database connection settings.
# The password is never stored in the notebook: it is read from the
# DS_SALARIES_DB_PASSWORD environment variable, and if that is unset or empty
# the user is prompted for it (input is not echoed).
DB_HOST = os.environ.get("DS_SALARIES_DB_HOST", "localhost")  # MySQL server address
DB_USER = os.environ.get("DS_SALARIES_DB_USER", "root")       # Username to access MySQL
DB_NAME = "ds_salaries"                                       # Existing database used by every query below
DB_PASSWORD = os.environ.get("DS_SALARIES_DB_PASSWORD") or getpass("MySQL password: ")

# Establish a direct connection to the MySQL server
conn = mysql.connector.connect(
    host=DB_HOST,
    user=DB_USER,
    password=DB_PASSWORD,
)

# Create a cursor to send SQL commands
cursor = conn.cursor()

# Use the existing database
cursor.execute(f"USE {DB_NAME};")
print(f"Using the '{DB_NAME}' database.")  # Confirm usage

# Build the SQLAlchemy connection URL. quote_plus() escapes characters such as
# '@', ':' or '/' in the credentials so they cannot corrupt the URL.
# NOTE: DATABASE_URL contains the password -- do not print or display it.
DATABASE_URL = (
    f"mysql+mysqlconnector://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}"
    f"@{DB_HOST}/{DB_NAME}"
)
engine = create_engine(DATABASE_URL)  # Create the engine

# create_engine() does not open a connection by itself, so run a trivial query
# to verify the credentials before reporting success.
with engine.connect() as probe:
    probe.execute(text("SELECT 1"))
print("SQLAlchemy connection established.")  # Confirm connection


Using the 'ds_salaries' database.
SQLAlchemy connection established.


#### Question 1: How can we categorize salary ranges?

In [2]:
# Categorize salaries into brackets with a CASE expression:
#   below 50,000             -> 'Low'
#   50,000-100,000 inclusive -> 'Medium'
#   above 100,000            -> 'High'
# Every bracket has an explicit condition. A NULL salary_in_usd fails all three
# comparisons, so it is labelled 'Unknown' instead of falling through to 'High'.
# The query joins `salaries` to `job_titles` so each row shows the job title
# next to the salary and its bracket. LIMIT 10 keeps the result to a preview.

query = """
SELECT
    s.salary_in_usd,
    j.title AS job_title,
    CASE
        WHEN s.salary_in_usd < 50000 THEN 'Low'
        WHEN s.salary_in_usd BETWEEN 50000 AND 100000 THEN 'Medium'
        WHEN s.salary_in_usd > 100000 THEN 'High'
        ELSE 'Unknown'
    END AS salary_bracket
FROM salaries s
JOIN job_titles j ON s.job_title = j.id
LIMIT 10;
"""

# Run the SQL query and load the result into a pandas DataFrame
df_q1 = pd.read_sql(text(query), engine)  # text() wraps the SQL string for SQLAlchemy
df_q1

Unnamed: 0,salary_in_usd,job_title,salary_bracket
0,85847,Principal Data Scientist,Medium
1,220000,Principal Data Scientist,High
2,155499,Principal Data Scientist,High
3,416000,Principal Data Scientist,High
4,173762,Principal Data Scientist,High
5,148261,Principal Data Scientist,High
6,235000,Principal Data Scientist,High
7,151000,Principal Data Scientist,High
8,30000,ML Engineer,Low
9,25500,ML Engineer,Low


#### Question 2: Identify NULL values in any table

In [3]:
# Precautionary NULL check on the `salaries` table: select every row where
# salary, salary_in_usd, or work_year is NULL. An empty result means none of
# these three columns contains a NULL.
query2 = """
SELECT *
FROM salaries
WHERE salary IS NULL OR salary_in_usd IS NULL OR work_year IS NULL;
"""
# Execute through the SQLAlchemy engine; pandas returns the rows as a DataFrame
df_q2 = pd.read_sql(sql=text(query2), con=engine)
df_q2.head()

Unnamed: 0,id,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size


#### Question 3: INSERT INTO a Table (add a new job title)

In [5]:
# Insert an intern position (test row with id 9999) into job_titles
query3 = """
INSERT INTO job_titles (id, title)
VALUES (9999, 'Machine Learning Engineer - Intern');
"""
# engine.begin() opens a transaction that commits on success and rolls back on error.
# The connection is bound to `db_conn` so it does not overwrite the
# mysql.connector connection stored in `conn` by the setup cell.
with engine.begin() as db_conn:
    # Remove any leftover test row first so this cell can be re-run without
    # raising a duplicate-key error or inserting the row twice.
    db_conn.execute(text("DELETE FROM job_titles WHERE id = 9999;"))
    db_conn.execute(text(query3))

# Confirm insertion
confirm_query = "SELECT * FROM job_titles WHERE id = 9999;"
# Run the SQL query and load the result into a pandas DataFrame
df_q3 = pd.read_sql(text(confirm_query), engine)  # text() wraps the SQL string for SQLAlchemy
df_q3

Unnamed: 0,id,title
0,9999,Machine Learning Engineer - Intern


#### Question 4: Find the maximum and minimum salaries

In [6]:
# Aggregate over the whole `salaries` table: the largest and smallest
# salary_in_usd, returned under the aliases highest_salary and lowest_salary.
query4 = """
SELECT 
    MAX(salary_in_usd) AS highest_salary,
    MIN(salary_in_usd) AS lowest_salary
FROM salaries;
"""
# Execute through the SQLAlchemy engine; the result is a one-row DataFrame
df_q4 = pd.read_sql(sql=text(query4), con=engine)
df_q4

Unnamed: 0,highest_salary,lowest_salary
0,450000,5132


#### Question 5: Use aliases to improve readability

In [7]:
# Aliases make the result easier to read:
#   - table aliases (s, jt, cs) shorten the join conditions
#   - column aliases (usd_salary, job_role, company_size) give descriptive headers
# `salaries` is joined to `job_titles` and `company_sizes` on their id columns.
query5 = """
SELECT 
    s.salary_in_usd AS usd_salary,
    jt.title AS job_role,
    cs.size AS company_size
FROM salaries s
JOIN job_titles jt ON s.job_title = jt.id
JOIN company_sizes cs ON s.company_size = cs.id
LIMIT 10;
"""
# Execute through the SQLAlchemy engine; pandas returns the rows as a DataFrame
df_q5 = pd.read_sql(sql=text(query5), con=engine)
df_q5

Unnamed: 0,usd_salary,job_role,company_size
0,85847,Principal Data Scientist,L
1,222200,Applied Scientist,L
2,136000,Applied Scientist,L
3,213660,Applied Scientist,L
4,130760,Applied Scientist,L
5,100000,Data Quality Analyst,L
6,30000,Compliance Data Analyst,L
7,20984,Machine Learning Engineer,L
8,204620,Applied Scientist,L
9,110680,Applied Scientist,L


#### Question 6: Export the results to a CSV file

In [8]:
# Write the Q1 salary-bracket DataFrame to a CSV file, omitting the index column
output_csv = "salary_brackets.csv"
df_q1.to_csv(output_csv, index=False)
print("Exported salary_brackets.csv")

Exported salary_brackets.csv


#### Question 7: Use the DELETE statement

In [9]:
# Remove the test intern job title (id 9999) inserted in Q3
query7 = """
DELETE FROM job_titles
WHERE id = 9999;
"""
# engine.begin() opens a transaction that commits on success and rolls back on error.
# The connection is bound to `db_conn` so it does not overwrite the
# mysql.connector connection stored in `conn` by the setup cell.
with engine.begin() as db_conn:
    db_conn.execute(text(query7))

# Confirm deletion: an empty DataFrame means the row is gone.
# The SQL is wrapped in text() like every other query in this notebook,
# rather than relying on pandas to accept a raw string.
confirm_delete = pd.read_sql(text("SELECT * FROM job_titles WHERE id = 9999;"), engine)
confirm_delete

Unnamed: 0,id,title


#### Question 8: Use of Subqueries

In [10]:
# Subquery demonstration: the inner SELECT (aliased `sub`) joins `salaries` to
# `job_titles` and exposes job_title and salary_in_usd; the outer query keeps
# only the rows whose salary_in_usd exceeds 150000 and previews ten of them.
query8 = """
SELECT *
FROM (
    SELECT j.title AS job_title, s.salary_in_usd
    FROM salaries s
    JOIN job_titles j ON s.job_title = j.id
) AS sub
WHERE salary_in_usd > 150000
LIMIT 10;
"""
# Execute through the SQLAlchemy engine; pandas returns the rows as a DataFrame
df_q8 = pd.read_sql(sql=text(query8), con=engine)
df_q8

Unnamed: 0,job_title,salary_in_usd
0,Data Scientist,175000
1,Applied Scientist,222200
2,Data Scientist,219000
3,Applied Scientist,213660
4,Data Scientist,170000
5,Research Engineer,275000
6,Research Engineer,174000
7,Analytics Engineer,230000
8,Business Intelligence Engineer,225000
9,Business Intelligence Engineer,156400
