In [1]:
# Step 1: Import libraries
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from dotenv import load_dotenv

# %%
# Step 2: Load environment variables
load_dotenv()

pg_user = os.environ['PG_USER']
pg_password = os.environ['PG_PASSWORD']
pg_host = os.environ['PG_HOST']
pg_port = os.environ['PG_PORT']
pg_db = os.environ['PG_DB']

In [2]:
# Step 3: Create engine
from sqlalchemy import create_engine

engine = create_engine(
    f'postgresql+psycopg2://{pg_user}:{pg_password}@{pg_host}:{pg_port}/{pg_db}'
)

## Diagnostic

### Business Question
Which industries demonstrate the greatest improvement in phishing training success from month 0 to month 12, and how do they rank?


In [4]:
query1 = '''
WITH improvement_cte AS (
    SELECT 
        i.industry_name,
        its.month_0,
        its.month_12,
        (its.month_12 - its.month_0) AS improvement
    FROM raw.industry_training_success its
    JOIN raw.dim_industry i ON its.industry_id = i.industry_id
),
ranked_improvements AS (
    SELECT *,
        RANK() OVER (ORDER BY improvement DESC) AS rank_by_improvement
    FROM improvement_cte
)
SELECT *
FROM ranked_improvements
ORDER BY rank_by_improvement
'''
df_q1 = pd.read_sql(query1, con=engine)
df_q1


Unnamed: 0,industry_name,month_0,month_12,improvement,rank_by_improvement
0,Financial Services,48,74,26,1
1,"Legal, professional, business services",40,61,21,2
2,Oil & energy,47,67,20,3
3,Retail,48,67,19,4
4,"Manufacturing, construction",53,69,16,5
5,Global Success rate,55,70,15,6
6,Government,49,64,15,6
7,"Logistics, supply chain",57,70,13,8
8,"IT, software, internet",54,66,12,9
9,Pharma & healthcare,52,62,10,10


### Insight
The *Financial Services* industry experienced the greatest improvement in phishing training success over a 12-month period, increasing from 48% to 74% — a 26-point gain. Meanwhile, *Pharma & healthcare* showed the least progress, improving only 10 points. This highlights a significant disparity in training effectiveness across industries.

### Recommendation
Prioritize additional support and tailored training for industries with lower improvement rates such as *Pharma & healthcare* and *IT, software, internet*. Investigate possible causes like engagement, delivery methods, or content relevance to enhance outcomes.

### Prediction
If current patterns hold, top-performing industries may stabilize, while lower-performing sectors risk stagnation or decline unless proactive measures are taken. Focused interventions could help bridge the performance gap in the coming year.

### Descriptive

### Business Question
Which departments perform the best and worst in phishing training based on success, miss, and fail rates?


In [5]:
query2 = '''
WITH performance_cte AS (
    SELECT 
        d.department_name,
        j.success_rate_percent,
        j.miss_rate_percent,
        j.fail_rate_percent
    FROM raw.job_role_training_performance j
    JOIN raw.dim_department d ON j.department_id = d.department_id
),
ranked_departments AS (
    SELECT *,
        RANK() OVER (ORDER BY success_rate_percent DESC) AS rank_success,
        RANK() OVER (ORDER BY fail_rate_percent ASC) AS rank_fail
    FROM performance_cte
)
SELECT *
FROM ranked_departments
ORDER BY rank_success
'''

df_q2 = pd.read_sql(query2, con=engine)
df_q2

Unnamed: 0,department_name,success_rate_percent,miss_rate_percent,fail_rate_percent,rank_success,rank_fail
0,Marketing,73,25,2.4,1,3
1,Other,72,25,2.4,2,3
2,Information technology,70,28,2.3,3,1
3,Finance,68,30,2.9,4,8
4,Communications,67,31,2.3,5,1
5,Human resources,67,31,2.8,5,6
6,Sales,66,31,2.8,7,6
7,Customer relationship,65,33,2.7,8,5
8,Information security,65,32,3.0,8,9
9,Legal,64,32,3.8,10,12


In [8]:
query2 = '''
WITH dept_avg_cte AS (
    SELECT 
        d.department_name,
        j.fail_rate_percent
    FROM raw.job_role_training_performance j
    JOIN raw.dim_department d 
        ON j.department_id = d.department_id
),
overall_avg_cte AS (
    SELECT 
        AVG(fail_rate_percent) AS avg_fail_rate
    FROM dept_avg_cte
),
fail_rate_deviation AS (
    SELECT 
        d.*,
        o.avg_fail_rate,
        (d.fail_rate_percent - o.avg_fail_rate) AS deviation_from_avg
    FROM dept_avg_cte d
    CROSS JOIN overall_avg_cte o
),
ranked_failures AS (
    SELECT *,
        RANK() OVER (ORDER BY deviation_from_avg DESC) AS rank_by_deviation
    FROM fail_rate_deviation
)
SELECT *
FROM ranked_failures
ORDER BY rank_by_deviation DESC;
'''
df_q2 = pd.read_sql(query2, con=engine)
pd.set_option('display.max_rows', None)
df_q2

Unnamed: 0,department_name,fail_rate_percent,avg_fail_rate,deviation_from_avg,rank_by_deviation
0,Information technology,2.3,2.816667,-0.516667,11
1,Communications,2.3,2.816667,-0.516667,11
2,Marketing,2.4,2.816667,-0.416667,9
3,Other,2.4,2.816667,-0.416667,9
4,Customer relationship,2.7,2.816667,-0.116667,8
5,Sales,2.8,2.816667,-0.016667,6
6,Human resources,2.8,2.816667,-0.016667,6
7,Finance,2.9,2.816667,0.083333,5
8,Information security,3.0,2.816667,0.183333,4
9,Business development,3.2,2.816667,0.383333,2


### Insight
The **Legal** department has the highest deviation from the average phishing failure rate at **+0.98**, making it the most vulnerable group. On the other hand, **Information Technology** and **Communications** have the lowest deviation (**-0.52**), indicating they are performing significantly better than average.

### Recommendation
Focus additional training and awareness campaigns on departments like **Legal**, **Software Engineering**, and **Business Development**, as they have the highest failure deviations. Tailored training could help reduce their phishing susceptibility and improve overall security posture.

### Prediction
If left unaddressed, departments with high deviation from the average fail rate will likely continue to be weak points in the organization's phishing defense, potentially leading to more successful phishing incidents. Remediation efforts can reduce this risk over time.
