In [1]:

from pathlib import Path
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip


#paths

NB_DIR = Path.cwd()                    
ROOT = NB_DIR.parent                    

print("Notebook directory:", NB_DIR)
print("Project root:", ROOT)

# Delta table paths
GOLD_DELTA = ROOT / "pipelines" / "gold" / "gold_delta"
CLUSTER_MACRO = ROOT / "clustering" / "cluster_macro_cleaned"

print("Gold Delta path:", GOLD_DELTA)
print("Macro Cluster Delta path:", CLUSTER_MACRO)

#spark
builder = (
    SparkSession.builder
        .appName("macro_eda")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .config("spark.driver.memory", "8g")
        .config("spark.sql.shuffle.partitions", "16")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()
print("Spark session initialised.")


#load delta
df_gold = spark.read.format("delta").load(str(GOLD_DELTA))
df_macro = spark.read.format("delta").load(str(CLUSTER_MACRO))

#gold has legacy macro_cluster column my bad
if "macro_cluster" in df_gold.columns:
    df_gold = df_gold.drop("macro_cluster")
    print("Removed legacy macro_cluster column from gold table.")

df_join = df_gold.join(df_macro, on="url", how="inner")
df_join.show(5, truncate=False)
df_join.count() 


Notebook directory: /home/david/School/CapStone/clustering
Project root: /home/david/School/CapStone
Gold Delta path: /home/david/School/CapStone/pipelines/gold/gold_delta
Macro Cluster Delta path: /home/david/School/CapStone/clustering/cluster_macro_cleaned


25/12/06 15:56:30 WARN Utils: Your hostname, david-ThinkPad-T490 resolves to a loopback address: 127.0.1.1; using 172.16.0.186 instead (on interface wlp0s20f3)
25/12/06 15:56:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/david/School/CapStone/.venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/david/.ivy2/cache
The jars for the packages stored in: /home/david/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d4fecd29-244f-4c06-a765-004cbc6ef5cb;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.1.0 in central
	found io.delta#delta-storage;3.1.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 1303ms :: artifacts dl 67ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.1.0 from central in [default]
	io.delta#delta-storage;3.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0 

Spark session initialised.


25/12/06 15:56:49 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

Removed legacy macro_cluster column from gold table.


25/12/06 15:57:11 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+------------------------------------------------------------------------------------------------------------+-------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

1949081

In [2]:
import pyspark.sql.functions as F

df_small = df_join.select("macro_final", "url", "text")

for mc in range(11):
    print(f"\n===== Cluster {mc} =====")
    (
        df_small
            .filter(F.col("macro_final") == mc)
            .orderBy(F.rand())      # fresh randomness, cluster-local shuffle
            .limit(10)
            .show(truncate=120)
    )



===== Cluster 0 =====


                                                                                

+-----------+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|macro_final|                                                                                                                     url|                                                                                                                    text|
+-----------+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|          0|https://www.reuters.com/article/brief-comtech-telecommunications-says-it/brief-comtech-telecommunications-says-its-en...|BRIEF-Comtech Telecommunications says its Enterprise Technologies group received a $3.7 million or

                                                                                

+-----------+-----------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|macro_final|                                                                                                                    url|                                                                                                                    text|
+-----------+-----------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|          1|                https://www.foxnews.com/transcript/critics-too-quick-to-criticize-president-trump-on-north-korea-summit|Critics too quick to criticize President Trump on North Korea summit?\n\nclose Video Gutfeld on the Tr

                                                                                

+-----------+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|macro_final|                                                                                                                     url|                                                                                                                    text|
+-----------+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|          2|                         https://www.cnbc.com/2020/01/29/coronavirus-impact-on-air-travel-a-concern-boeing-cfo-says.html|Coronavirus impact on air travel a concern, Boeing CFO says\n\nBoeing raised concerns on Wednesday

                                                                                

+-----------+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|macro_final|                                                                                                                     url|                                                                                                                    text|
+-----------+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|          3|                           https://www.cnbc.com/2019/04/22/disney-heiress-calls-bob-igers-total-compensation-insane.html|Disney heiress calls Bob Iger's total compensation 'insane'\n\nDisney CEO Bob Iger's $65.6 million

                                                                                

+-----------+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|macro_final|                                                                                                                     url|                                                                                                                    text|
+-----------+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|          4|https://www.reuters.com/article/us-guatemala-election-immigration/guatemalas-next-president-says-safe-third-country-d...|Guatemala's next president says 'safe third country' deal needs two congress ratifications\n\n(Reu

                                                                                

+-----------+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|macro_final|                                                                                                                     url|                                                                                                                    text|
+-----------+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|          5|           https://techcrunch.com/2016/05/27/althea-raises-3-5m-to-sell-korean-beauty-products-online-in-southeast-asia/|Althea raises $3.5M to sell Korean beauty products online in Southeast Asia – TechCrunch\n\nAlthea

                                                                                

+-----------+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|macro_final|                                                                                                                     url|                                                                                                                    text|
+-----------+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|          6|                https://thehill.com/policy/healthcare/312000-sanders-dem-leaders-urge-day-of-rallies-to-save-health-care|Sanders, Dem leaders urge day of rallies to 'save health care' | TheHill\n\nDemocratic leaders are

                                                                                

+-----------+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|macro_final|                                                                                                                     url|                                                                                                                    text|
+-----------+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|          7|https://www.reuters.com/article/axon-deepfakes/axon-boosts-encryption-weighs-blockchain-to-tackle-body-cam-deepfakes-...|Axon boosts encryption, weighs blockchain to tackle body-cam 'deepfakes'\n\nSAN FRANCISCO (Reuters

                                                                                

+-----------+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|macro_final|                                                                                                                     url|                                                                                                                    text|
+-----------+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|          8|                                              https://www.nytimes.com/2019/02/08/crosswords/daily-puzzle-2019-02-09.html|Orange Ball\n\nDAILY CROSSWORD COLUMN Ryan McCarty lets us indulge but makes us exercise. Everyone

                                                                                

+-----------+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|macro_final|                                                                                                                     url|                                                                                                                    text|
+-----------+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|          9|https://www.reuters.com/article/eurozone-economy-moneysupply/euro-zone-lending-holds-steady-m3-jumps-in-dec-ecb-idUSE...|Euro zone lending holds steady, M3 jumps in Dec: ECB\n\nFRANKFURT, Jan 28 (Reuters) - Corporate le

                                                                                

+-----------+---+----+
|macro_final|url|text|
+-----------+---+----+
+-----------+---+----+



### Cluster Stability

*this admittedly is a crude way to test cluster stability but effectively, im drawing samples from clusters at random, then feeding them to seperate llms to see if over the random samples themes remain broadly consistent, and then getting an average theme over the samples*

-samples are stored in /sample articles for reproduciblity.

-LLM models share no context between samples reducing bias

-multiple LLM models are chatgpt5.1 claude sonnet 4 and grok

**prompt used**

"""You are given a set of sample news articles (URL + text). Your task is to infer coherent thematic clusters from the sample and assign each cluster a simple, high-level name. PROCESS: 1. Read the articles. 2. Identify the natural groupings (clusters) based on shared themes. 3. Assign each cluster: - a numerical cluster ID (0,1,2,3,...) - a short, clear theme name (e.g., “Tech News”, “Politics”, “Finance & Markets”) 4. Produce a single Markdown table with: | Cluster | Theme | RULES: - Do NOT provide explanations. - Do NOT describe the articles individually. - Do NOT add sentences outside the table. - Keep cluster names short (2–5 words). - Only output the Markdown table. Here are the sample articles: [PASTE ARTICLES HERE]:"""


| Cluster | Sample 1 Theme (GPT 5.1)               | Sample 2 Theme (Claude Sonnet 4.5)    | Sample 3 Theme (Grok 2)              | Final One-Word Theme |
|---------|-----------------------------------------|----------------------------------------|---------------------------------------|------------------------|
| 0       | Corporate Briefs & Business Updates     | Corporate Earnings & Finance           | Corporate Finance & Deals             | Earnings               |
| 1       | Politics, Society & Public Affairs      | Politics & International Affairs       | US Domestic Politics                  | Politics               |
| 2       | Global Macro, Trade & Geopolitics       | Trade & Geopolitics                    | International Relations & Security    | Geopolitics            |
| 3       | Tech, Industry & Corporate Moves        | Business & Markets                     | Company Operations & Results          | Operations             |
| 4       | World Events, Crime & Human Interest    | International Politics & Policy        | Global Miscellaneous News             | World                  |
| 5       | Deals, Media & Consumer Tech            | Corporate Strategy & Tech              | Tech & Consumer Products              | Tech                   |
| 6       | U.S. Politics & Culture News            | US Politics & Elections                | US Healthcare & Political Campaigns   | USPolitics             |
| 7       | Law, Rights, Governance & Tech Policy   | Legal & Government Affairs             | Legal System & Justice Issues         | Legal                  |
| 8       | Personal Finance, Lifestyle & Culture   | Lifestyle & Economy                    | Lifestyle & Culture                   | Lifestyle              |
| 9       | Markets, Economy & Financial Commentary | Energy & Economic Indicators           | Markets & Economic Updates            | Markets                |


In [3]:
from pyspark.sql import functions as F

print("### Macro Cluster Size Distribution ###")
(
    df_join.groupBy("macro_final")
           .count()
           .orderBy("macro_final")
           .show(50)
)

stage1_pca = [f"pca_{i}" for i in range(50)]

print("### PCA Feature Means per Macro Cluster ###")
pca_means = (
    df_join.groupBy("macro_final")
           .agg(*[F.avg(c).alias(c) for c in stage1_pca])
           .orderBy("macro_final")
)

pca_means.show(20, truncate=False)

lm_cols = ["lm_pos", "lm_neg", "lm_unc", "lm_lit", "lm_con"]

print("### LM Lexicon Means per Macro Cluster ###")
lm_means = (
    df_join.groupBy("macro_final")
           .agg(*[F.avg(c).alias(f"avg_{c}") for c in lm_cols])
           .orderBy("macro_final")
)

lm_means.show(20, truncate=False)

ner_cols = ["ner_org", "ner_gpe", "ner_person", "ner_money"]

print("### NER Means per Macro Cluster ###")
ner_means = (
    df_join.groupBy("macro_final")
           .agg(*[F.avg(c).alias(f"avg_{c}") for c in ner_cols])
           .orderBy("macro_final")
)

ner_means.show(30, truncate=False)

basic_cols = ["len_chars_gold", "num_words", "avg_word_len"]

print("### Basic Text Statistics per Macro Cluster ###")
basic_means = (
    df_join.groupBy("macro_final")
           .agg(*[F.avg(c).alias(f"avg_{c}") for c in basic_cols])
           .orderBy("macro_final")
)

basic_means.show(20, truncate=False)


### Macro Cluster Size Distribution ###


                                                                                

+-----------+------+
|macro_final| count|
+-----------+------+
|         -1|389729|
|          0|214195|
|          1| 67298|
|          2|122231|
|          3|181879|
|          4|158493|
|          5|173369|
|          6|200545|
|          7|136944|
|          8|188747|
|          9|115651|
+-----------+------+

### PCA Feature Means per Macro Cluster ###


                                                                                

+-----------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+----------------------+---------------------+----------------------+----------------------+---------------------+---------------------+---------------------+---------------------+----------------------+----------------------+---------------------+---------------------+----------------------+---------------------+---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+---------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+---------------------+----------------------+----------------------+----------------------+----------------------+----------------------+---------------------+----------------------+-------------------

                                                                                

+-----------+-------------------+------------------+-------------------+-------------------+-------------------+
|macro_final|avg_lm_pos         |avg_lm_neg        |avg_lm_unc         |avg_lm_lit         |avg_lm_con         |
+-----------+-------------------+------------------+-------------------+-------------------+-------------------+
|-1         |3.1352863143363723 |8.286606847322114 |2.6615083814650693 |1.8365171696229945 |0.7577804063849496 |
|0          |0.8176754826209762 |1.269987628095894 |0.6916921496766965 |0.16227736408412896|0.0993067065057541 |
|1          |9.870456774346934  |34.135234330886504|9.611221730214865  |8.643050313530862  |3.724924960622901  |
|2          |3.01670607292749   |9.711202559088939 |2.8893488558548976 |1.2485621487184102 |0.9905752223249421 |
|3          |0.47997844720940847|1.135639628544252 |0.45945931086051717|0.3443938002738084 |0.12509965416568158|
|4          |1.797953221908854  |6.896941820774419 |1.858498482582827  |1.1296397948174368 |0.55

                                                                                

+-----------+------------------+------------------+------------------+-------------+
|macro_final|avg_ner_org       |avg_ner_gpe       |avg_ner_person    |avg_ner_money|
+-----------+------------------+------------------+------------------+-------------+
|-1         |6.782120396480631 |4.524533714452863 |6.445614773342502 |0.0          |
|0          |5.7720208221480425|1.4280305329256051|1.8682695674502205|0.0          |
|1          |5.411111771523671 |4.854973401884157 |8.238149722131416 |0.0          |
|2          |5.909441958259362 |10.920560250672906|6.2246811365365575|0.0          |
|3          |6.754677560356061 |1.3809730645099214|1.776807657838453 |0.0          |
|4          |5.430050538509587 |6.8182001728782975|4.9415116125002365|0.0          |
|5          |16.13961550219474 |2.963130663498088 |3.2807133916674838|0.0          |
|6          |6.627415293325687 |4.3941758707522   |16.0264279837443  |0.0          |
|7          |7.258390290921836 |4.419346594228298 |9.923209487089



+-----------+------------------+------------------+------------------+
|macro_final|avg_len_chars_gold|avg_num_words     |avg_avg_word_len  |
+-----------+------------------+------------------+------------------+
|-1         |3329.687205725004 |550.493650716267  |6.132765019154388 |
|0          |903.3293120754453 |148.17073227666378|6.206314920030655 |
|1          |11658.862209872508|1918.3025795714582|6.133500816375941 |
|2          |3136.109145797711 |509.1822614557682 |6.1785388717979535|
|3          |695.8893989960359 |110.9828787270658 |6.36252635663045  |
|4          |2321.459383064236 |379.5928085152026 |6.1504310531291555|
|5          |2491.50907601705  |403.92361379485374|6.183223271566949 |
|6          |2854.320486673814 |468.1168266473859 |6.117843365280271 |
|7          |4124.493179693889 |662.5729203177941 |6.247410177159705 |
|8          |5505.985324270055 |918.3708244369446 |6.007609427075451 |
|9          |2471.1187451902706|406.15622000674443|6.1057655726412925|
+-----

                                                                                