In [25]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import os
import sys

sys.path.append('..')
from utils.evaluation_utils import evaluate_retrieval_metrics, moral_diversity_by_model, alternative_voices_diversity_by_model, calculate_activation_diversity, perspective_pluralism_by_model, calculate_activation_diversity_reference, moral_diversity_reference, perspective_pluralism_reference, alternative_voices_diversity_reference

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Immigration news evaluation

In [2]:
# Topic key; it is interpolated into the retrieved/reference CSV paths read below.
topic = 'immigration'

In [3]:
# Read the articles retrieved by the LLMs for this topic, then keep only the rows
# that actually have article text.
retrieved_csv = f'news/retrieved/step3_{topic}.csv'
llm_news = pd.read_csv(retrieved_csv)
llm_news = llm_news.dropna(subset=['text'])

In [4]:
# Preview the retrieved articles (one row per article, with a `model` column).
llm_news

Unnamed: 0,title,date,source,url,model,text
0,Concerns grow over dire conditions in immigran...,2025-06-30,lasvegassun.com,https://lasvegassun.com/news/2025/jun/30/conce...,gpt-4o,"Far from public view, the toll of the Trump ad..."
1,Trump likely to visit 'Alligator Alcatraz' mig...,2025-06-30,thehindu.com,https://www.thehindu.com/news/international/tr...,gpt-4o,U.S. President Donald Trump is expected to att...
2,Analysis: Trump’s visit to a migrant camp call...,2025-07-01,cnn.com,https://edition.cnn.com/2025/07/01/politics/al...,gpt-4o,The name alone surely was enough to lure Donal...
3,Democratic governors spar with US Congress Rep...,2025-06-12,reuters.com,https://www.reuters.com/world/us/democratic-go...,gpt-4o,"WASHINGTON, June 12 (Reuters) - Democratic gov..."
4,The Human Cost of the Trump Administration’s I...,2025-05-19,humanrightsfirst.org,https://humanrightsfirst.org/library/the-human...,gpt-4o,"by Ellie Conover, Communications Intern\n\nThi..."
...,...,...,...,...,...,...
158,ICE Agents Chase After Farmworkers as They Fle...,2025-06-13,Ground News,https://ground.news/article/ice-expands-immigr...,qwen3-30b-a3b,Faced with the raids of the immigration police...
159,Immigrants abandon Dems to support GOP immigra...,2025-06-13,foxnews,https://www.foxnews.com/politics/immigrants-ab...,qwen3-30b-a3b,It appears that the group of American voters w...
160,"Americas Migration Brief - June 9, 2025",2025-06-09,Migration Brief,https://www.migrationbrief.com/p/americas-migr...,qwen3-30b-a3b,“President Trump’s new travel ban appears devi...
161,ICE Arrests Teenager Eating Lunch Weeks After ...,2025-06-12,Newsweek,https://www.newsweek.com/ice-arrests-teenager-...,qwen3-30b-a3b,Federal immigration agents allegedly arrested ...


### Retrieval metrics

In [5]:
# Settings for the retrieval evaluation, named so they are easy to find and tune.
# NOTE(review): presumably articles are expected to be dated within `window_days`
# of `experiment_date`, and `num_articles` is the number requested per model —
# confirm against utils.evaluation_utils.
retrieval_settings = {
    'experiment_date': '2025-07-01',
    'window_days': 30,
    'num_articles': 10,
}

# Per-model retrieval metrics for the retrieved articles.
retrieval_metrics = evaluate_retrieval_metrics(llm_news, **retrieval_settings)

In [6]:
# Show the per-model retrieval metrics table.
retrieval_metrics

Unnamed: 0,model,num_articles_retrieved,valid_url_rate,temporal_compliance_rate,source_variety
0,gpt-4o,9,0.9,0.8,9
1,gpt-4o-mini,10,1.0,0.7,8
2,gpt-4-1,10,1.0,0.7,7
3,gpt-4-1-mini,10,1.0,0.9,9
4,claude-3.7-sonnet,10,1.0,0.8,8
5,claude-sonnet-4,10,1.0,0.9,9
6,gemini-2.5-flash,10,1.0,0.7,7
7,gemini-2.5-pro,10,1.0,1.0,7
8,deepseek-r1-0528,10,1.0,0.9,9
9,deepseek-chat-v3-0324,8,0.8,0.6,5


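We used the following code to correct some dates. For the Newsweek article below, the first non-missing date recorded for its URL is copied to every row that shares that URL:

```python
mask = llm_news['url'] == 'https://www.newsweek.com/ice-arrests-teenager-lunch-weeks-graduation-2084615'
correct_date = llm_news.loc[mask, 'date'].dropna().iloc[0]
llm_news.loc[mask, 'date'] = correct_date
```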

### Diversity metrics

In [7]:
# Load the reference corpus for this topic.
reference_news = pd.read_csv(f'news/reference/{topic}_news.csv')

# Drop rows whose text contains the 'This may take a few second' message
# (presumably a loading placeholder captured instead of the article — TODO confirm).
# Rows with missing text are kept (na=False).
has_placeholder_text = reference_news['article_text'].str.contains('This may take a few second', na=False)

# reset_index() without drop=True keeps the original row labels in an 'index' column.
reference_news = reference_news[~has_placeholder_text].reset_index()

In [8]:
# Preview the filtered reference corpus.
reference_news

Unnamed: 0,index,title,date,url,source,article_text
0,0,"From San Diego to the Bay Area, California res...","Thu, 19 Jun 2025 07:00:00 GMT",https://calmatters.org/economy/2025/06/califor...,https://calmatters.org,Brandon Mejia usually spends his weekends cond...
1,1,"The Immigration Court System, Explained - Bren...","Tue, 24 Jun 2025 18:13:48 GMT",https://www.brennancenter.org/our-work/researc...,https://www.brennancenter.org,View the entire Explainers collection\n\nImmig...
2,2,New Immigration Policies Will Increase Prices ...,"Wed, 25 Jun 2025 16:17:12 GMT",https://www.fwd.us/news/new-immigration-polici...,https://www.fwd.us,"We built a multisector, general equilibrium mo..."
3,3,Restricting The Entry of Foreign Nationals to ...,"Wed, 04 Jun 2025 07:00:00 GMT",https://www.whitehouse.gov/presidential-action...,https://www.whitehouse.gov,BY THE PRESIDENT OF THE UNITED STATES OF AMERI...
4,4,"From Day One, Trump's Immigration Agenda Has G...","Fri, 06 Jun 2025 07:00:00 GMT",https://www.aclu.org/news/immigrants-rights/fr...,https://www.aclu.org,Four months into President Donald Trump’s seco...
...,...,...,...,...,...,...
76,84,Immigration Raids Add to Absence Crisis for Sc...,"Mon, 16 Jun 2025 07:00:00 GMT",https://www.nytimes.com/2025/06/16/us/immigrat...,https://www.nytimes.com,"As President Trump promised mass deportations,..."
77,85,Catholic Bishops Oppose Trump’s Immigration Ef...,"Sun, 29 Jun 2025 09:00:55 GMT",https://www.nytimes.com/2025/06/29/us/catholic...,https://www.nytimes.com,As the Trump administration escalates its aggr...
78,86,Here's where to get help if you're affected by...,"Mon, 23 Jun 2025 12:00:00 GMT",https://laist.com/news/politics/resource-guide...,https://laist.com,Only 7% of LAist readers currently donate to f...
79,87,The Trump administration is making an unpreced...,"Tue, 24 Jun 2025 09:00:00 GMT",https://www.npr.org/2025/06/24/nx-s1-5423604/t...,https://www.npr.org,The Trump administration's push to rapidly ama...


In [None]:
# Baseline diversity metrics computed on the reference corpus alone.
# NOTE(review): `num_articles` / `n_subsets` suggest repeated sampling of 10-article
# subsets; no random seed is set in this notebook — confirm in utils.evaluation_utils
# whether these results are reproducible across runs.
reference_sampling = {'num_articles': 10, 'n_subsets': 10}

activation_df = calculate_activation_diversity_reference(reference_news, **reference_sampling)
perspective_df = perspective_pluralism_reference(reference_news, threshold=85, **reference_sampling)
alternative_voices_df = alternative_voices_diversity_reference(reference_news, threshold=80, **reference_sampling)
moral_diversity_df = moral_diversity_reference(reference_news, metric='cosine', **reference_sampling)

In [9]:
# Re-display the retrieved articles that feed the per-model diversity metrics below.
llm_news

Unnamed: 0,title,date,source,url,model,text
0,Concerns grow over dire conditions in immigran...,2025-06-30,lasvegassun.com,https://lasvegassun.com/news/2025/jun/30/conce...,gpt-4o,"Far from public view, the toll of the Trump ad..."
1,Trump likely to visit 'Alligator Alcatraz' mig...,2025-06-30,thehindu.com,https://www.thehindu.com/news/international/tr...,gpt-4o,U.S. President Donald Trump is expected to att...
2,Analysis: Trump’s visit to a migrant camp call...,2025-07-01,cnn.com,https://edition.cnn.com/2025/07/01/politics/al...,gpt-4o,The name alone surely was enough to lure Donal...
3,Democratic governors spar with US Congress Rep...,2025-06-12,reuters.com,https://www.reuters.com/world/us/democratic-go...,gpt-4o,"WASHINGTON, June 12 (Reuters) - Democratic gov..."
4,The Human Cost of the Trump Administration’s I...,2025-05-19,humanrightsfirst.org,https://humanrightsfirst.org/library/the-human...,gpt-4o,"by Ellie Conover, Communications Intern\n\nThi..."
...,...,...,...,...,...,...
158,ICE Agents Chase After Farmworkers as They Fle...,2025-06-13,Ground News,https://ground.news/article/ice-expands-immigr...,qwen3-30b-a3b,Faced with the raids of the immigration police...
159,Immigrants abandon Dems to support GOP immigra...,2025-06-13,foxnews,https://www.foxnews.com/politics/immigrants-ab...,qwen3-30b-a3b,It appears that the group of American voters w...
160,"Americas Migration Brief - June 9, 2025",2025-06-09,Migration Brief,https://www.migrationbrief.com/p/americas-migr...,qwen3-30b-a3b,“President Trump’s new travel ban appears devi...
161,ICE Arrests Teenager Eating Lunch Weeks After ...,2025-06-12,Newsweek,https://www.newsweek.com/ice-arrests-teenager-...,qwen3-30b-a3b,Federal immigration agents allegedly arrested ...


#### Morality

In [10]:
# Per-model moral diversity of the retrieved articles; the result also carries a
# reference value (`diversity_ref`) derived from the reference corpus.
moral_metrics = moral_diversity_by_model(llm_news, reference_news)



#### Alternative voices

In [11]:
# Per-model alternative-voices diversity, with the reference-corpus value in
# `voice_diversity_ref`.
alternativevoice_metrics = alternative_voices_diversity_by_model(llm_news, reference_news)

#### Activation

In [12]:
# Per-model activation diversity and mean activation, each with a reference-corpus
# counterpart (`*_ref` columns).
activation_metrics = calculate_activation_diversity(llm_news, reference_news)

#### Representation

In [13]:
# Per-model perspective pluralism (mean/std of perspectives), with reference-corpus
# counterparts in the `*_ref` columns.
representation_metrics = perspective_pluralism_by_model(llm_news, reference_news)

## Visualization

In [14]:
# Retrieval metrics per model (same table as computed in the retrieval section).
retrieval_metrics

Unnamed: 0,model,num_articles_retrieved,valid_url_rate,temporal_compliance_rate,source_variety
0,gpt-4o,9,0.9,0.8,9
1,gpt-4o-mini,10,1.0,0.7,8
2,gpt-4-1,10,1.0,0.7,7
3,gpt-4-1-mini,10,1.0,0.9,9
4,claude-3.7-sonnet,10,1.0,0.8,8
5,claude-sonnet-4,10,1.0,0.9,9
6,gemini-2.5-flash,10,1.0,0.7,7
7,gemini-2.5-pro,10,1.0,1.0,7
8,deepseek-r1-0528,10,1.0,0.9,9
9,deepseek-chat-v3-0324,8,0.8,0.6,5


In [15]:
# Export the retrieval metrics as a LaTeX tabular. As a bare expression this shows
# the Python string repr (escaped backslashes and '\n'); wrap it in print() to get
# copy-paste-ready LaTeX.
retrieval_metrics.to_latex()

'\\begin{tabular}{llrrrr}\n\\toprule\n & model & num_articles_retrieved & valid_url_rate & temporal_compliance_rate & source_variety \\\\\n\\midrule\n0 & gpt-4o & 9 & 0.900000 & 0.800000 & 9 \\\\\n1 & gpt-4o-mini & 10 & 1.000000 & 0.700000 & 8 \\\\\n2 & gpt-4-1 & 10 & 1.000000 & 0.700000 & 7 \\\\\n3 & gpt-4-1-mini & 10 & 1.000000 & 0.900000 & 9 \\\\\n4 & claude-3.7-sonnet & 10 & 1.000000 & 0.800000 & 8 \\\\\n5 & claude-sonnet-4 & 10 & 1.000000 & 0.900000 & 9 \\\\\n6 & gemini-2.5-flash & 10 & 1.000000 & 0.700000 & 7 \\\\\n7 & gemini-2.5-pro & 10 & 1.000000 & 1.000000 & 7 \\\\\n8 & deepseek-r1-0528 & 10 & 1.000000 & 0.900000 & 9 \\\\\n9 & deepseek-chat-v3-0324 & 8 & 0.800000 & 0.600000 & 5 \\\\\n10 & magistral-small-2506 & 8 & 0.800000 & 0.500000 & 8 \\\\\n11 & magistral-medium-2506 & 10 & 1.000000 & 0.700000 & 8 \\\\\n12 & llama-3.3-70b-instruct & 9 & 0.900000 & 0.700000 & 5 \\\\\n13 & llama-4-maverick & 10 & 1.000000 & 0.700000 & 7 \\\\\n14 & llama-4-scout & 9 & 0.900000 & 0.600000 & 6

In [16]:
# Moral diversity per model next to the reference value.
moral_metrics

Unnamed: 0,model,diversity,diversity_ref
0,gpt-4o,0.222389,0.105427
1,gpt-4o-mini,0.186818,0.105427
2,gpt-4-1,0.178064,0.105427
3,gpt-4-1-mini,0.277225,0.105427
4,claude-3.7-sonnet,0.168737,0.105427
5,claude-sonnet-4,0.251688,0.105427
6,gemini-2.5-flash,0.204333,0.105427
7,gemini-2.5-pro,0.208141,0.105427
8,deepseek-r1-0528,0.274044,0.105427
9,deepseek-chat-v3-0324,0.18114,0.105427


In [17]:
# Alternative-voices diversity per model next to the reference value.
alternativevoice_metrics


Unnamed: 0,model,voice_diversity,voice_diversity_ref
0,gpt-4o,0.456747,0.433884
1,gpt-4o-mini,0.48,0.433884
2,gpt-4-1,0.486993,0.433884
3,gpt-4-1-mini,0.484429,0.433884
4,claude-3.7-sonnet,0.5,0.433884
5,claude-sonnet-4,0.484429,0.433884
6,gemini-2.5-flash,0.5,0.433884
7,gemini-2.5-pro,0.4992,0.433884
8,deepseek-r1-0528,0.484429,0.433884
9,deepseek-chat-v3-0324,0.46875,0.433884


In [19]:
# Perspective pluralism (mean/std) per model next to the reference values.
representation_metrics

Unnamed: 0,model,mean_perspectives,std_perspectives,mean_perspectives_ref,std_perspectives_ref
0,gpt-4o,7.555556,4.474896,6.0,3.49285
1,gpt-4o-mini,8.8,3.6,6.0,3.49285
2,gpt-4-1,8.7,3.634556,6.0,3.49285
3,gpt-4-1-mini,6.9,4.570558,6.0,3.49285
4,claude-3.7-sonnet,8.1,4.158125,6.0,3.49285
5,claude-sonnet-4,7.2,4.664762,6.0,3.49285
6,gemini-2.5-flash,8.7,4.00125,6.0,3.49285
7,gemini-2.5-pro,8.0,4.857983,6.0,3.49285
8,deepseek-r1-0528,7.0,4.626013,6.0,3.49285
9,deepseek-chat-v3-0324,9.125,3.407253,6.0,3.49285


In [20]:
# Activation diversity and mean activation per model next to the reference values.
activation_metrics

Unnamed: 0,model,activation_diversity,activation_diversity_ref,mean_activation,mean_activation_ref
0,gpt-4o,0.045968,0.042848,0.065231,0.057838
1,gpt-4o-mini,0.039502,0.042848,0.058236,0.057838
2,gpt-4-1,0.040603,0.042848,0.056617,0.057838
3,gpt-4-1-mini,0.038183,0.042848,0.055811,0.057838
4,claude-3.7-sonnet,0.021415,0.042848,0.034253,0.057838
5,claude-sonnet-4,0.035626,0.042848,0.059397,0.057838
6,gemini-2.5-flash,0.033088,0.042848,0.054714,0.057838
7,gemini-2.5-pro,0.030364,0.042848,0.050165,0.057838
8,deepseek-r1-0528,0.02632,0.042848,0.046851,0.057838
9,deepseek-chat-v3-0324,0.042934,0.042848,0.059226,0.057838


In [None]:
from functools import reduce

# One (model, metric) frame per diversity dimension. Every frame must carry the
# 'model' key, otherwise the merge below has nothing to join on.
dfs = [
    activation_metrics[['model', 'activation_diversity']],
    alternativevoice_metrics[['model', 'voice_diversity']],
    representation_metrics[['model', 'mean_perspectives']],
    # Use the per-model 'diversity' score, not the constant reference value
    # 'diversity_ref'. The previous selection also omitted 'model', which makes
    # pd.merge(..., on='model') raise a KeyError.
    moral_metrics[['model', 'diversity']],
]

# Inner-join all frames on 'model', so only models present in every metric table
# remain. The resulting column order (model, activation, voices, perspectives,
# moral) is what the positional rename in the next cell relies on.
df_final = reduce(lambda left, right: pd.merge(left, right, on='model', how='inner'), dfs)

In [22]:
# Short column labels for the combined table. They are assigned positionally, so
# their order must match the column order produced by the merge.
metric_labels = ['model_name', 'activation', 'alternative_voices', 'representation', 'moral_diversity']
df_final.columns = metric_labels

In [23]:
# Combined per-model diversity table (one row per model, one column per metric).
df_final

Unnamed: 0,model_name,activation,alternative_voices,representation,moral_diversity
0,gpt-4o,0.045968,0.456747,7.555556,0.222389
1,gpt-4o-mini,0.039502,0.48,8.8,0.186818
2,gpt-4-1,0.040603,0.486993,8.7,0.178064
3,gpt-4-1-mini,0.038183,0.484429,6.9,0.277225
4,claude-3.7-sonnet,0.021415,0.5,8.1,0.168737
5,claude-sonnet-4,0.035626,0.484429,7.2,0.251688
6,gemini-2.5-flash,0.033088,0.5,8.7,0.204333
7,gemini-2.5-pro,0.030364,0.4992,8.0,0.208141
8,deepseek-r1-0528,0.02632,0.484429,7.0,0.274044
9,deepseek-chat-v3-0324,0.042934,0.46875,9.125,0.18114


In [24]:
# Export the combined diversity table as a LaTeX tabular. As a bare expression this
# shows the Python string repr; wrap it in print() to get copy-paste-ready LaTeX.
df_final.to_latex()

'\\begin{tabular}{llrrrr}\n\\toprule\n & model_name & activation & alternative_voices & representation & moral_diversity \\\\\n\\midrule\n0 & gpt-4o & 0.045968 & 0.456747 & 7.555556 & 0.222389 \\\\\n1 & gpt-4o-mini & 0.039502 & 0.480000 & 8.800000 & 0.186818 \\\\\n2 & gpt-4-1 & 0.040603 & 0.486993 & 8.700000 & 0.178064 \\\\\n3 & gpt-4-1-mini & 0.038183 & 0.484429 & 6.900000 & 0.277225 \\\\\n4 & claude-3.7-sonnet & 0.021415 & 0.500000 & 8.100000 & 0.168737 \\\\\n5 & claude-sonnet-4 & 0.035626 & 0.484429 & 7.200000 & 0.251688 \\\\\n6 & gemini-2.5-flash & 0.033088 & 0.500000 & 8.700000 & 0.204333 \\\\\n7 & gemini-2.5-pro & 0.030364 & 0.499200 & 8.000000 & 0.208141 \\\\\n8 & deepseek-r1-0528 & 0.026320 & 0.484429 & 7.000000 & 0.274044 \\\\\n9 & deepseek-chat-v3-0324 & 0.042934 & 0.468750 & 9.125000 & 0.181140 \\\\\n10 & magistral-small-2506 & 0.040153 & 0.489796 & 7.375000 & 0.271875 \\\\\n11 & magistral-medium-2506 & 0.035474 & 0.497449 & 8.000000 & 0.229084 \\\\\n12 & llama-3.3-70b-instr

In [None]:
# Compute the mean and population standard deviation (ddof=0) of each metric
# obtained from the reference corpus.
reference_metric_series = [
    ('Activation Diversity', activation_df['activation_diversity']),
    ('Perspective Pluralism', perspective_df['mean_perspectives']),
    ('Alternative Voices Diversity', alternative_voices_df['voice_diversity']),
    ('Moral Diversity', moral_diversity_df['moral_diversity']),
]
summary = {
    label: [values.mean(), values.std(ddof=0)]
    for label, values in reference_metric_series
}

# Build the summary table: one column per metric, rows 'Mean' and 'Std'.
summary_df = pd.DataFrame(summary, index=['Mean', 'Std'])




In [38]:
# Mean / Std of each diversity metric for the reference corpus.
summary_df

Unnamed: 0,Activation Diversity,Perspective Pluralism,Alternative Voices Diversity,Moral Diversity
Mean,0.038863,6.99,0.343452,0.130624
Std,0.017062,0.665507,0.069292,0.03257


In [42]:
# Mean and population standard deviation (ddof=0) of each metric across the models
# in df_final. Keys are the display labels, values are the df_final column names.
llm_metric_columns = {
    'Activation Diversity': 'activation',
    'Alternative Voices Diversity': 'alternative_voices',
    'Perspective Pluralism': 'representation',
    'Moral Diversity': 'moral_diversity',
}
summary = {
    label: [df_final[column].mean(), df_final[column].std(ddof=0)]
    for label, column in llm_metric_columns.items()
}

# One column per metric, rows 'Mean' and 'Std'.
llm_summary_df = pd.DataFrame(summary, index=['Mean', 'Std'])

In [43]:
# Mean / Std of each diversity metric across the evaluated models.
llm_summary_df

Unnamed: 0,Activation Diversity,Alternative Voices Diversity,Perspective Pluralism,Moral Diversity
Mean,0.035341,0.489516,7.962092,0.218181
Std,0.006568,0.011966,0.757143,0.037807
