# Assessing the Sentiment of American International Affairs' Experts
> Eric Gutiérrez, 10th February 2026

### 0. Motivation
We want to assess how the sentiment of American international-affairs experts has evolved over the years. To do so, we use the reports published by the Council on Foreign Relations (CFR) from 1998 to 2026. We extract the most distinctive concepts that appear in each year's reports. In addition, we use a dictionary method to build an index of supply-chain challenges over the years.

### 1. Scraping

In [None]:
import pandas as pd
import time
import numpy as np
from selenium.webdriver.common.by import By
import undetected_chromedriver as uc
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import json


def init_driver(headless=False, version_main=144, page_load_timeout=20):
    """Create an undetected-chromedriver Chrome session.

    All parameters default to the values that used to be hard-coded, so
    ``init_driver()`` behaves exactly as before.

    Args:
        headless: When true, add the ``--headless=new`` Chrome argument so
            the browser runs without a visible window.
        version_main: Value forwarded to ``uc.Chrome`` as ``version_main``
            (the major Chrome version to drive).
        page_load_timeout: Seconds passed to ``set_page_load_timeout``.

    Returns:
        The configured ``uc.Chrome`` driver instance.
    """
    options = uc.ChromeOptions()
    if headless:
        options.add_argument("--headless=new")
    driver = uc.Chrome(options=options, version_main=version_main)
    driver.set_page_load_timeout(page_load_timeout)
    return driver

# Initialize Driver
driver = init_driver()

link = "https://www.cfr.org/reports"

# Step 1: walk the paginated listing (pages 1-34) and collect the href of
# every report card found on each page.
reports_href = []

for page in range(1, 35):
    try:
        driver.get(f"{link}?page={page}")
        print(f'Loading page {page}...')
        print(len(reports_href))
        cards = driver.find_elements(By.XPATH, "//a[contains(@class, 'block before:absolute before:inset-0 before:z-1 hover:link-underline')]")
        for rep in cards:
            reports_href.append(rep.get_attribute("href"))
        # Pause between listing pages.
        time.sleep(10)
    except Exception as exc:
        # Best-effort scraping: a failing page is skipped, but it is reported
        # instead of vanishing silently. Catching Exception rather than using
        # a bare except keeps Ctrl-C (KeyboardInterrupt) able to stop the run.
        print(f'Skipping page {page}: {exc!r}')
        continue

# Step 2: visit each collected link and store title, date and body text.
reports = []

# Running identifier; it only advances for reports that were actually stored.
ids = 0

for num, url in enumerate(reports_href):
    print(f'Retrieving info from url {num}/{len(reports_href)}')
    try:
        driver.get(url)
        time.sleep(2)
        content = driver.find_element(By.ID, "page-content")
        date = driver.find_element(By.XPATH, "//time[contains(@class, 'mx-auto my-6 text-center type-sans-body7')]")
        reports.append({"report_id": ids, "title": driver.title, "date": date.text, "body": content.text})
        ids += 1
        time.sleep(2)
    except Exception as exc:
        # Same best-effort policy as above: report the failing URL and move on.
        print(f'Skipping url {num} ({url}): {exc!r}')
        continue

reports_df = pd.DataFrame(reports)

reports_df.to_csv('data/cfr_complete_11Feb.csv')
print('Reports successfully saved!')

In [None]:
# Resume the per-report scrape at position RESUME_FROM of reports_href,
# appending to the existing `reports` list and continuing the `ids` counter.
# NOTE(review): presumably the earlier loop stopped around this index — confirm.
RESUME_FROM = 158

# start=RESUME_FROM keeps the printed counter on the same scale as its
# denominator (the full list length) instead of restarting from 0.
for num, url in enumerate(reports_href[RESUME_FROM:], start=RESUME_FROM):
    print(f'Retrieving info from url {num}/{len(reports_href)}')
    try:
        driver.get(url)
        time.sleep(2)
        content = driver.find_element(By.ID, "page-content")
        date = driver.find_element(By.XPATH, "//time[contains(@class, 'mx-auto my-6 text-center type-sans-body7')]")
        reports.append({"report_id": ids, "title": driver.title, "date": date.text, "body": content.text})
        ids += 1
        time.sleep(2)
    except Exception as exc:
        # Best-effort: report the failing URL and continue. Exception (not a
        # bare except) lets KeyboardInterrupt stop a long-running scrape.
        print(f'Skipping url {num} ({url}): {exc!r}')
        continue

reports_df = pd.DataFrame(reports)

reports_df.to_csv('data/cfr_complete_11Feb.csv')
print('Reports successfully saved!')

### 2. Text Preprocessing

In [None]:
# NOTE(review): the scraping cells write 'data/cfr_complete_11Feb.csv' while
# this cell reads the '10Feb' file — confirm which snapshot is intended.
corpus = pd.read_csv('data/cfr_complete_10Feb.csv', index_col='report_id')
sp = spacy.load('en_core_web_sm')


def lemmatize(txt):
    """Return the lower-cased, space-joined lemmas of one report body.

    Everything from the first "ACKNOWLEDGMENTS" marker onwards is discarded
    before the text is run through the module-level spaCy pipeline ``sp``.
    Stop words, punctuation and tokens whose lemma is blank are dropped.

    Lemmas are lower-cased so the result agrees with the batched ``sp.pipe``
    pass further down and with the lower-case stop-word list given to the
    vectoriser, which runs with ``lowercase=False``.

    Args:
        txt: The report text; non-string values are converted with ``str``.

    Returns:
        A single string of lemmas separated by spaces.
    """
    txt = str(txt).split("ACKNOWLEDGMENTS")[0]
    doc = sp(txt)
    lemmatized_tokens = [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop and not token.is_punct and token.lemma_.strip() != ''
    ]
    return " ".join(lemmatized_tokens)


# NOTE: both columns are assigned again by the batched pass later in this cell.
corpus["text_preproc"] = corpus["body"].astype(str).apply(lemmatize)
# The year is taken as the second space-separated token of the date string.
# NOTE(review): assumes dates look like "<Month> <Year>" — confirm.
corpus['year'] = corpus['date'].apply(lambda x: x.split(" ")[1])

# Reload the spaCy model with the 'ner' and 'parser' components disabled.
# NOTE(review): presumably for speed, since the loop below only reads lemmas
# and the is_stop / is_punct token flags — confirm.
sp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def clean_text(txt):
    """Return ``str(txt)`` truncated just before the first "ACKNOWLEDGMENTS".

    If the marker does not occur, the whole string form of ``txt`` is returned.
    """
    head, _marker, _tail = str(txt).partition("ACKNOWLEDGMENTS")
    return head

# Lazily truncate each report body at the acknowledgments marker; the texts
# are only materialised as spaCy consumes them.
texts = map(clean_text, corpus["body"])

print("Starting processing...")

# Stream the texts through the pipeline in batches of 50 (single process) and
# keep, per document, the lower-cased lemmas of every token that is not a
# stop word, not punctuation, and whose lemma is not blank.
lemmatized_docs = [
    " ".join(
        tok.lemma_.lower()
        for tok in parsed
        if not (tok.is_stop or tok.is_punct or tok.lemma_.strip() == '')
    )
    for parsed in sp.pipe(texts, batch_size=50, n_process=1)
]

corpus["text_preproc"] = lemmatized_docs
# The year is the second space-separated token of the date string.
corpus['year'] = corpus['date'].apply(lambda date_str: date_str.split(" ")[1])

### 3. Generating the Term Frequency–Inverse Document Frequency (TF-IDF) Matrix

In [36]:
my_stop_words = [
    'overview', 'content', 'pdf', 'download', 'spanish', 'rights', 'reserved', 
    'click', 'view', 'paper', 'working', 'series', 'copyright', 'www', 'http',
    'association cfr international', 'association cfr', 'produce association cfr'
]

# TfidfVectorizer removes stop words token by token *before* it assembles
# n-grams, so a multi-word entry in `stop_words` can never match anything.
# Single words are therefore handed to the vectoriser, while multi-word
# entries are removed afterwards as whole n-gram features.
single_word_stops = [entry for entry in my_stop_words if " " not in entry]
excluded_phrases = {entry for entry in my_stop_words if " " in entry}

vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=3,
    max_df=0.4,
    token_pattern=r"(?u)\b\w+\b",
    stop_words=single_word_stops,
    lowercase=False,
)

X = vectorizer.fit_transform(corpus['text_preproc'])
feature_names = vectorizer.get_feature_names_out()

# Drop the columns of the excluded phrases from both the matrix and the
# vocabulary so `X` and `feature_names` stay aligned. Rows are not
# re-normalised after the drop. Use `feature_names` (not the vectoriser's own
# vocabulary) whenever indexing the columns of `X`.
keep = np.array([name not in excluded_phrases for name in feature_names], dtype=bool)
X = X[:, keep]
feature_names = feature_names[keep]

print(f'# Documents: {X.shape[0]}\n# Terms: {X.shape[1]}\n')

unique_years = sorted(corpus['year'].unique())

print("--- Top Distinctive Words by Year ---")

# One record per year: number of documents and the five terms with the
# highest mean TF-IDF weight across that year's documents.
top5 = []

for year in unique_years:
    mask = (corpus['year'] == year).values
    n_docs = int(mask.sum())
    # Mean TF-IDF per term over the year's rows, flattened to a 1-D array.
    avg_tfidf = np.asarray(X[mask].mean(axis=0)).flatten()
    # Indices of the five largest averages, highest first.
    top_indices = avg_tfidf.argsort()[::-1][:5]
    top_words = [feature_names[i] for i in top_indices]

    top5.append({"year": year, "n_documents": n_docs, "top_words": top_words})

    print(f"{year} ({n_docs} documents): {', '.join(top_words)}")

pd.DataFrame(top5).to_csv('output/top5_words_per_year.csv')



# Documents: 697
# Terms: 37838

--- Top Distinctive Words by Year ---
1996 (1 documents): sudden, flow develop, import substitution, difference, monetary
1997 (2 documents): religion, islam, china, religious, iranian
1998 (8 documents): japan, trade, alliance, asian, asia
1999 (9 documents): trade, financial, north, space, market
2000 (21 documents): colombia, japan, china, defense, islamist
2001 (9 documents): afghanistan, food, border, gm, genetically
2002 (10 documents): egypt, saudi, egyptian, corporate governance, mena
2003 (6 documents): iraq, iraqi, periodical, papua, reform
2004 (8 documents): philippines, girl, darfur, aids, georgia
2005 (8 documents): america, opinion leader, hiv, iraq, american
2006 (13 documents): patent, turkey, trade, nuclear, turkish
2007 (17 documents): iraq, africa, bolivia, angola, pakistan
2008 (21 documents): education, nuclear, iran, space, assurance
2009 (33 documents): financial, imbalance, center geoeconomic studies, center geoeconomic, geoecon

### 4. Exporting the TF-IDF

In [39]:
TOP_N_WORDS = 200
OUTPUT_FILE = "output/tfidf_data.json"

print("Generating time-series data...")

# Use the sorted years minus the first two and the last one.
# NOTE(review): presumably those years hold too few reports — confirm.
years = sorted(corpus['year'].unique())[2:-1]

# One score vector per selected year: the mean TF-IDF of every term over that
# year's documents, scaled by 1000 and rounded to 3 decimals. A year without
# documents contributes an all-zero vector.
n_terms = len(feature_names)
yearly_scores = []
for year in years:
    in_year = (corpus['year'] == year).values
    if in_year.sum() > 0:
        mean_tfidf = np.asarray(X[in_year].mean(axis=0)).flatten()
        yearly_scores.append(np.round(mean_tfidf * 1000, 3))
    else:
        yearly_scores.append(np.zeros(n_terms))

# Pivot to term -> list of yearly scores (plain floats so json can encode them).
word_data = {
    word: [float(scores[col]) for scores in yearly_scores]
    for col, word in enumerate(feature_names)
}

# Rank terms by their score summed over all selected years and keep the best.
total_scores = {word: sum(series) for word, series in word_data.items()}
sorted_words = sorted(total_scores, key=total_scores.get, reverse=True)[:TOP_N_WORDS]

final_data = {
    "years": list(years),
    "series": {word: word_data[word] for word in sorted_words},
}

with open(OUTPUT_FILE, 'w') as out_file:
    json.dump(final_data, out_file)

print(f"Successfully saved {len(sorted_words)} words to {OUTPUT_FILE}")

Generating time-series data...
Successfully saved 200 words to output/tfidf_data.json
