In [1]:
# Import Libraries and Load Data

import pandas as pd
import re
import itertools
import spacy

# Load spaCy language model
nlp = spacy.load("en_core_web_sm")

In [2]:
# === Load Data ===
# Load country list
countries_df = pd.read_csv('countries_list_20th_century_1.5.csv')
country_list = countries_df['country_name'].str.strip().str.lower().tolist()

# Load raw event text
with open('20th_century_key_events.txt', 'r', encoding='utf-8') as file:
    raw_text = file.read()

In [3]:
# === STEP 1: TEXT WRANGLING (Standardization) ===

replacement_dict = {
    "u.s.": "united states",
    "us": "united states",
    "u.k.": "united kingdom",
    "uk": "united kingdom",
    "soviet union": "russia",
    "soviet": "russia",
    "german": "germany",
    "british": "united kingdom",
    "french": "france",
    "italian": "italy",
    "japanese": "japan",
    "chinese": "china",
}

# Normalize text
normalized_text = raw_text.lower()
for old, new in replacement_dict.items():
    normalized_text = re.sub(rf'\b{re.escape(old)}\b', new, normalized_text)

# Save wrangled version for traceability
with open('20th_century_wrangled_text.txt', 'w', encoding='utf-8') as f:
    f.write(normalized_text)

In [4]:
# === STEP 2: Apply NER ===
doc = nlp(normalized_text)

In [5]:
# === STEP 3: Extract Sentences with Countries Only ===

filtered_sentences = []
for sent in doc.sents:
    sentence_text = sent.text.strip()
    countries_in_sentence = set()
    
    for ent in sent.ents:
        if ent.label_ == "GPE":
            ent_clean = ent.text.strip().lower()
            if ent_clean in country_list:
                countries_in_sentence.add(ent_clean.title())  # Title-case for consistency

    if len(countries_in_sentence) > 0:
        filtered_sentences.append({
            "sentence": sentence_text,
            "country_entities": list(countries_in_sentence)
        })

filtered_sentences_df = pd.DataFrame(filtered_sentences)
print(f"Filtered to {len(filtered_sentences_df)} sentences containing at least 1 country.")
display(filtered_sentences_df.head())

Filtered to 160 sentences containing at least 1 country.


Unnamed: 0,sentence,country_entities
0,after a period of diplomatic and military esca...,"[Germany, France, Russia]"
1,the bolsheviks negotiated the treaty of brest-...,"[Russia, Germany]"
2,"in the treaty, bolshevik russia ceded the balt...","[Russia, Germany]"
3,it also recognized the independence of ukraine...,[Germany]
4,combined with already existing malnourishment ...,[Russia]


In [6]:
# === STEP 4: Create Country Relationship Pairs (Only 2+ countries) ===

relationships = []
for entry in filtered_sentences:
    countries = entry['country_entities']
    if len(countries) >= 2:
        for pair in itertools.combinations(sorted(countries), 2):
            relationships.append(pair)

In [7]:
# === STEP 5: Final Relationship Table with Counts ===
relationships_df = pd.DataFrame(relationships, columns=["source", "target"])

# Count pair frequencies
final_relationships_df = relationships_df.value_counts().reset_index()
final_relationships_df.columns = ["source", "target", "value"]


In [8]:
# Save final CSV
final_relationships_df.to_csv("country_relationships.csv", index=False)
print("\nFinal country relationship data saved as 'country_relationships.csv'.")
display(final_relationships_df.head(15))


Final country relationship data saved as 'country_relationships.csv'.


Unnamed: 0,source,target,value
0,Germany,Russia,12
1,Japan,Russia,6
2,Poland,Russia,6
3,France,Russia,5
4,Germany,Poland,5
5,France,Germany,5
6,Germany,Italy,4
7,Germany,Japan,3
8,India,Pakistan,3
9,France,Poland,2
