In [1]:
# --- Imports ---
# Loading the main libraries we’ll use for this task.
# Mostly pandas and numpy for data, spaCy for NLP, and a few others for plotting and graphs.

import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [2]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

In [4]:
# Load the 20th-century text file
with open('/Users/emilsafarov/Library/CloudStorage/OneDrive-Personal/CF/CF_S1/20th-century/Scraped_data/timeline_20th_century_v4.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', ' ')

In [5]:
# Apply the NER algorithm to the data
ner_content = NER(data)

In [6]:
# Add spaces after commas and periods if missing
data_clean = re.sub(r'(?<=[a-zA-Z])(?=[A-Z])', ' ', data)      # e.g. UnitedStates → United States
data_clean = re.sub(r'(?<=\d)(?=[A-Z])', ' ', data_clean)      # e.g. 1900sJanuary → 1900s January
data_clean = re.sub(r'(?<=[a-z])(?=\d)', ' ', data_clean)      # e.g. Warends1905 → War ends 1905

# Re-run NER
ner_content = NER(data_clean)

In [7]:
# Visualize the first 1000 characters with entities
displacy.render(ner_content[0:200], style="ent", jupyter=True)

In [8]:
# Tokenize the NER-processed content into individual sentences
sentences = list(ner_content.sents)

In [9]:
# Create a list of entities for each sentence
sentence_entities = [ [ent.text for ent in sent.ents] for sent in sentences ]

In [10]:
df_sentences = []

for sent in ner_content.sents:
    # Keep all named entities for now — we’ll filter by country name and alias later
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})

df_sentences = pd.DataFrame(df_sentences)

In [11]:
# Preview
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"(1900s, -, January, 1, :, The, Australian, col...","[1900s - January 1, Australian, 1900s - Januar..."
1,"(1900s, -, March, 2, :, The, Platt, Amendmentp...","[1900s - March 2, The Platt Amendmentprovides,..."
2,"(1900s, -, June, :, Emily, Hobhousereports, on...","[1900s - June, 45, British, Boerwomen, South A..."
3,"(1900s, -, September, 6, :, The, assassination...","[1900s - September 6, William Mc Kinleyushered..."
4,"(1900s, -, September, 7, :, The, Eight, -, Nat...","[1900s - September 7, China]"
5,"(1900s, -, December, 10, :, First, Nobel, Priz...","[1900s - December 10, 1900s - December 12, first]"
6,"(1900s, -, January, 13, :, The, Unification, o...","[1900s - January 13, The Unification of Saudi ..."
7,"(1900s, -, May, 20, :, Cubagiven, independence...","[1900s - May 20, the United States]"
8,"(1900s, -, May, 31, :, Second, Boer, Warends, ...","[1900s - May 31, Second Boer Warends, British]"
9,"(1900s, -, July, 12, :, Arthur, Balfourbecomes...","[1900s - July 12, Arthur Balfourbecomes, the U..."


In [12]:
# Preview the first 100 named entities
print(ner_content.ents[:100])

(1900s - January 1, Australian, 1900s - January 22, Edward V I Ibecame King of England, India, Queen Victoria's, 1900s - March 2, The Platt Amendmentprovides, Cuban, American, 1900s - June, 45, British, Boerwomen, South Africa, 1900s - September 6, William Mc Kinleyushered, Theodore Rooseveltafter, Kinley, September 14, 1900s - September 7, China, 1900s - December 10, 1900s - December 12, first, 1900s - January 13, The Unification of Saudi Arabiabegins, 1900s - May 20, the United States, 1900s - May 31, Second Boer Warends, British, 1900s - July 12, Arthur Balfourbecomes, the United Kingdom, 1900s - July 17, Willis Carrierinvents, first, Venezuelan, 1902–1903, Britain, Germany, Italy, Venezuelain, 1900s - February 15, 1900s - June 11, Serbiaand, Queen Dragaare, 1900s - July 1, first, Tour de Franceis, 1900s - July, August, the Russian Social Democratic Labour Party, 1900s - August 4, Xbecomes Pope, 1900s - November 18, Panama, the United States, Panama, 1900s - December 17, First, Wrig

In [13]:
# Apply extra cleaning to fix merged words like "Serbiaand" or "VIIbecame"
# Split words joined with lowercase-uppercase or number-uppercase
data_clean = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', data_clean)
data_clean = re.sub(r'(?<=[A-Za-z])(?=\d)', ' ', data_clean)
data_clean = re.sub(r'(?<=\d)(?=[A-Za-z])', ' ', data_clean)
data_clean = re.sub(r'(?<=[a-z])(?=[A-Z][a-z])', ' ', data_clean)

# Fix fused words like "Serbiaand", "Venezuelain", etc.
data_clean = re.sub(r'\b(\w+)(and|in|of)\b', r'\1', data_clean)

In [14]:
# Filter out noisy entities like numbers or short non-country words (e.g., "first", "1900s")

def clean_entities(entity_list):
    return [ent for ent in entity_list if ent.isalpha() and len(ent) > 2]

In [15]:
# Print only the first 30 named entities that are labeled as GPE or LOC
count = 0
for ent in ner_content.ents:
    if ent.label_ in ["GPE", "LOC"]:
        print(ent.text, ent.label_)
        count += 1
    if count == 10:
        break

India GPE
South Africa GPE
China GPE
the United States GPE
the United Kingdom GPE
Britain GPE
Germany GPE
Italy GPE
Venezuelain GPE
Serbiaand GPE


In [16]:
# Preview the first 100 named entities # recheck
print(ner_content.ents[:100])

(1900s - January 1, Australian, 1900s - January 22, Edward V I Ibecame King of England, India, Queen Victoria's, 1900s - March 2, The Platt Amendmentprovides, Cuban, American, 1900s - June, 45, British, Boerwomen, South Africa, 1900s - September 6, William Mc Kinleyushered, Theodore Rooseveltafter, Kinley, September 14, 1900s - September 7, China, 1900s - December 10, 1900s - December 12, first, 1900s - January 13, The Unification of Saudi Arabiabegins, 1900s - May 20, the United States, 1900s - May 31, Second Boer Warends, British, 1900s - July 12, Arthur Balfourbecomes, the United Kingdom, 1900s - July 17, Willis Carrierinvents, first, Venezuelan, 1902–1903, Britain, Germany, Italy, Venezuelain, 1900s - February 15, 1900s - June 11, Serbiaand, Queen Dragaare, 1900s - July 1, first, Tour de Franceis, 1900s - July, August, the Russian Social Democratic Labour Party, 1900s - August 4, Xbecomes Pope, 1900s - November 18, Panama, the United States, Panama, 1900s - December 17, First, Wrig

### Observations on Country Matching and Wrangling
Before applying Named Entity Recognition (NER), I compared the timeline text with the official country list and found several mismatches.

Some country names appeared in merged or malformed forms, such as Venezuelain, Serbiaand, and theUnitedStates. These would not match the country list directly.

### Actions Taken:
Used regex to fix spacing and remove suffixes like "in", "and", or "of" attached to country names.
Re-ran NER on the cleaned text.
Split the text into sentences and extracted all named entities without filtering. This keeps demonyms like "Australian" and merged tokens for later country matching.
Verified that cleaned entities now align better with the country list, either directly or via alias mapping.
This wrangling step was necessary to ensure accurate filtering and country relationship extraction.

In [17]:
# Loading country names 

country_df = pd.read_csv("/Users/emilsafarov/Library/CloudStorage/OneDrive-Personal/CF/CF_S1/20th-century/Scraped_data/countries_list.txt")

In [18]:
country_df.shape

(195, 1)

In [19]:
# This dictionary maps nationality adjectives or demonyms (e.g., "German", "Brazilian") 
# to their corresponding official country names. It is useful for standardizing references 
# to countries when parsing text data that mentions nationalities instead of country names. 
# For example, when performing entity matching or building geographic visualizations, 
# we can convert "American" to "United States" for consistency.

country_alias_map = {
    "Afghan": "Afghanistan",
    "Albanian": "Albania",
    "Algerian": "Algeria",
    "Andorran": "Andorra",
    "Angolan": "Angola",
    "Antiguan": "Antigua and Barbuda",
    "Argentine": "Argentina",
    "Armenian": "Armenia",
    "Australian": "Australia",
    "Austrian": "Austria",
    "Azerbaijani": "Azerbaijan",
    "Bahamian": "Bahamas",
    "Bahraini": "Bahrain",
    "Bangladeshi": "Bangladesh",
    "Barbadian": "Barbados",
    "Belarusian": "Belarus",
    "Belgian": "Belgium",
    "Belizean": "Belize",
    "Beninese": "Benin",
    "Bhutanese": "Bhutan",
    "Bolivian": "Bolivia",
    "Bosnian": "Bosnia and Herzegovina",
    "Botswanan": "Botswana",
    "Brazilian": "Brazil",
    "Bruneian": "Brunei",
    "Bulgarian": "Bulgaria",
    "Burkinabé": "Burkina Faso",
    "Burundian": "Burundi",
    "Cabo Verdean": "Cabo Verde",
    "Cambodian": "Cambodia",
    "Cameroonian": "Cameroon",
    "Canadian": "Canada",
    "Central African": "Central African Republic",
    "Chadian": "Chad",
    "Chilean": "Chile",
    "Chinese": "China",
    "Colombian": "Colombia",
    "Comorian": "Comoros",
    "Costa Rican": "Costa Rica",
    "Ivorian": "Côte d'Ivoire",
    "Croatian": "Croatia",
    "Cuban": "Cuba",
    "Cypriot": "Cyprus",
    "Czech": "Czechia",
    "Congolese": "Democratic Republic of the Congo",  # use "Republic of the Congo" for other
    "Danish": "Denmark",
    "Djiboutian": "Djibouti",
    "Dominican": "Dominican Republic",
    "Ecuadorian": "Ecuador",
    "Egyptian": "Egypt",
    "Salvadoran": "El Salvador",
    "Equatorial Guinean": "Equatorial Guinea",
    "Eritrean": "Eritrea",
    "Estonian": "Estonia",
    "Eswatini": "Eswatini",
    "Ethiopian": "Ethiopia",
    "Fijian": "Fiji",
    "Finnish": "Finland",
    "French": "France",
    "Gabonese": "Gabon",
    "Gambian": "Gambia",
    "Georgian": "Georgia",
    "German": "Germany",
    "Ghanaian": "Ghana",
    "Greek": "Greece",
    "Grenadian": "Grenada",
    "Guatemalan": "Guatemala",
    "Guinean": "Guinea",
    "Bissau-Guinean": "Guinea-Bissau",
    "Guyanese": "Guyana",
    "Haitian": "Haiti",
    "Honduran": "Honduras",
    "Hungarian": "Hungary",
    "Icelandic": "Iceland",
    "Indian": "India",
    "Indonesian": "Indonesia",
    "Iranian": "Iran",
    "Iraqi": "Iraq",
    "Irish": "Ireland",
    "Israeli": "Israel",
    "Italian": "Italy",
    "Jamaican": "Jamaica",
    "Japanese": "Japan",
    "Jordanian": "Jordan",
    "Kazakh": "Kazakhstan",
    "Kenyan": "Kenya",
    "Kiribati": "Kiribati",
    "Kuwaiti": "Kuwait",
    "Kyrgyz": "Kyrgyzstan",
    "Laotian": "Laos",
    "Latvian": "Latvia",
    "Lebanese": "Lebanon",
    "Basotho": "Lesotho",
    "Liberian": "Liberia",
    "Libyan": "Libya",
    "Liechtensteiner": "Liechtenstein",
    "Lithuanian": "Lithuania",
    "Luxembourgish": "Luxembourg",
    "Malagasy": "Madagascar",
    "Malawian": "Malawi",
    "Malaysian": "Malaysia",
    "Maldivian": "Maldives",
    "Malian": "Mali",
    "Maltese": "Malta",
    "Marshallese": "Marshall Islands",
    "Mauritanian": "Mauritania",
    "Mauritian": "Mauritius",
    "Mexican": "Mexico",
    "Micronesian": "Micronesia",
    "Moldovan": "Moldova",
    "Monegasque": "Monaco",
    "Mongolian": "Mongolia",
    "Montenegrin": "Montenegro",
    "Moroccan": "Morocco",
    "Mozambican": "Mozambique",
    "Burmese": "Myanmar",
    "Namibian": "Namibia",
    "Nauruan": "Nauru",
    "Nepali": "Nepal",
    "Dutch": "Netherlands",
    "New Zealander": "New Zealand",
    "Nicaraguan": "Nicaragua",
    "Nigerien": "Niger",
    "Nigerian": "Nigeria",
    "North Macedonian": "North Macedonia",
    "North Korean": "North Korea",
    "Norwegian": "Norway",
    "Omani": "Oman",
    "Pakistani": "Pakistan",
    "Palauan": "Palau",
    "Palestinian": "Palestine",
    "Panamanian": "Panama",
    "Papua New Guinean": "Papua New Guinea",
    "Paraguayan": "Paraguay",
    "Peruvian": "Peru",
    "Filipino": "Philippines",
    "Polish": "Poland",
    "Portuguese": "Portugal",
    "Qatari": "Qatar",
    "Congolese (Republic)": "Republic of the Congo",
    "Romanian": "Romania",
    "Russian": "Russia",
    "Rwandan": "Rwanda",
    "Saint Kitts and Nevis": "Saint Kitts and Nevis",
    "Saint Lucian": "Saint Lucia",
    "Vincentian": "Saint Vincent and the Grenadines",
    "Samoan": "Samoa",
    "San Marinese": "San Marino",
    "São Toméan": "São Tomé and Príncipe",
    "Saudi": "Saudi Arabia",
    "Senegalese": "Senegal",
    "Serbian": "Serbia",
    "Seychellois": "Seychelles",
    "Sierra Leonean": "Sierra Leone",
    "Singaporean": "Singapore",
    "Slovak": "Slovakia",
    "Slovene": "Slovenia",
    "Solomon Islander": "Solomon Islands",
    "Somali": "Somalia",
    "South African": "South Africa",
    "South Korean": "South Korea",
    "South Sudanese": "South Sudan",
    "Spanish": "Spain",
    "Sri Lankan": "Sri Lanka",
    "Sudanese": "Sudan",
    "Surinamese": "Suriname",
    "Swazi": "Eswatini",
    "Swedish": "Sweden",
    "Swiss": "Switzerland",
    "Syrian": "Syria",
    "Tajik": "Tajikistan",
    "Tanzanian": "Tanzania",
    "Thai": "Thailand",
    "Timorese": "Timor Leste",
    "Togolese": "Togo",
    "Tongan": "Tonga",
    "Trinidadian": "Trinidad and Tobago",
    "Tunisian": "Tunisia",
    "Turkish": "Türkiye",
    "Turkmen": "Turkmenistan",
    "Tuvaluan": "Tuvalu",
    "Ugandan": "Uganda",
    "Ukrainian": "Ukraine",
    "Emirati": "United Arab Emirates",
    "British": "United Kingdom",
    "American": "United States",
    "Uruguayan": "Uruguay",
    "Uzbek": "Uzbekistan",
    "Ni-Vanuatu": "Vanuatu",
    "Vatican": "Vatican City",
    "Venezuelan": "Venezuela",
    "Vietnamese": "Vietnam",
    "Yemeni": "Yemen",
    "Zambian": "Zambia",
    "Zimbabwean": "Zimbabwe", 
    "States": "United States", 
}

In [20]:
# Direct match: keep only countries that match exactly
def get_direct_countries(ent_list, country_df):
    country_names = set(country_df['sovereign states'])
    return [ent for ent in ent_list if ent in country_names]

# Alias match: convert demonyms to country names using alias map
def get_mapped_countries(ent_list, country_df, country_alias_map):
    country_names = set(country_df['sovereign states'])
    return [
        country_alias_map[ent]
        for ent in ent_list
        if ent in country_alias_map and country_alias_map[ent] in country_names
    ]

In [21]:
# Apply direct country match
df_sentences['country_direct'] = df_sentences['entities'].apply(
    lambda x: get_direct_countries(x, country_df)
) 

In [22]:
# Apply alias mapping

In [23]:
df_sentences['country_mapped'] = df_sentences['entities'].apply(
    lambda x: get_mapped_countries(x, country_df, country_alias_map)
)

In [24]:
# Combine direct matches and alias-based matches
df_sentences['country_combined'] = df_sentences['country_direct'] + df_sentences['country_mapped']

In [25]:
df_sentences.head()

Unnamed: 0,sentence,entities,country_direct,country_mapped,country_combined
0,"(1900s, -, January, 1, :, The, Australian, col...","[1900s - January 1, Australian, 1900s - Januar...",[India],[Australia],"[India, Australia]"
1,"(1900s, -, March, 2, :, The, Platt, Amendmentp...","[1900s - March 2, The Platt Amendmentprovides,...",[],"[Cuba, United States]","[Cuba, United States]"
2,"(1900s, -, June, :, Emily, Hobhousereports, on...","[1900s - June, 45, British, Boerwomen, South A...",[South Africa],[United Kingdom],"[South Africa, United Kingdom]"
3,"(1900s, -, September, 6, :, The, assassination...","[1900s - September 6, William Mc Kinleyushered...",[],[],[]
4,"(1900s, -, September, 7, :, The, Eight, -, Nat...","[1900s - September 7, China]",[China],[],[China]


In [26]:
# Keep only sentences that mention at least one country
df_sentences_filtered = df_sentences[df_sentences['country_combined'].map(len) > 0]

In [27]:
df_sentences_filtered.head()

Unnamed: 0,sentence,entities,country_direct,country_mapped,country_combined
0,"(1900s, -, January, 1, :, The, Australian, col...","[1900s - January 1, Australian, 1900s - Januar...",[India],[Australia],"[India, Australia]"
1,"(1900s, -, March, 2, :, The, Platt, Amendmentp...","[1900s - March 2, The Platt Amendmentprovides,...",[],"[Cuba, United States]","[Cuba, United States]"
2,"(1900s, -, June, :, Emily, Hobhousereports, on...","[1900s - June, 45, British, Boerwomen, South A...",[South Africa],[United Kingdom],"[South Africa, United Kingdom]"
4,"(1900s, -, September, 7, :, The, Eight, -, Nat...","[1900s - September 7, China]",[China],[],[China]
8,"(1900s, -, May, 31, :, Second, Boer, Warends, ...","[1900s - May 31, Second Boer Warends, British]",[],[United Kingdom],[United Kingdom]


In [28]:
relationship_data = []

for row in df_sentences_filtered['country_combined']:
    unique_countries = list(set(row))  # remove duplicates in a sentence
    if len(unique_countries) > 1:
        for i in range(len(unique_countries)):
            for j in range(i + 1, len(unique_countries)):
                relationship_data.append({
                    'source': unique_countries[i],
                    'target': unique_countries[j]
                })

relationship_df = pd.DataFrame(relationship_data)


In [29]:
relationship_df

Unnamed: 0,source,target
0,Australia,India
1,United States,Cuba
2,South Africa,United Kingdom
3,Germany,Italy
4,Germany,Venezuela
...,...,...
139,Portugal,Russia
140,Lebanon,Israel
141,United States,Palestine
142,United States,Israel


In [30]:
# Group by country pairs and count how often they appear together
relationship_df = relationship_df.groupby(['source', 'target']).size().reset_index(name='count')

In [31]:
relationship_df.head(50)

Unnamed: 0,source,target,count
0,Albania,Cuba,1
1,Argentina,China,1
2,Argentina,India,1
3,Argentina,Lebanon,1
4,Armenia,United Kingdom,1
5,Australia,India,1
6,Australia,Russia,1
7,Australia,United Kingdom,1
8,Bahamas,Cuba,1
9,Bahamas,Israel,1


In [33]:
# Export the filtered relationship data to your local folder
relationship_df.to_csv(
    "/Users/emilsafarov/Library/CloudStorage/OneDrive-Personal/CF/CF_S1/20th-century/Scraped_data/country_relationships_v3.csv",
    index=False
)