In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [2]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------- ------------------------------ 2.9/12.8 MB 21.0 MB/s eta 0:00:01
     ----------------------------- ---------- 9.4/12.8 MB 27.9 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 27.6 MB/s  0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.4.1
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

Load Twentieth-century text file

In [4]:
with open('key_events_20th_century.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', '').strip()

In [5]:
ountries_list = [
    'United States', 'Russia', 'United Kingdom', 'Germany',
    'France', 'China', 'Japan', 'Italy', 'India', 'Canada'
]


In [6]:
country_aliases = {
    'USA': 'United States',
    'America': 'United States',
    'USSR': 'Russia',
    'Soviet Union': 'Russia',
    'Britain': 'United Kingdom',
    'England': 'United Kingdom',
    'PRC': 'China',
    'Nippon': 'Japan'
}


In [7]:
for alias, standard in country_aliases.items():
    if alias in data:
        print(f"Found alias: {alias} → Replace with: {standard}")
        data = data.replace(alias, standard)

Found alias: USA → Replace with: United States
Found alias: America → Replace with: United States
Found alias: USSR → Replace with: Russia
Found alias: Soviet Union → Replace with: Russia
Found alias: Britain → Replace with: United Kingdom


In [8]:
with open('key_events_20th_century_cleaned.txt', 'w', encoding='utf-8') as file:
    file.write(data)


# Text Wrangling Observations
Country names in the text did not always match the standardized list. For example:
"USA" was replaced with "United States"
"America" was replaced with "United Sates"
"Soviet Union" and "USSR" were replaced with "Russia"
"Britain" and "England" were replaced with "United Kingdom"
These corrections were made to ensure consistency for Named Entity Recognition and network analysis.
The cleaned version of the text was saved as key_events_20th_century_cleaned.txt.

In [9]:
# Load Cleaned version 
with open('key_events_20th_century_cleaned.txt', 'r', encoding='utf-8') as file:
    cleaned_data = file.read()

In [10]:
book= NER(cleaned_data)

In [11]:
displacy.render(book[273:20000], style="ent", jupyter=True)

In [15]:
#Split the setence entities from the NER object
df_sentences = []

for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents] 
    df_sentences.append({"sentence": sent.text, "entities": entity_list})

df_sentences = pd.DataFrame(df_sentences)


In [16]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,The 20th century changed the world in unpreced...,[The 20th century]
1,The World Wars sparked tension between countri...,"[the Cold War, the Space Race]"
2,These advancements have played a significant r...,"[the 21st century, today]"
3,Historic events in the 20th century[edit]World...,[the 20th]
4,Edwardian eraThe new beginning of the 20th cen...,"[Edwardian, the 20th century]"
5,The 1900s saw the decade herald a series of in...,"[The 1900s, the decade]"
6,1914 saw the completion of the Panama Canal.,"[1914, the Panama Canal]"
7,The Scramble for Africa continued in the 1900s...,"[Scramble, Africa, the 1900s]"
8,The atrocities in the Congo Free State shocked...,[the Congo Free State]
9,"From 1914 to 1918, the First World War, and it...","[1914 to 1918, the First World War]"


In [18]:
#Load countries list 
df_countries= pd.read_csv(r"C:\Users\beaac\Desktop\20 Century\countries_list_20th_century_1.5 (1).csv")

In [21]:
print(df_countries.columns)


Index(['Unnamed: 0', 'country_name'], dtype='object')


In [22]:
countries = df_countries['country_name'].dropna().tolist()

In [33]:
print(countries[:10])  # Preview first 10 entries


[' Afghanistan ', '  Albania ', '  Algeria ', '  Andorra ', '  Angola ', '  Antigua and Barbuda ', '  Argentina ', '  Armenia ', '  Australia ', '  Austria ']


In [34]:
print(df_sentences.head(10))


                                            sentence  \
0  The 20th century changed the world in unpreced...   
1  The World Wars sparked tension between countri...   
2  These advancements have played a significant r...   
3  Historic events in the 20th century[edit]World...   
4  Edwardian eraThe new beginning of the 20th cen...   
5  The 1900s saw the decade herald a series of in...   
6       1914 saw the completion of the Panama Canal.   
7  The Scramble for Africa continued in the 1900s...   
8  The atrocities in the Congo Free State shocked...   
9  From 1914 to 1918, the First World War, and it...   

                              entities country_entities  
0                   [The 20th century]               []  
1       [the Cold War, the Space Race]               []  
2            [the 21st century, today]               []  
3                           [the 20th]               []  
4        [Edwardian, the 20th century]               []  
5              [The 1900s, the deca

In [35]:
countries = [c.strip().lower() for c in df_countries['country_name'].dropna()]


In [36]:
def filter_country_entities(ent_list, countries):
    return [ent for ent in ent_list if ent.strip().lower() in countries]



In [37]:
df_sentences["country_entities"] = df_sentences["entities"].apply(
    lambda x: filter_country_entities(x, countries)
)




In [38]:
df_sentences_filtered = df_sentences[df_sentences["country_entities"].map(len) > 0]



In [39]:
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
884,"""The Philippines, 1898â€“1946 | US House of Re...","[Philippines, 1898â€“1946, US, Art & Archives]",[Philippines]
906,"""Colonial Cartographies, Postcolonial Borders,...","[Colonial Cartographies, Enduring Failures of ...",[Afghanistan]
925,"""The Incorporation of the Baltic States by the...","[The Incorporation of the Baltic States, Russia]",[Russia]
935,"The Moldovans: Romania, Russia, and the Politi...","[Moldovans, Romania, Russia, the Politics of C...","[Romania, Russia]"
981,"""Selling 'Operation Passage to Freedom': Dr. T...","[Thomas Dooley, the Religious Overtones of Ear...",[Vietnam]
1004,"""Stuck in Endless Preliminaries: Vietnam and t...","[Vietnam, the Battle of the Paris Peace Table,...",[Vietnam]
1092,"""The forgotten story of how the Baltic states ...","[Baltic, Russia]",[Russia]
1206,"""Anti-United Statesn Behavior in the Middle Ea...","[the Middle East, a Field Experiment, Lebanon]",[Lebanon]
1210,The Rise of China and India: A New Asian Drama.,"[The Rise of China, India]",[India]
1211,Singapore: World Scientific.,[Singapore],[Singapore]


 Create Relationships

In [42]:
# Define window size (e.g., 5 sentences at a time)
window_size = 5
relationships = []


for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i + 5, df_sentences_filtered.index[-1])
    
    # Flatten country entities from the window
    country_list = sum(df_sentences_filtered.loc[i:end_i]["country_entities"], [])
    
    # Remove adjacent duplicates
    country_unique = [country_list[i] for i in range(len(country_list))
                      if i == 0 or country_list[i] != country_list[i - 1]]
    
    # Build pairwise relationships
    if len(country_unique) > 1:
        for idx, a in enumerate(country_unique[:-1]):
            b = country_unique[idx + 1]
            relationships.append({"source": a, "target": b})



In [43]:
relationship_df = pd.DataFrame(relationships)

In [44]:
relationship_df

Unnamed: 0,source,target
0,Russia,France
1,Russia,France
2,France,Russia
3,Russia,Germany
4,Russia,France
...,...,...
800,India,Singapore
801,India,Singapore
802,India,Singapore
803,India,Singapore


In [45]:
# Sort the cases with a- >b and b- >a
relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(5)

Unnamed: 0,source,target
0,France,Russia
1,France,Russia
2,France,Russia
3,Germany,Russia
4,France,Russia


In [46]:
#Summarize interactions
relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [47]:
relationship_df.head(10)

Unnamed: 0,source,target,value
0,France,Russia,23
1,Germany,Russia,26
2,Germany,Italy,12
3,Austria,Germany,11
4,Germany,Spain,1
5,Russia,Spain,4
6,Poland,Russia,40
7,France,Poland,11
8,France,United Kingdom,14
9,Poland,United Kingdom,11


In [48]:
relationship_df.to_csv('Countries_relationship.csv')