In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [2]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

### Load the twentieth-century text file.

In [4]:
# Load the 20th-century text file
with open('/Users/emilsafarov/Library/CloudStorage/OneDrive-Personal/CF/CF_S1/20th-century/Scraped_data/timeline_20th_century_v4.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', ' ')

In [5]:
# Apply the NER algorithm to the data
book = NER(data)

In [12]:
# Fix spacing after punctuation or numbers
import re

# Add spaces after commas and periods if missing
data_clean = re.sub(r'(?<=[a-zA-Z])(?=[A-Z])', ' ', data)  # split joined words like "UnitedStates" → "United States"
data_clean = re.sub(r'(?<=\d)(?=[A-Z])', ' ', data_clean)  # e.g. 1900sJanuary → 1900s January
data_clean = re.sub(r'(?<=[a-z])(?=\d)', ' ', data_clean)  # e.g. "Warends1905" → "War ends 1905"

# Re-run NER
book = NER(data_clean)

In [13]:
displacy.render(book[0:1000], style="ent", jupyter=True)

In [14]:
print(book.ents[:100])

(1900s - January 1, Australian, 1900s - January 22, Edward V I Ibecame King of England, India, Queen Victoria's, 1900s - March 2, The Platt Amendmentprovides, Cuban, American, 1900s - June, 45, British, Boerwomen, South Africa, 1900s - September 6, William Mc Kinleyushered, Theodore Rooseveltafter, Kinley, September 14, 1900s - September 7, China, 1900s - December 10, 1900s - December 12, first, 1900s - January 13, The Unification of Saudi Arabiabegins, 1900s - May 20, the United States, 1900s - May 31, Second Boer Warends, British, 1900s - July 12, Arthur Balfourbecomes, the United Kingdom, 1900s - July 17, Willis Carrierinvents, first, Venezuelan, 1902–1903, Britain, Germany, Italy, Venezuelain, 1900s - February 15, 1900s - June 11, Serbiaand, Queen Dragaare, 1900s - July 1, first, Tour de Franceis, 1900s - July, August, the Russian Social Democratic Labour Party, 1900s - August 4, Xbecomes Pope, 1900s - November 18, Panama, the United States, Panama, 1900s - December 17, First, Wrig

### Get named entity list per sentence

In [15]:
df_sentences = []

# Loop through each sentence in the text and collect its named entities
for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})

# Convert to a DataFrame
df_sentences = pd.DataFrame(df_sentences)

# Preview
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"(1900s, -, January, 1, :, The, Australian, col...","[1900s - January 1, Australian, 1900s - Januar..."
1,"(1900s, -, March, 2, :, The, Platt, Amendmentp...","[1900s - March 2, The Platt Amendmentprovides,..."
2,"(1900s, -, June, :, Emily, Hobhousereports, on...","[1900s - June, 45, British, Boerwomen, South A..."
3,"(1900s, -, September, 6, :, The, assassination...","[1900s - September 6, William Mc Kinleyushered..."
4,"(1900s, -, September, 7, :, The, Eight, -, Nat...","[1900s - September 7, China]"
5,"(1900s, -, December, 10, :, First, Nobel, Priz...","[1900s - December 10, 1900s - December 12, first]"
6,"(1900s, -, January, 13, :, The, Unification, o...","[1900s - January 13, The Unification of Saudi ..."
7,"(1900s, -, May, 20, :, Cubagiven, independence...","[1900s - May 20, the United States]"
8,"(1900s, -, May, 31, :, Second, Boer, Warends, ...","[1900s - May 31, Second Boer Warends, British]"
9,"(1900s, -, July, 12, :, Arthur, Balfourbecomes...","[1900s - July 12, Arthur Balfourbecomes, the U..."


### Load country names

In [20]:
file_path = "/Users/emilsafarov/Library/CloudStorage/OneDrive-Personal/CF/CF_S1/20th-century/Scraped_data/countries_list.txt"

with open(file_path, 'r') as f:
    country_list = [line.strip() for line in f if line.strip() != '']

# Create DataFrame of country names
country_df = pd.DataFrame({'country_alias': country_list})

In [21]:
country_df

Unnamed: 0,country_alias
0,sovereign states
1,Afghanistan
2,Albania
3,Algeria
4,Andorra
...,...
362,Adjara
363,Adygea
364,Altai
365,Bangsamoro


### Filter Named Entities by Country List

In [None]:
Filter Named Entities by Country List

In [22]:
# Define function to keep only countries in entity list
def filter_entity(ent_list, country_df):
    return [ent for ent in ent_list if ent in list(country_df['country_alias'])]

# Apply to each sentence's entity list
df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, country_df))

# Keep only sentences that mention at least 1 country
df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

In [23]:
df_sentences_filtered.head()

Unnamed: 0,sentence,entities,country_entities
0,"(1900s, -, January, 1, :, The, Australian, col...","[1900s - January 1, Australian, 1900s - Januar...",[India]
2,"(1900s, -, June, :, Emily, Hobhousereports, on...","[1900s - June, 45, British, Boerwomen, South A...",[South Africa]
4,"(1900s, -, September, 7, :, The, Eight, -, Nat...","[1900s - September 7, China]",[China]
11,"(1900s, -, Venezuelan, crisis, of, 1902–1903, ...","[Venezuelan, 1902–1903, Britain, Germany, Ital...","[Germany, Italy]"
16,"(1900s, -, August, 4, :, Pius, Xbecomes, Pope,...","[1900s - August 4, Xbecomes Pope, 1900s - Nove...","[Panama, Panama]"


### Create relationships

In [25]:
# Define how many sentences to look at together
window_size = 5
relationships = []

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i + window_size, df_sentences_filtered.index[-1])
    char_list = sum((df_sentences_filtered.loc[i:end_i].country_entities), [])

    # Remove duplicates that are next to each other
    char_unique = [char_list[j] for j in range(len(char_list)) 
                   if j == 0 or char_list[j] != char_list[j - 1]]

    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [26]:
relationship_df = pd.DataFrame(relationships)
relationship_df.head(10)

Unnamed: 0,source,target
0,India,South Africa
1,South Africa,China
2,South Africa,China
3,South Africa,China
4,Germany,Italy
5,Germany,Italy
6,Germany,Italy
7,Germany,Italy
8,Germany,Italy
9,Germany,Italy


In [27]:
# Sort country pairs alphabetically (so "France–Germany" and "Germany–France" are treated the same)
relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis=1), columns=relationship_df.columns)

# Add a value column for counting
relationship_df["value"] = 1

# Group and count occurrences of each country pair
relationship_df = relationship_df.groupby(["source", "target"], sort=False, as_index=False).sum()

# Preview
relationship_df.head(10)


Unnamed: 0,source,target,value
0,India,South Africa,1
1,China,South Africa,3
2,Germany,Italy,10
3,Italy,Panama,1
4,France,Panama,3
5,France,Russia,1
6,Norway,Russia,5
7,Chile,France,2
8,Chile,Cuba,6
9,Cuba,Russia,5


### Export Relationship Data

In [28]:
output_path = "/Users/emilsafarov/Library/CloudStorage/OneDrive-Personal/CF/CF_S1/20th-century/Scraped_data/country_relationships.csv"

relationship_df.to_csv(output_path, index=False)