# 1.6 20th Century NLP and Network Analysis

## Import Libraries and Load Data

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re



In [2]:
# Load data
# Read the whole text file into a single string. Bytes that cannot be decoded
# are dropped (errors = 'ignore') and every line break is removed.
# NOTE(review): line breaks are removed without inserting a space, which may fuse
# the last word of one line with the first word of the next (compare
# "WikipediaJump" in the entity preview further down) - confirm this is intended.
with open('20th_century_wiki.txt', 'r', errors = 'ignore') as wiki_file:
    raw_text = wiki_file.read()
data = ''.join(raw_text.split('\n'))

## Create NER Object

In [3]:
# Download English module (the en_core_web_sm package that spacy.load() reads below)
# NOTE(review): `!python` runs whichever interpreter the shell resolves, which may
# not be the kernel's environment - confirm the package lands where the kernel can import it.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
# Load spacy English module
# Despite its name, NER is the whole loaded pipeline object: calling it on text
# yields a document that exposes sentences (.sents) as well as entities (.ents).
NER = spacy.load("en_core_web_sm")

In [5]:
# Run the loaded pipeline over the full text; `book` is the processed document
# whose sentences and per-sentence entities are read in the next section.
book = NER(data)

## Split Sentence Entities

In [6]:
# For every sentence in the processed document, record the sentence itself
# together with the text of each named entity found inside it.
df_sentences = [
    {"sentence": sentence, "entities": [entity.text for entity in sentence.ents]}
    for sentence in book.sents
]

In [7]:
# Put sentences into df: one row per sentence, with columns 'sentence' and 'entities'.
# Note that this rebinds df_sentences from a list of dicts to a DataFrame.
df_sentences = pd.DataFrame(df_sentences)

In [8]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"(Key, events, of, the, 20th, century, -, Wikip...","[the 20th century - WikipediaJump, Contribute,..."
1,"(depression1.2.2The, rise, of, dictatorship1.3...","[World War II, Pacific1.3.7.1Background1.3.8Ja..."
2,"(begins1.4The, post, -, war, world1.4.1The, en...","[Cold War, 1947–1991)1.4.3War]"
3,"(race1.4.5The, end, of, the, Cold, War1.4.6Inf...","[the Cold War1.4.6Information, 20th, pageGet, ..."
4,"(The, World, Wars, sparked, tension, between, ...","[the Cold War, the Space Race]"
5,"(These, advancements, have, played, a, signifi...","[the 21st century, today]"
6,"(Events, in, the, 20th, century[edit]The, worl...",[the 20th]
7,"(Edwardian, eraThe, new, beginning, of, the, 2...","[Edwardian, the 20th century]"
8,"(The, 1900s, saw, the, decade, herald, a, seri...","[The 1900s, the decade]"
9,"(From, 1914, to, 1918, ,, the, First, World, W...","[1914 to 1918, the First World War]"


## Filter Entities for Country Names Only

In [9]:
# Load country names; the first CSV column is used as the row index (index_col = 0)
country_df = pd.read_csv('countries_list_20th_century_1.5.csv', index_col = 0)

In [10]:
# Remove spaces before and after country names, so that the exact string
# comparison against entity text in filter_entity() is not defeated by padding
country_df['country_name'] = country_df['country_name'].str.strip()

In [11]:
country_df.head(5)

Unnamed: 0,country_name
1,Afghanistan
2,Albania
3,Algeria
4,Andorra
5,Angola


In [12]:
# Function to filter out entities not of interest
def filter_entity(ent_list, country_df):
    """Return the entities from ``ent_list`` that are known country names.

    Parameters
    ----------
    ent_list : list of str
        Entity texts extracted from one sentence.
    country_df : pandas.DataFrame
        Frame with a 'country_name' column holding the accepted names.

    Returns
    -------
    list of str
        The entities that exactly match a value in
        ``country_df['country_name']``, in their original order and with
        repeated mentions preserved.
    """
    # Build the lookup once per call rather than once per entity; set
    # membership also avoids a linear scan of all country names per test.
    country_names = set(country_df['country_name'])
    return [ent for ent in ent_list if ent in country_names]

In [13]:
# Check that it works
filter_entity(["Germany", "CF", "2"], country_df)

['Germany']

In [14]:
# For each sentence, keep only the entities that exactly match a listed country name
df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, country_df))

In [15]:
df_sentences['country_entities'].head(20)

0                                                []
1                                                []
2                                                []
3                                                []
4                                                []
5                                                []
6                                                []
7                                                []
8                                                []
9                                                []
10                                               []
11                                               []
12                                               []
13                                               []
14                          [France, Italy, Russia]
15    [Germany, Austria, Hungary, Bulgaria, Russia]
16                                [Germany, Russia]
17                                        [Germany]
18                                        [Germany]
19          

In [16]:
# Keep only the sentences that mention at least one country
mentions_country = df_sentences['country_entities'].map(len) > 0
df_sentences_filtered = df_sentences[mentions_country]

In [17]:
df_sentences_filtered.head(5)

Unnamed: 0,sentence,entities,country_entities
14,"(Interwoven, alliances, ,, an, increasing, arm...","[Europe, Allies, The Triple Entente, British E...","[France, Italy, Russia]"
15,"(Germany, ,, Austria, -, Hungary, ,, Bulgaria,...","[Germany, Austria, Hungary, Bulgaria, the Otto...","[Germany, Austria, Hungary, Bulgaria, Russia]"
16,"(The, Bolsheviks, negotiated, the, Treaty, of,...","[the Treaty of Brest-Litovsk, Germany, Russia]","[Germany, Russia]"
17,"(In, the, treaty, ,, Bolshevik, Russia, ceded,...","[Bolshevik Russia, Baltic, Germany, Kars Oblas...",[Germany]
18,"(It, also, recognized, the, independence, of, ...","[Germany, Allied, American]",[Germany]


## Create Relationships Dataframe

In [18]:
# Defining relationships

# Countries mentioned inside the same sliding window of sentences are treated
# as related: each mention is linked to the mention that follows it.
relationships = []  # one {'source': ..., 'target': ...} dict per linked pair

# Guard: an empty frame has no last label, so index[-1] would raise IndexError
if not df_sentences_filtered.empty:
    last_label = df_sentences_filtered.index[-1]  # looked up once, not per iteration

    for start in range(last_label):
        # .loc slices by index label and includes both ends, so a window covers
        # labels start..start+5, clipped at the last label
        stop = min(start + 5, last_label)
        window = df_sentences_filtered.loc[start:stop, 'country_entities']

        # Flatten the per-sentence lists into one ordered list of mentions
        mentions = [country for sentence_countries in window
                    for country in sentence_countries]

        # Remove duplicated countries that are next to each other,
        # so a country is never linked to itself
        deduped = [country for pos, country in enumerate(mentions)
                   if pos == 0 or country != mentions[pos - 1]]

        # Link every remaining mention to its successor (no pairs if fewer than two)
        for source, target in zip(deduped, deduped[1:]):
            relationships.append({'source': source, 'target': target})

In [19]:
# One row per source/target pair collected above (columns: 'source', 'target')
relationship_df = pd.DataFrame(relationships)

In [20]:
relationship_df.head()

Unnamed: 0,source,target
0,France,Italy
1,Italy,Russia
2,France,Italy
3,Italy,Russia
4,Russia,Germany


In [21]:
# Sort the two names within each row so that a->b and b->a end up as the same pair
sorted_pairs = np.sort(relationship_df.values, axis = 1)
relationship_df = pd.DataFrame(sorted_pairs, columns = relationship_df.columns)
relationship_df.head(5)

Unnamed: 0,source,target
0,France,Italy
1,Italy,Russia
2,France,Italy
3,Italy,Russia
4,Germany,Russia


In [22]:
# Summarize the interactions: give every pair a weight of 1, then add the
# weights up per (source, target), keeping pairs in first-seen order
relationship_df = (
    relationship_df
    .assign(value = 1)
    .groupby(['source', 'target'], sort = False, as_index = False)
    .sum()
)

In [23]:
relationship_df.head(10)

Unnamed: 0,source,target,value
0,France,Italy,10
1,Italy,Russia,6
2,Germany,Russia,26
3,Austria,Germany,17
4,Austria,Hungary,6
5,Bulgaria,Hungary,6
6,Bulgaria,Russia,6
7,Germany,Italy,26
8,Germany,Spain,1
9,France,Spain,1


In [24]:
# Save the weighted pair list (source, target, value) to CSV in the working directory
relationship_df.to_csv('20th_century_relationship.csv')