In [21]:
# Walk through the sentence one character at a time, showing each character
# next to its zero-based offset in the string.
text = "Siberia has many rivers."
for position, letter in enumerate(text):
    print(f"{position} {letter}")

0 S
1 i
2 b
3 e
4 r
5 i
6 a
7  
8 h
9 a
10 s
11  
12 m
13 a
14 n
15 y
16  
17 r
18 i
19 v
20 e
21 r
22 s
23 .


In [3]:
# Look up the character offset at which a word starts inside the sentence.
text = "Siberia has many rivers."
search_term = "rivers"
text.find(search_term)

17

In [4]:
# Natural language processing in German: split a sentence into tokens with
# spaCy's German language class and list each token next to its index.
from spacy.lang.de import German

nlp = German()
doc = nlp("Berlin ist eine Stadt in Deutschland.")
for tok in doc:
    print(f"{tok.i} {tok.text}")

0 Berlin
1 ist
2 eine
3 Stadt
4 in
5 Deutschland
6 .


In [None]:
# Load and print the gazetteer: the list of known place names, one per line.
from pathlib import Path

# Decode explicitly as UTF-8 so names containing umlauts are read the same way
# on every platform instead of depending on the system default encoding.
gazetteer_raw = Path("gazetteer.txt").read_text(encoding="utf-8")

# splitlines() handles both "\n" and "\r\n" line endings. Surrounding
# whitespace is stripped and blank lines (such as the one a trailing newline
# produces with split("\n")) are dropped, so no empty place name is passed on
# to the matcher.
gazetteer = [line.strip() for line in gazetteer_raw.splitlines() if line.strip()]

print(gazetteer)

In [6]:
# Matching place names
# Find places in the text that occur in the gazetteer (our database)
from spacy.lang.de import German
from spacy.matcher import Matcher

nlp = German()

doc = nlp("Karl-Heinz Quade ist von März 1944 bis August 1948 im Lager 150 in Grjasowez interniert.")

matcher = Matcher(nlp.vocab)
for place in gazetteer:
    # Tokenize the place name with the same tokenizer that is applied to the
    # texts and emit one case-insensitive entry per token. A name that the
    # tokenizer splits into several tokens can then be matched as a sequence;
    # a single {'LOWER': <whole name>} entry is compared against one token only
    # and would never match such a name.
    pattern = [
        {'LOWER': token.lower_}
        for token in nlp.make_doc(place)
        if not token.is_space
    ]
    if not pattern:
        # Blank gazetteer entries (e.g. from a trailing newline in the file)
        # produce no tokens; there is nothing to register for them.
        continue
    matcher.add(place, [pattern])

# Each match is (match_id, start, end); start/end are token offsets into doc.
matches = matcher(doc)
for match_id, start, end in matches:
    print(start, end, doc[start:end].text)

13 14 Grjasowez


In [7]:
# Case-insensitive token pattern
# Match the word "lager" (compared on its lowercase form) followed by a number

lager_token = {'LOWER': 'lager'}   # first token: lowercase form equals "lager"
number_token = {'LIKE_NUM': True}  # second token: looks like a number
pattern = [lager_token, number_token]

# Register the pattern on the existing matcher, next to the place-name patterns
matcher.add("LAGER_PATTERN", [pattern])

# Re-run the matcher; start/end are token offsets into doc
matches = matcher(doc)
for rule_id, first, last in matches:
    print(first, last, doc[first:last].text)

10 12 Lager 150
13 14 Grjasowez


In [8]:
# Loading text files

# NOTE(review): machine-specific absolute path -- point this at your own copy
# of the text files (ideally a path relative to the notebook) before running.
TEXT_DIR = Path('/Users/finntekverk/Desktop/everything/Tufts/Classes/Senior Year/First Semester/Digital Humanities/DHCode/ProgrammingHistorian/PH2/textfiles')

# Print the file name, the token offsets and the text of every gazetteer /
# pattern match found in each file.
# The listing is sorted so the processing order (and therefore the `doc` and
# `matches` left behind after the loop) is the same on every run.
for file in sorted(TEXT_DIR.iterdir()):
    # Skip sub-directories and hidden files such as .DS_Store: read_text()
    # would raise on them and they are not part of the corpus.
    if not file.is_file() or file.name.startswith('.'):
        continue
    doc = nlp(file.read_text(encoding="utf-8"))
    matches = matcher(doc)
    for match_id, start, end in matches:
        print(file.name, start, end, doc[start:end].text)

gazetteerabridged.txt 0 1 Armenien
gazetteerabridged.txt 2 3 Aserbaidshan
gazetteerabridged.txt 4 5 Aserbaidshen
gazetteerabridged.txt 6 7 Estland
gazetteerabridged.txt 8 9 Georgien
gazetteerabridged.txt 10 11 Kasachstan
gazetteerabridged.txt 12 13 Kirgisien
gazetteerabridged.txt 14 15 Lettland
gazetteerabridged.txt 16 17 Litauen
gazetteerabridged.txt 18 19 Moldawien


In [9]:
# Term Frequency
# Tally how often each matched phrase occurs and show the most frequent ones.
# `doc` and `matches` are whatever earlier cells left in the kernel; when the
# cells run in order that is the last file handled by the file loop, so the
# tally covers that one file only.

from collections import Counter

# Collect the text of every matched span
count_list = [doc[start:end].text for match_id, start, end in matches]

counter = Counter(count_list)

# Show the ten most frequent phrases with their counts
for term, count in counter.most_common(10):
    print(term, count)

Armenien 1
Aserbaidshan 1
Aserbaidshen 1
Estland 1
Georgien 1
Kasachstan 1
Kirgisien 1
Lettland 1
Litauen 1
Moldawien 1


In [10]:
# Named Entity Recognition
import spacy

# Load a pretrained German pipeline (trained on German newspaper articles)
nlp = spacy.load("de_core_news_sm")

# List every recognised entity with its label (PER = person, LOC = location)
# and the token offsets at which it starts and ends
doc = nlp("Karl-Heinz Quade ist von März 1944 bis August 1948 im Lager 150 in Grjasowez interniert.")
for entity in doc.ents:
    print(f"{entity.text} {entity.label_} {entity.start} {entity.end}")

Karl-Heinz Quade PER 0 2
Grjasowez LOC 13 14


In [11]:
# Show the text in a display format with the recognised entities highlighted

from spacy import displacy

# displacy.render is used here instead of displacy.serve(doc, style="ent")
# because the latter produced a warning.
displacy.render(
    doc,
    style="ent",
)

In [12]:
# Draw the dependency view of the sentence: the relations between its words
# as predicted by the statistical model

displacy.render(doc, style="dep", jupyter=True)

In [13]:
# from pathlib import Path

# svg = displacy.render(doc, style="dep")
# output_path = Path("sentence.svg")
# output_path.write_text(svg)

In [14]:
# Named Entity Linking

# Connects a person/place mention to a specific record in a knowledge base
import spacy

nlp = spacy.load('de_core_news_sm')

# Add the DBpedia Spotlight component to the pipeline, configured for German
spotlight_config = {'language_code': 'de'}
nlp.add_pipe('dbpedia_spotlight', config=spotlight_config)

# doc = nlp("Karl-Heinz Quade ist von März 1944 bis August 1948 im Lager 150 in Grjasowez interniert.")
# for ent in doc.ents:
#     print(ent.text, ent.label_, ent.kb_id_)


<spacy_dbpedia_spotlight.entity_linker.EntityLinker at 0x12cb0ff70>

In [None]:
import requests

# Fetch the German DBpedia record for Grjasowez as JSON.
# The timeout keeps the cell from hanging indefinitely if the server does not
# answer, and raise_for_status() turns an HTTP error response into a clear
# exception instead of a confusing JSON decoding failure further down.
response = requests.get("http://de.dbpedia.org/data/Grjasowez.json", timeout=30)
response.raise_for_status()
data = response.json()

print(data)

In [16]:
# Export Our Data
# Write the place names found in `doc` (gazetteer/pattern matches plus entities
# labelled as locations) to a tab-separated file named after the source.

# NOTE(review): the original note here said YYYY-MM-DD, but only a year is
# supplied -- confirm which date format the consumer of this file expects.
start_date = "1800"
end_date = "2000"
source_title = "Karl-Heinz Quade Diary"

column_header = "id\ttitle\ttitle_source\tstart\tend\n"

# Re-run the matcher on the current `doc`. The `matches` left in the kernel by
# earlier cells hold token offsets for a different document (the last text file
# that was processed); slicing this `doc` with them yields unrelated or empty
# spans instead of place names.
# NOTE(review): `matcher` was built from another pipeline's vocab than the one
# that produced `doc` -- confirm the matches look right for your spaCy version.
matches = matcher(doc)

places_list = [doc[start:end].text for match_id, start, end in matches]
places_list.extend(ent.text for ent in doc.ents if ent.label_ in ("GPE", "LOC"))

# Remove duplicate and empty names. Sorting gives every place the same row id
# on every run; iterating over a bare set has no guaranteed order.
unique_places = sorted({place for place in places_list if place.strip()})

rows = [
    f"{row_id}\t{place}\t{source_title}\t{start_date}\t{end_date}\n"
    for row_id, place in enumerate(unique_places)
]
output_text = column_header + "".join(rows)

filename = source_title.lower().replace(' ', '_') + '.tsv'
# Write as UTF-8 explicitly so non-ASCII place names are stored the same way on
# every platform.
Path(filename).write_text(output_text, encoding="utf-8")
print('created: ', filename)

created:  karl-heinz_quade_diary.tsv
