# DERBi PIE Semantics Module

In [4]:
import numpy as np

In [5]:
from scipy import spatial

In [11]:
from nltk.tokenize import RegexpTokenizer
from collections import Counter

In [22]:
import gutenbergpy.textget

In [89]:
import matplotlib.pyplot as plt

In [43]:
def getbook(book, outfile=None):
  """
  Download a book from Project Gutenberg and save it to disk.

  The text is fetched with gutenbergpy.textget.get_text_by_id, passed
  through gutenbergpy.textget.strip_headers, and the result is written
  to `outfile` in binary mode.

  Parameters
  ----------
  book : Project Gutenberg ID of the book to download.
  outfile : path to write to. When omitted (or otherwise falsy) the
    book is saved as '<book>.txt' relative to the current working
    directory, and the chosen name is printed.
  """
  print(f"Downloading Project Gutenberg ID {book}")
  raw_book = gutenbergpy.textget.get_text_by_id(book)
  clean_book = gutenbergpy.textget.strip_headers(raw_book)
  if not outfile:
    outfile = f'{book}.txt'
    print(f"Saving book as {outfile}")
  # The with-statement closes the file on exit; no explicit close() needed.
  with open(outfile, 'wb') as file:
    file.write(clean_book)

In [71]:
# Project Gutenberg IDs, local file paths, and display titles per author.
# The three lists for an author are parallel: position i in each list
# refers to the same book.

# Single place to change where the downloaded texts live; `{}` is filled
# with the Project Gutenberg ID.
TEXT_PATH_TEMPLATE = "C:/Users/gpwal/derbipie-semantics/texts/{}.txt"

caesar_ids = [218, 18837, 29645]
caesar_files = [TEXT_PATH_TEMPLATE.format(x) for x in caesar_ids]
caesar_titles = ["Bello Gallico I-IV", "Bello Gallico V-VII", "The Gate to Caesar"]
cicero_ids = [14970, 226, 47001]
cicero_files = [TEXT_PATH_TEMPLATE.format(x) for x in cicero_ids]
cicero_titles = ["Academia", "Orations", "De Officiis"]

In [68]:
# Download every text to the path already recorded for it in
# caesar_files / cicero_files (Caesar first, then Cicero). A plain loop is
# used because getbook is called only for its side effect.
for book_id, out_path in zip(caesar_ids + cicero_ids, caesar_files + cicero_files):
  getbook(book_id, out_path)

Downloading Project Gutenberg ID 218
Downloading Project Gutenberg ID 18837
Downloading Project Gutenberg ID 29645
Downloading Project Gutenberg ID 14970
Downloading Project Gutenberg ID 226
Downloading Project Gutenberg ID 47001


In [75]:
def get_unigram_counts(path):
  """
    Count the unigrams in the UTF-8 text file at `path`.

    The text is lower-cased, newlines are turned into spaces, and the
    result is split into tokens matching the regular expression \\w+.
    Returns a Counter mapping each token to its number of occurrences.
  """
  with open(path, 'r', encoding="utf-8") as handle:
    contents = handle.read()
  flattened = contents.replace("\n", " ").lower()
  tokenizer = RegexpTokenizer(r"\w+")
  return Counter(tokenizer.tokenize(flattened))

In [76]:
# Map each title to the unigram Counter of its text file; titles and files
# are paired by position.
caesar_words = dict(zip(caesar_titles, map(get_unigram_counts, caesar_files)))
cicero_words = dict(zip(cicero_titles, map(get_unigram_counts, cicero_files)))

In [80]:
# Bare expression: it is evaluated, but nothing is shown for it because
# only the last expression of a cell is echoed.
caesar_words.keys()
# The ten most frequent tokens in the first Caesar text.
caesar_words["Bello Gallico I-IV"].most_common(10)

[('in', 509),
 ('et', 479),
 ('ad', 317),
 ('cum', 244),
 ('quod', 224),
 ('se', 196),
 ('ex', 194),
 ('qui', 172),
 ('ut', 171),
 ('non', 170)]

In [81]:
def get_term_count(book_dict, term):
  """
    Look up `term` in every book of `book_dict`.

    `book_dict` maps a book key to a per-book mapping of term -> count.
    Returns a list with one entry per book, in the iteration order of
    `book_dict`, holding the value stored under `term` for that book.
  """
  counts = []
  for title in book_dict:
    counts.append(book_dict[title][term])
  return counts

In [82]:
# Each row is the term label followed by its count in every Caesar text and
# then every Cicero text.
ego = ["ego", *get_term_count(caesar_words, "ego"), *get_term_count(cicero_words, "ego")]
magnus = ["magnus", *get_term_count(caesar_words, "magnus"), *get_term_count(cicero_words, "magnus")]

In [84]:
# Show both term rows, one per line.
print(ego, magnus, sep="\n")

['ego', 1, 4, 1, 29, 61, 10]
['magnus', 0, 10, 0, 4, 0, 5]


In [91]:
# One 2-d vector per book: [count of "ego", count of "magnus"], transcribed
# from the printed rows
#   ['ego', 1, 4, 1, 29, 61, 10]
#   ['magnus', 0, 10, 0, 4, 0, 5]
# whose columns follow caesar_titles and then cicero_titles.
# Fixed: the "magnus" counts of the second and third Caesar texts were
# swapped (previously [4, 0] and [1, 10]).
bellogallico1 = np.array([1, 0])
bellogallico2 = np.array([4, 10])
gateofcaesar = np.array([1, 0])
academia = np.array([29, 4])
orations = np.array([61, 0])
deofficiis = np.array([10, 5])

In [97]:
# Dot product of the two Bello Gallico vectors.
dot = bellogallico1.dot(bellogallico2)
print(dot)

4
