In [1]:
#all imports
# from bokeh.plotting import figure, output_file, show
import gzip
import io
import pandas as pd
import urllib.request

%matplotlib inline

import matplotlib # plotting
import matplotlib.pyplot as plt

import sqlite3

%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

#import pretty print

from pprint import pprint

In [2]:
# Get the Latin Library corpus

from cltk.corpus.latin import latinlibrary

In [3]:
# Imports for text preprocessing

import re # Regex module, useful for pattern matching
import html # Useful for handling entities

# Import/load a CLTK tool for normalizing i/j and u/v in Latin texts
from cltk.stem.latin.j_v import JVReplacer
replacer = JVReplacer()

In [4]:
# Set up CLTK Latin word tokenizer

from cltk.tokenize.word import WordTokenizer
word_tokenizer = WordTokenizer('latin')

In [5]:
# We need to import a data model to train the lemmatizer.

import os
from cltk.utils.file_operations import open_pickle

# Set up training sentences

rel_path = os.path.join('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
path = os.path.expanduser(rel_path)

# Check for presence of latin_pos_lemmatized_sents
file = 'latin_pos_lemmatized_sents.pickle'      

latin_pos_lemmatized_sents_path = os.path.join(path, file)
if os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
else:
    latin_pos_lemmatized_sents = []
    print('The file %s is not available in cltk_data' % file)

In [6]:
# Set up CLTK Latin backoff lemmatizer

from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
lemmatizer = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)

In [9]:
from collections import Counter

#importing table-generating tools
from collections import OrderedDict
from datetime import date

from IPython.display import display, HTML

In [10]:
#begin Martial file processing

In [11]:
#files in LL

ll_files = latinlibrary.fileids()
print(ll_files[:10])

['12tables.txt', '1644.txt', 'abbofloracensis.txt', 'abelard/dialogus.txt', 'abelard/epistola.txt', 'abelard/historia.txt', 'addison/barometri.txt', 'addison/burnett.txt', 'addison/hannes.txt', 'addison/machinae.txt']


In [12]:
#Martial data from LL
#find Martial files

martial_files = [file for file in ll_files if 'martial/' in file] 
martial_files = [file for file in martial_files if not 'spec' in file]
print(martial_files)

['martial/mart1.txt', 'martial/mart10.txt', 'martial/mart11.txt', 'martial/mart12.txt', 'martial/mart13.txt', 'martial/mart14.txt', 'martial/mart2.txt', 'martial/mart3.txt', 'martial/mart4.txt', 'martial/mart5.txt', 'martial/mart6.txt', 'martial/mart7.txt', 'martial/mart8.txt', 'martial/mart9.txt']


In [13]:
#making Martial file into a string

martial_raw = latinlibrary.raw(martial_files)
print(martial_raw[:200])

Martial I
		 

		 
		 
	 
	
 

 M. VALERI MARTIALIS EPIGRAMMATON LIBER I
 

 

 
 Prologus 
 

 
1. Spero me secutum in libellis meis tale temperamentum ut de illis queri non possit quisquis de se ben


In [14]:
# remove headings from the text

def preprocess(text):
    
    remove_list = [r'\bThe Latin Library\b',
                   r'\bThe Classics Page\b',
                   r'\bMartial\b',
                   r'\bM. VALERI MARTIALIS EPIGRAMMATON LIBER I\b',
                   r'8a', r'8b', r'VIIIA', r'VIIIB',
                   r'\bPrologus\b',
                   r'\bMartial Book I\b',
                   r'\bMartial',
                   r'I II III IV V VI VII VIII IX X',
                   r'25a', r'25b'
                  ]
    
    for pattern in remove_list:
        text = re.sub(pattern, '', text)
        
    # Remove html entities and related html artifacts
    
    text = html.unescape(text) # Handle html entities
    text = re.sub(r'&nbsp;?', ' ',text) #&nbsp; stripped incorrectly in corpus?
    text = re.sub(r'\x00',' ',text) #Another space problem?
    text = re.sub(r' \xa0 ', '    ', text)
    
    
    # Lowercase text
    text = text.lower()
    
   # Normalize text
    text = replacer.replace(text) #Normalize u/v & i/j
    
    # Remove punctuation with translate
    punctuation ="\"#$%&\'()+,-/:;<=>@[\]^_`{|}~.?!«»—"
    translator = str.maketrans({key: " " for key in punctuation})
    text = text.translate(translator)
    
    # Remove numbers
    translator = str.maketrans({key: " " for key in '0123456789'})
    text = text.translate(translator)
    
    # Handle spacing
    text = re.sub('[ ]+',' ', text) # Remove double spaces
    text = re.sub('\t',' ', text) # Remove tabs
    text = re.sub('^\s+','', text)
    text = re.sub(' \n', '\n', text)
    text = re.sub('\n\n', '~', text)
    text = re.sub('~+', '\n\n', text)
    
    return text

In [15]:
# Preprocess texts

martial_edit = preprocess(martial_raw)

print (martial_edit[:200])

i
  

  
  
 

 spero me secutum in libellis meis tale temperamentum ut de illis queri non possit quisquis de se bene senserit cum salua infirmarum quoque personarum reuerentia ludant quae adeo antiqu


In [16]:
#tokenizing individual words in the text

martial_tokens = word_tokenizer.tokenize (martial_edit)

print (martial_tokens[:50])

['i', 'spero', 'me', 'secutum', 'in', 'libellis', 'meis', 'tale', 'temperamentum', 'ut', 'de', 'illis', 'queri', 'non', 'possit', 'quisquis', 'de', 'se', 'bene', 'senserit', 'cum', 'salua', 'infirmarum', 'quoque', 'personarum', 'reuerentia', 'ludant', 'quae', 'adeo', 'antiquis', 'auctoribus', 'defuit', 'ut', 'nominibus', 'non', 'tantum', 'ueris', 'abusi', 'sint', 'sed', 'et', 'magnis', 'mihi', 'fama', 'uilius', 'constet', 'et', 'probetur', 'in', 'me']


In [19]:
#lemmatizing individual words, i.e. finding their dictionary entry, for easier comparison

martial_lemmas = lemmatizer.lemmatize(martial_tokens)
print(martial_lemmas[:50])

[('i', 'eo'), ('spero', 'spero'), ('me', 'ego'), ('secutum', 'sequor'), ('in', 'in'), ('libellis', 'libellus'), ('meis', 'meus'), ('tale', 'tal'), ('temperamentum', 'temperamentum'), ('ut', 'ut'), ('de', 'de'), ('illis', 'ille'), ('queri', 'queror'), ('non', 'non'), ('possit', 'possum'), ('quisquis', 'quisquis'), ('de', 'de'), ('se', 'sui'), ('bene', 'bene'), ('senserit', 'sentio'), ('cum', 'cum2'), ('salua', 'saluus'), ('infirmarum', 'infirmaris'), ('quoque', 'quoque'), ('personarum', 'personaris'), ('reuerentia', 'reverendus'), ('ludant', 'ludo'), ('quae', 'qui'), ('adeo', 'adeo'), ('antiquis', 'antiquus'), ('auctoribus', 'auctor'), ('defuit', 'desum'), ('ut', 'ut'), ('nominibus', 'nomen'), ('non', 'non'), ('tantum', 'tantus'), ('ueris', 'verus'), ('abusi', 'abutor'), ('sint', 'sum'), ('sed', 'sed'), ('et', 'et'), ('magnis', 'magnus'), ('mihi', 'ego'), ('fama', 'fama'), ('uilius', 'vilis'), ('constet', 'consto'), ('et', 'et'), ('probetur', 'probo'), ('in', 'in'), ('me', 'ego')]


In [20]:
#finding each occurrence of the word 'futuo', and printing the form used in the text

f = [item[0] for item in martial_lemmas if item[1] == 'futuo']
print(set(f))

{'futuente', 'futuisse', 'futuant', 'futuit', 'futuis', 'futuat', 'futuam', 'fututa', 'futui', 'fututam'}


In [22]:
#counting the number of occurrences of each form

fcount = Counter(f)
print(fcount)

Counter({'futuit': 11, 'futui': 10, 'futuis': 8, 'fututam': 5, 'futuat': 4, 'futuisse': 3, 'futuam': 3, 'fututa': 1, 'futuant': 1, 'futuente': 1})


In [23]:
#total word count in the whole text

martial_wordcount = len(martial_lemmas)
print(martial_wordcount)

59110


In [24]:
#total times 'futuo' occurrs in the text

totalf = sum(fcount.values())
print(totalf)

47


In [25]:
#martial table of occurrences of 'futuo'

martialfcount = OrderedDict([ ('futuo forms', ['futuit', 'futui', 'futuis', 'fututam', 'futuat', 'futuisse', 'futuam', 'fututa', 'futuant', 'futuente', ' ', 'total count', 'total tokens', '% of total']), 
                            ('occurrences', [11, 10, 8, 5, 4, 3, 3, 1, 1, 1, '', 47, 59110, .08]) ] )
    
df1 = pd.DataFrame.from_dict(martialfcount)
display(df1)

Unnamed: 0,futuo forms,occurrences
0,futuit,11.0
1,futui,10.0
2,futuis,8.0
3,fututam,5.0
4,futuat,4.0
5,futuisse,3.0
6,futuam,3.0
7,fututa,1.0
8,futuant,1.0
9,futuente,1.0


In [26]:
#begin processing Catullus data

In [27]:
from cltk.corpus.latin import latinlibrary
files = latinlibrary.fileids()
catullus_raw = latinlibrary.raw('catullus.txt')

In [28]:
print(catullus_raw[:300])

Catullus
		 
		 
		 
		 
		 
	 
 

 C. VALERIVS CATVLLVS 
 
 1 2 2b 3 4 5  6 7 8 9 10 11 12 13 14 14b 15 16 17 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 58b 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 


In [29]:
catullus_edit = catullus_raw # Make a copy of the list

# 1. Make the whole text lowercase
# Use 'lower' string method

catullus_edit = catullus_edit.lower()

# 2. Remove punctuation
# Use 'translate'

from string import punctuation

translator = str.maketrans({key: " " for key in punctuation})
catullus_edit = catullus_edit.translate(translator)

# 3. Remove numbers
# Again, use 'translate'

translator = str.maketrans({key: " " for key in '0123456789'})
catullus_edit = catullus_edit.translate(translator)

# 4. Normalize u/v
# Use CLTK 'JVReplacer'

from cltk.stem.latin.j_v import JVReplacer
replacer = JVReplacer()

catullus_edit = replacer.replace(catullus_edit)

# 5. Remove English words that appear in our plaintext file
# Use 'replace'

remove_list = ['the', 'latin', 'library', 'classics', 'page']
remove_dict = {key: ' ' for key in remove_list}

for k, v in remove_dict.items():
    catullus_edit = catullus_edit.replace(k,v)

In [30]:
#checking the edited text

print(catullus_edit[469:599])

cui dono lepidum nouum libellum 
arida modo pumice expolitum  
corneli  tibi  namque tu solebas 
meas esse aliquid putare nugas


In [31]:
#tokenizing 

catullus_tokens = word_tokenizer.tokenize (catullus_edit)

print (catullus_tokens[:100])

['catullus', 'c', 'ualerius', 'catullus', 'b', 'b', 'b', 'b', 'i', 'ad', 'cornelium', 'cui', 'dono', 'lepidum', 'nouum', 'libellum', 'arida', 'modo', 'pumice', 'expolitum', 'corneli', 'tibi', 'namque', 'tu', 'solebas', 'meas', 'esse', 'aliquid', 'putare', 'nugas', 'iam', 'tum', 'cum', 'ausus', 'es', 'unus', 'italorum', 'omne', 'aeuum', 'tribus', 'explicare', 'cartis', 'doctis', 'iuppiter', 'et', 'laboriosis', 'quare', 'habe', 'tibi', 'quidquid', 'hoc', 'libelli—', 'qualecum', '-que', 'quod', 'o', 'patrona', 'uirgo', 'plus', 'uno', 'maneat', 'perenne', 'saeclo', 'ii', 'fletus', 'passeris', 'lesbiae', 'passer', 'deliciae', 'meae', 'puellae', 'cum', 'qui', 'ludere', 'quem', 'in', 'sinu', 'tenere', 'cui', 'primum', 'digitum', 'dare', 'appetenti', 'et', 'acris', 'solet', 'incitare', 'morsus', 'cum', 'desiderio', 'meo', 'nitenti', 'carum', 'nescio', 'quid', 'lubet', 'iocari', 'et', 'solaciolum', 'sui']


In [32]:
#lemmatizing

catullus_lemmas = lemmatizer.lemmatize(catullus_tokens)
print(catullus_lemmas[:50])

[('catullus', 'catullus'), ('c', '-que'), ('ualerius', 'ualerius'), ('catullus', 'catullus'), ('b', 'b'), ('b', 'b'), ('b', 'b'), ('b', 'b'), ('i', 'eo'), ('ad', 'ad'), ('cornelium', 'cornelius'), ('cui', 'qui'), ('dono', 'donum'), ('lepidum', 'lepidus'), ('nouum', 'nouus'), ('libellum', 'libellus'), ('arida', 'aridus'), ('modo', 'modo'), ('pumice', 'pumex'), ('expolitum', 'expolio'), ('corneli', 'cornelis'), ('tibi', 'tu'), ('namque', 'namque'), ('tu', 'tu'), ('solebas', 'soleo'), ('meas', 'meus'), ('esse', 'sum'), ('aliquid', 'aliquis'), ('putare', 'putaris'), ('nugas', 'nugae'), ('iam', 'iam'), ('tum', 'tum'), ('cum', 'cum2'), ('ausus', 'audeo'), ('es', 'sum'), ('unus', 'unus'), ('italorum', 'italorum'), ('omne', 'omnis'), ('aeuum', 'aeuum'), ('tribus', 'tres'), ('explicare', 'explicaris'), ('cartis', 'cartis'), ('doctis', 'doceo'), ('iuppiter', 'iuppiter'), ('et', 'et'), ('laboriosis', 'laboriosus'), ('quare', 'quare'), ('habe', 'habeo'), ('tibi', 'tu'), ('quidquid', 'quisquis')]


In [33]:
#counting the number of occurrences of 'futuo'

f2 = [item[0] for item in catullus_lemmas if item[1] == 'futuo']
print(set(f2))

{'futuit'}


In [34]:
f2count = Counter(f2)
print(f2count)

Counter({'futuit': 2})


In [35]:
#total word count of Catullus corpus

catullus_wordcount = len(catullus_lemmas)
print(catullus_wordcount)

13503


In [36]:
#Catullus table of occurrences of 'futuo'

catullusfcount = OrderedDict([ ('futuo forms', ['futuit', ' ', 'total count', 'total tokens', '% of total']), 
                            ('occurrences', [2, '', 2, 13503, .02]) ] )
    
df2 = pd.DataFrame.from_dict(catullusfcount)
display(df2)

Unnamed: 0,futuo forms,occurrences
0,futuit,2.0
1,,
2,total count,2.0
3,total tokens,13503.0
4,% of total,0.02


In [None]:
#begin Graffiti text processing

In [40]:
#read CSV files from graffiti archives

ep = pd.read_csv("/Users/erichensley1/Desktop/DH final project/ep.csv")
print(ep)

    CIL number  page                                        Inscription
0         5092    19  amoris ignes si sentires, mulio, magis propera...
1         1796    20  amplexus teneros hac si quis quaerit in urbe, ...
2         2146    21  vibius Restitutus hic solus dormivit et Urbana...
3        10241    21  Primigenae Nucerinae salutem / vellem essem ge...
4          NaN    22  felicem somnum qui tecum nocte quiescet? hoc e...
5         2483    24    Mansuetus provacator victor Veneri permam feret
6         4007    24  tu, pupa, sic valeas, sic habeas Venere Pompei...
7          538    25   abiat Venere Bompeiiana iratam qui hoc laesaerit
8         1410    25  Venus enim plagiaria est; quia exsanguni meum ...
9         1839    27  Agato Herrenni servus rogat Venere ut periat rogo
10        1824    27  quisquis amat, veniat. Veneri volo frangere co...
11        3691    29  non ego tam duco Venere de marmore factam secu...
12        1625    30                                    Venus es

In [42]:
#setting the 'inscription' column to a variable

inscriptions = ep["Inscription"]

In [43]:
#combining the data into a list

inscriptions_edit = [preprocess(item) for item in list(ep['Inscription'])]
print(inscriptions_edit[0])

amoris ignes si sentires mulio magis properares ut uideres uenerem diligo iuuenem uenustum rogo punge iamus bibisti iamus prende lora et excute pompeios defer ubi dulcis est amor meus es 


In [45]:
#graffiti table of occurrences of 'futuo'

martialfcount = OrderedDict([ ('futuo forms', ['futuit', 'futui', 'futua', 'futuebatur', 'futuitur', 'futuisse', 'fututu', 'perfutuor', 'difutuisti', 'futuet', 'futuimus', 'futues', 'fututa sum', 'futue', 'futuere', ' ', 'total count', 'total tokens', '% of total']), 
                            ('occurrences', [4, 4, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, '', 23, 1255, 1.83]) ] )
    
df3 = pd.DataFrame.from_dict(martialfcount)
display(df3)

Unnamed: 0,futuo forms,occurrences
0,futuit,4.0
1,futui,4.0
2,futua,2.0
3,futuebatur,2.0
4,futuitur,1.0
5,futuisse,1.0
6,fututu,1.0
7,perfutuor,1.0
8,difutuisti,1.0
9,futuet,1.0
