Obscene Language in Pompeian Graffiti: 
a text analysis project

by Eric Hensley
ISAW Intro to Digital Humaities (2017)

In [1]:
#imports 1
# from Sebastian's Column Oriented Data tutorial
import gzip
import io
import urllib.request

import sqlite3

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

#import pretty print
from pprint import pprint

1. Taking text data from The Latin Library online
www.thelatinlibrary.com

In [2]:
# Get the Latin Library corpus

from cltk.corpus.latin import latinlibrary

In [3]:
ll_files = latinlibrary.fileids()
print(ll_files[:10]) 

['12tables.txt', '1644.txt', 'abbofloracensis.txt', 'abelard/dialogus.txt', 'abelard/epistola.txt', 'abelard/historia.txt', 'addison/barometri.txt', 'addison/burnett.txt', 'addison/hannes.txt', 'addison/machinae.txt']


2. Assembling a searchable corpus of Martial's epigrams

In [4]:
#Martial data from LL
#find Martial files

martial_files = [file for file in ll_files if 'martial/' in file] 
martial_files = [file for file in martial_files if not 'spec' in file]
print(martial_files)

['martial/mart1.txt', 'martial/mart10.txt', 'martial/mart11.txt', 'martial/mart12.txt', 'martial/mart13.txt', 'martial/mart14.txt', 'martial/mart2.txt', 'martial/mart3.txt', 'martial/mart4.txt', 'martial/mart5.txt', 'martial/mart6.txt', 'martial/mart7.txt', 'martial/mart8.txt', 'martial/mart9.txt']


In [5]:
#making Martial file into a string

martial_raw = latinlibrary.raw(martial_files)
print(martial_raw[:200])

Martial I
		 

		 
		 
	 
	
 

 M. VALERI MARTIALIS EPIGRAMMATON LIBER I
 

 

 
 Prologus 
 

 
1. Spero me secutum in libellis meis tale temperamentum ut de illis queri non possit quisquis de se ben


In [6]:
# Imports for text preprocessing

import re # Regex module, useful for pattern matching
import html # Useful for handling entities

# Import/load a CLTK tool for normalizing i/j and u/v in Latin texts
from cltk.stem.latin.j_v import JVReplacer
replacer = JVReplacer()

In [7]:
# remove headings from the text

def preprocess(text):
    
    remove_list = [r'\bThe Latin Library\b',
                   r'\bThe Classics Page\b',
                   r'\bMartial\b',
                   r'\bM. VALERI MARTIALIS EPIGRAMMATON LIBER I\b',
                   r'8a', r'8b', r'VIIIA', r'VIIIB',
                   r'\bPrologus\b',
                   r'\bMartial Book I\b',
                   r'\bMartial',
                   r'I II III IV V VI VII VIII IX X',
                   r'25a', r'25b'
                  ]
    
    for pattern in remove_list:
        text = re.sub(pattern, '', text)
        
    # Remove html entities and related html artifacts
    
    text = html.unescape(text) # Handle html entities
    text = re.sub(r'&nbsp;?', ' ',text) #&nbsp; stripped incorrectly in corpus?
    text = re.sub(r'\x00',' ',text) #Another space problem?
    text = re.sub(r' \xa0 ', '    ', text)
    
    
    # Lowercase text
    text = text.lower()

    # Normalize text
    text = replacer.replace(text) #Normalize u/v & i/j
    
    # Remove punctuation with translate
    punctuation ="\"#$%&\'()+,-/:;<=>@[\]^_`{|}~.?!«»—"
    translator = str.maketrans({key: " " for key in punctuation})
    text = text.translate(translator)
    
    # Remove numbers
    translator = str.maketrans({key: " " for key in '0123456789'})
    text = text.translate(translator)
    
    # Handle spacing
    text = re.sub('[ ]+',' ', text) # Remove double spaces
    text = re.sub('\t',' ', text) # Remove tabs
    text = re.sub('^\s+','', text)
    text = re.sub(' \n', '\n', text)
    text = re.sub('\n\n', '~', text)
    text = re.sub('~+', '\n\n', text)
    
    return text

In [8]:
# Preprocess texts

martial_edit = preprocess(martial_raw)

print (martial_edit[:200])

i
  

  
  
 

 spero me secutum in libellis meis tale temperamentum ut de illis queri non possit quisquis de se bene senserit cum salua infirmarum quoque personarum reuerentia ludant quae adeo antiqu


In [9]:
# Set up CLTK Latin word tokenizer

from cltk.tokenize.word import WordTokenizer
word_tokenizer = WordTokenizer('latin')

In [10]:
martial_tokens = word_tokenizer.tokenize (martial_edit)

print (martial_tokens[:50])

['i', 'spero', 'me', 'secutum', 'in', 'libellis', 'meis', 'tale', 'temperamentum', 'ut', 'de', 'illis', 'queri', 'non', 'possit', 'quisquis', 'de', 'se', 'bene', 'senserit', 'cum', 'salua', 'infirmarum', 'quoque', 'personarum', 'reuerentia', 'ludant', 'quae', 'adeo', 'antiquis', 'auctoribus', 'defuit', 'ut', 'nominibus', 'non', 'tantum', 'ueris', 'abusi', 'sint', 'sed', 'et', 'magnis', 'mihi', 'fama', 'uilius', 'constet', 'et', 'probetur', 'in', 'me']


In [11]:
# We need to import a data model to train the lemmatizer.

import os
from cltk.utils.file_operations import open_pickle

# Set up training sentences

rel_path = os.path.join('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
path = os.path.expanduser(rel_path)

# Check for presence of latin_pos_lemmatized_sents
file = 'latin_pos_lemmatized_sents.pickle'      

latin_pos_lemmatized_sents_path = os.path.join(path, file)
if os.path.isfile(latin_pos_lemmatized_sents_path):
    latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
else:
    latin_pos_lemmatized_sents = []
    print('The file %s is not available in cltk_data' % file)

In [12]:
# Set up CLTK Latin backoff lemmatizer

from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
lemmatizer = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)

In [13]:
martial_lemmas = lemmatizer.lemmatize(martial_tokens)

In [14]:
print(martial_lemmas)

[('i', 'eo'), ('spero', 'spero'), ('me', 'ego'), ('secutum', 'sequor'), ('in', 'in'), ('libellis', 'libellus'), ('meis', 'meus'), ('tale', 'tal'), ('temperamentum', 'temperamentum'), ('ut', 'ut'), ('de', 'de'), ('illis', 'ille'), ('queri', 'queror'), ('non', 'non'), ('possit', 'possum'), ('quisquis', 'quisquis'), ('de', 'de'), ('se', 'sui'), ('bene', 'bene'), ('senserit', 'sentio'), ('cum', 'cum2'), ('salua', 'saluus'), ('infirmarum', 'infirmaris'), ('quoque', 'quoque'), ('personarum', 'personaris'), ('reuerentia', 'reverendus'), ('ludant', 'ludo'), ('quae', 'qui'), ('adeo', 'adeo'), ('antiquis', 'antiquus'), ('auctoribus', 'auctor'), ('defuit', 'desum'), ('ut', 'ut'), ('nominibus', 'nomen'), ('non', 'non'), ('tantum', 'tantus'), ('ueris', 'verus'), ('abusi', 'abutor'), ('sint', 'sum'), ('sed', 'sed'), ('et', 'et'), ('magnis', 'magnus'), ('mihi', 'ego'), ('fama', 'fama'), ('uilius', 'vilis'), ('constet', 'consto'), ('et', 'et'), ('probetur', 'probo'), ('in', 'in'), ('me', 'ego'), ('nou

In [15]:
#counting the number of times 'futuo' occurs

f = [item[0] for item in martial_lemmas if item[1] == 'futuo']

In [16]:
print(set(f))

{'futuam', 'futuat', 'futuis', 'futuisse', 'fututa', 'futuente', 'futui', 'futuit', 'fututam', 'futuant'}


In [17]:
#finding the number of times 'eo' occurs

eo1 = [item[0] for item in martial_lemmas if item[1] == 'eo']
print(set(eo1))

{'ite', 'euntem', 'ire', 'ibatis', 'itur', 'ibit', 'iturus', 'it', 'eat', 'ibitis', 'eamus', 'ibat', 'ibam', 'ibis', 'i', 'ibo', 'isse'}


In [63]:
#finding the number of times 'pedico' occurs

ped1 = [item[0] for item in martial_lemmas if item[1] == 'pedico']
print(set(ped1))

{'pedico'}


In [71]:
#finding the number of times 'fello' occurs

fel1 = [item[0] for item in martial_lemmas if item[1] == 'fello']
print(set(fel1))

{'fellas', 'fellat', 'fellaret'}


In [77]:
#finding the number of times 'habeo' occurs

hab1 = [item[0] for item in martial_lemmas if item[1] == 'habeo']
print(set(hab1))

{'habeat', 'habent', 'haberi', 'habe', 'habeto', 'habui', 'haberis', 'habeo', 'habendam', 'habebunt', 'habet', 'habes', 'habemus', 'habiturus', 'habere', 'habuit', 'habeas', 'habetur', 'haberet', 'habete', 'habuisse', 'habebas', 'habebis'}


In [18]:
from collections import Counter

In [60]:
#counting 'futuo'

fcount = Counter(f)
print(fcount)

Counter({'futuit': 11, 'futui': 10, 'futuis': 8, 'fututam': 5, 'futuat': 4, 'futuisse': 3, 'futuam': 3, 'fututa': 1, 'futuant': 1, 'futuente': 1})


In [21]:
#counting 'eo' 

eocount1 = Counter(eo1)
print(eocount1)

Counter({'i': 24, 'ire': 19, 'eat': 8, 'ibat': 5, 'ibis': 4, 'ite': 4, 'ibit': 4, 'ibo': 2, 'it': 2, 'iturus': 2, 'eamus': 2, 'itur': 1, 'ibitis': 1, 'ibatis': 1, 'isse': 1, 'euntem': 1, 'ibam': 1})


In [61]:
#counting 'pedico' 

pedcount1 = Counter(ped1)
print(pedcount1)

Counter({'pedico': 3})


In [78]:
#counting 'fello' 
felcount1 = Counter(fel1)
print(felcount1)

Counter({'fellat': 8, 'fellas': 2, 'fellaret': 1})


In [79]:
#counting 'habeo' 
habcount1 = Counter(hab1)
print(habcount1)

Counter({'habet': 83, 'habere': 42, 'habes': 35, 'habeat': 9, 'habent': 7, 'habeas': 7, 'habe': 6, 'habeo': 5, 'habuit': 5, 'habebis': 3, 'habebas': 2, 'habemus': 2, 'haberet': 2, 'habetur': 2, 'habendam': 1, 'habeto': 1, 'habuisse': 1, 'haberis': 1, 'habiturus': 1, 'haberi': 1, 'habui': 1, 'habete': 1, 'habebunt': 1})


In [22]:
martial_wordcount = len(martial_lemmas)
print(martial_wordcount)

59110


In [23]:
#futuo

totalf = sum(fcount.values())
print(totalf)

47


In [80]:
#eo total

totaleo = sum(eocount1.values())
print(totaleo)

82


In [81]:
#habeo total

totalhab = sum(habcount1.values())
print(totalhab)

219


In [25]:
#importing table-generating tools
import pandas as pd
from collections import OrderedDict
from datetime import date

In [26]:
#martial table of occurrences of 'futuo'

martialfcount = OrderedDict([ ('futuo forms', ['futuit', 'futui', 'futuis', 'fututam', 'futuat', 'futuisse', 'futuam', 'fututa', 'futuant', 'futuente', ' ', 'total count', 'total tokens', '% of total']), 
                            ('occurrences', [11, 10, 8, 5, 4, 3, 3, 1, 1, 1, '', 47, 59110, .08]) ] )
    
df1 = pd.DataFrame.from_dict(martialfcount)
print(df1)

     futuo forms occurrences
0         futuit          11
1          futui          10
2         futuis           8
3        fututam           5
4         futuat           4
5       futuisse           3
6         futuam           3
7         fututa           1
8        futuant           1
9       futuente           1
10                          
11   total count          47
12  total tokens       59110
13    % of total        0.08


3. Assembling a searchable corpus of Catullus' poetry

In [28]:
from cltk.corpus.latin import latinlibrary
files = latinlibrary.fileids()
catullus_raw = latinlibrary.raw('catullus.txt')

In [29]:
print(catullus_raw[:300])

Catullus
		 
		 
		 
		 
		 
	 
 

 C. VALERIVS CATVLLVS 
 
 1 2 2b 3 4 5  6 7 8 9 10 11 12 13 14 14b 15 16 17 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 58b 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 


In [30]:
catullus_edit = catullus_raw # Make a copy of the list

# 1. Make the whole text lowercase
# Use 'lower' string method

catullus_edit = catullus_edit.lower()

# 2. Remove punctuation
# Use 'translate'

from string import punctuation

translator = str.maketrans({key: " " for key in punctuation})
catullus_edit = catullus_edit.translate(translator)

# 3. Remove numbers
# Again, use 'translate'

translator = str.maketrans({key: " " for key in '0123456789'})
catullus_edit = catullus_edit.translate(translator)

# 4. Normalize u/v
# Use CLTK 'JVReplacer'

from cltk.stem.latin.j_v import JVReplacer
replacer = JVReplacer()

catullus_edit = replacer.replace(catullus_edit)

# 5. Remove English words that appear in our plaintext file
# Use 'replace'

remove_list = ['the', 'latin', 'library', 'classics', 'page']
remove_dict = {key: ' ' for key in remove_list}

for k, v in remove_dict.items():
    catullus_edit = catullus_edit.replace(k,v)

In [31]:
print(catullus_edit[469:599])

cui dono lepidum nouum libellum 
arida modo pumice expolitum  
corneli  tibi  namque tu solebas 
meas esse aliquid putare nugas


In [32]:
catullus_tokens = word_tokenizer.tokenize (catullus_edit)

print (catullus_tokens[:100])

['catullus', 'c', 'ualerius', 'catullus', 'b', 'b', 'b', 'b', 'i', 'ad', 'cornelium', 'cui', 'dono', 'lepidum', 'nouum', 'libellum', 'arida', 'modo', 'pumice', 'expolitum', 'corneli', 'tibi', 'namque', 'tu', 'solebas', 'meas', 'esse', 'aliquid', 'putare', 'nugas', 'iam', 'tum', 'cum', 'ausus', 'es', 'unus', 'italorum', 'omne', 'aeuum', 'tribus', 'explicare', 'cartis', 'doctis', 'iuppiter', 'et', 'laboriosis', 'quare', 'habe', 'tibi', 'quidquid', 'hoc', 'libelli—', 'qualecum', '-que', 'quod', 'o', 'patrona', 'uirgo', 'plus', 'uno', 'maneat', 'perenne', 'saeclo', 'ii', 'fletus', 'passeris', 'lesbiae', 'passer', 'deliciae', 'meae', 'puellae', 'cum', 'qui', 'ludere', 'quem', 'in', 'sinu', 'tenere', 'cui', 'primum', 'digitum', 'dare', 'appetenti', 'et', 'acris', 'solet', 'incitare', 'morsus', 'cum', 'desiderio', 'meo', 'nitenti', 'carum', 'nescio', 'quid', 'lubet', 'iocari', 'et', 'solaciolum', 'sui']


In [33]:
catullus_lemmas = lemmatizer.lemmatize(catullus_tokens)

In [34]:
print(catullus_lemmas)

[('catullus', 'catullus'), ('c', '-que'), ('ualerius', 'ualerius'), ('catullus', 'catullus'), ('b', 'b'), ('b', 'b'), ('b', 'b'), ('b', 'b'), ('i', 'eo'), ('ad', 'ad'), ('cornelium', 'cornelius'), ('cui', 'qui'), ('dono', 'donum'), ('lepidum', 'lepidus'), ('nouum', 'nouus'), ('libellum', 'libellus'), ('arida', 'aridus'), ('modo', 'modo'), ('pumice', 'pumex'), ('expolitum', 'expolio'), ('corneli', 'cornelis'), ('tibi', 'tu'), ('namque', 'namque'), ('tu', 'tu'), ('solebas', 'soleo'), ('meas', 'meus'), ('esse', 'sum'), ('aliquid', 'aliquis'), ('putare', 'putaris'), ('nugas', 'nugae'), ('iam', 'iam'), ('tum', 'tum'), ('cum', 'cum2'), ('ausus', 'audeo'), ('es', 'sum'), ('unus', 'unus'), ('italorum', 'italorum'), ('omne', 'omnis'), ('aeuum', 'aeuum'), ('tribus', 'tres'), ('explicare', 'explicaris'), ('cartis', 'cartis'), ('doctis', 'doceo'), ('iuppiter', 'iuppiter'), ('et', 'et'), ('laboriosis', 'laboriosus'), ('quare', 'quare'), ('habe', 'habeo'), ('tibi', 'tu'), ('quidquid', 'quisquis'), (

In [35]:
# Note that the lemmatizer is not the issue with finding forms of 'futuo' in Catullus. Here is a
# blunt, single-purpose stemmer that shows 'fut-' only appears 5 times and of these of 2 (both
# 'futuit') are the only examples from this verb. This is confirmed by a PHI search, e.g. 
# http://latin.packhum.org/search?q=%5Bcatul%5D+%23fut
f_stemmer = [item[0] for item in catullus_lemmas if item[0].startswith('fut')]
print(f_stemmer)

['fututiones', 'futurus', 'futurus', 'futuit', 'futuit']


In [36]:
#finding futuo

f2 = [item[0] for item in catullus_lemmas if item[1] == 'futuo']
print(set(f2))

{'futuit'}


In [37]:
#finding eo

eo2 = [item[0] for item in catullus_lemmas if item[1] == 'eo']
print(set(eo2))

{'euntem', 'ite', 'ire', 'it', 'eat', 'itis', 'i'}


In [69]:
#finding pedico

ped2 = [item[0] for item in catullus_lemmas if item[1] == 'pedico']
print(set(ped2))

{'pedicabo'}


In [73]:
#finding fello

fel2 = [item[0] for item in catullus_lemmas if item[1] == 'fello']
print(set(fel2))

{'fellat'}


In [82]:
#finding habeo

hab2 = [item[0] for item in catullus_lemmas if item[1] == 'habeo']
print(set(hab2))

{'habeant', 'habe', 'habuit', 'haberes', 'habet', 'haberet', 'habes', 'habent', 'habebat', 'habere', 'habetis'}


In [38]:
#counting futuo

f2count = Counter(f_stemmer)
print(f2count)

Counter({'futurus': 2, 'futuit': 2, 'fututiones': 1})


In [39]:
#counting 'eo' 

eocount2 = Counter(eo2)
print(eocount2)

Counter({'i': 4, 'ite': 4, 'eat': 2, 'ire': 2, 'it': 1, 'itis': 1, 'euntem': 1})


In [65]:
#counting 'pedico' 

pedcount2 = Counter(ped2)
print(pedcount2)

Counter({'pedicabo': 2})


In [74]:
#counting 'fello' 

felcount2 = Counter(fel2)
print(felcount2)

Counter({'fellat': 1})


In [83]:
#counting 'habeo' 

habcount2 = Counter(hab2)
print(habcount2)

Counter({'habet': 7, 'habere': 5, 'habes': 2, 'habent': 2, 'habe': 1, 'haberet': 1, 'habetis': 1, 'habebat': 1, 'habeant': 1, 'haberes': 1, 'habuit': 1})


In [40]:
catullus_wordcount = len(catullus_lemmas)
print(catullus_wordcount)

13503


In [41]:
#Catullus table of occurrences of 'futuo'

catullusfcount = OrderedDict([ ('futuo forms', ['futuit', ' ', 'total count', 'total tokens', '% of total']), 
                            ('occurrences', [2, '', 2, 13503, .02]) ] )
    
df2 = pd.DataFrame.from_dict(catullusfcount)
print(df2)

    futuo forms occurrences
0        futuit           2
1                          
2   total count           2
3  total tokens       13503
4    % of total        0.02


4. Assembling a searchable corpus of Pompeian graffiti from a CSV assembled from the book Erotica Pompeiana by Antonio Varone

In [43]:
#read CSV files from graffiti archives

ep = pd.read_csv("ep.csv")

In [44]:
inscriptions = ep["Inscription"]

In [45]:
inscriptions_edit = [preprocess(item) for item in list(ep['Inscription'])]
print(inscriptions_edit[0])

amoris ignes si sentires mulio magis properares ut uideres uenerem diligo iuuenem uenustum rogo punge iamus bibisti iamus prende lora et excute pompeios defer ubi dulcis est amor meus es 


In [46]:
print(inscriptions[:10])

0    amoris ignes si sentires, mulio, magis propera...
1    amplexus teneros hac si quis quaerit in urbe, ...
2    vibius Restitutus hic solus dormivit et Urbana...
3    Primigenae Nucerinae salutem / vellem essem ge...
4    felicem somnum qui tecum nocte quiescet? hoc e...
5      Mansuetus provacator victor Veneri permam feret
6    tu, pupa, sic valeas, sic habeas Venere Pompei...
7     abiat Venere Bompeiiana iratam qui hoc laesaerit
8    Venus enim plagiaria est; quia exsanguni meum ...
9    Agato Herrenni servus rogat Venere ut periat rogo
Name: Inscription, dtype: object


In [47]:
from IPython.display import display, HTML


In [48]:
#errors
#unable to separate words and tokeize

# PJB: Not sure where the problem was occurring, but the cells below solve it--
# The dataframe had a column of data that you had turned into a list (good!). In the first cell below,
# I joined the list items together into a single string. In the second and third, I tokenize that string and
# lemmatize the tokens. This should be enough to let you continue with the analysis.

In [49]:
# put all of the texts into one string
inscriptions_text = " ".join(inscriptions_edit)
print(inscriptions_text[:1000])

amoris ignes si sentires mulio magis properares ut uideres uenerem diligo iuuenem uenustum rogo punge iamus bibisti iamus prende lora et excute pompeios defer ubi dulcis est amor meus es  amplexus teneros hac si quis quaerit in urbe expectat ceras nulla puella uiri uibius restitutus hic solus dormiuit et urbanam suam desiderabat primigenae nucerinae salutem uellem essem gemma ora non amplius una ut tibi signanti oscula pressa darem felicem somnum qui tecum nocte quiescet hoc ego sic facere multo felicior esse mansuetus prouacator uictor ueneri permam feret tu pupa sic ualeas sic habeas uenere pompeianam propytia munn  abiat uenere bompeiiana iratam qui hoc laesaerit uenus enim plagiaria est quia exsanguni meum petit in uies tumultu pariet optet sibi ut bene nauiget quod et ario sua rogat agato herrenni seruus rogat uenere ut periat rogo quisquis amat ueniat ueneri uolo frangere costas fustibus et lumbos debilitare deae si postest illa mihi tenerum pertundere pectus quit ego non possim 

In [50]:
inscriptions_tokens = word_tokenizer.tokenize(inscriptions_text)
print(inscriptions_tokens[:50])

['amoris', 'ignes', 'si', 'sentires', 'mulio', 'magis', 'properares', 'ut', 'uideres', 'uenerem', 'diligo', 'iuuenem', 'uenustum', 'rogo', 'punge', 'iamus', 'bibisti', 'iamus', 'prende', 'lora', 'et', 'excute', 'pompeios', 'defer', 'ubi', 'dulcis', 'est', 'amor', 'meus', 'es', 'amplexus', 'teneros', 'hac', 'si', 'quis', 'quaerit', 'in', 'urbe', 'expectat', 'ceras', 'nulla', 'puella', 'uiri', 'uibius', 'restitutus', 'hic', 'solus', 'dormiuit', 'et', 'urbanam']


In [51]:
inscriptions_lemmas = lemmatizer.lemmatize(inscriptions_tokens)

In [52]:
print(inscriptions_lemmas)

[('amoris', 'amor'), ('ignes', 'ignis'), ('si', 'si'), ('sentires', 'sentio'), ('mulio', 'mulio1'), ('magis', 'magis'), ('properares', 'properaris'), ('ut', 'ut'), ('uideres', 'uideo'), ('uenerem', 'uenerem'), ('diligo', 'diligo'), ('iuuenem', 'iuuenis'), ('uenustum', 'venustus'), ('rogo', 'rogo'), ('punge', 'punge'), ('iamus', 'iamus'), ('bibisti', 'bibo'), ('iamus', 'iamus'), ('prende', 'prehendo'), ('lora', 'lorum'), ('et', 'et'), ('excute', 'excutio'), ('pompeios', 'pompeios'), ('defer', 'defero'), ('ubi', 'ubi'), ('dulcis', 'dulcis'), ('est', 'sum'), ('amor', 'amor'), ('meus', 'meus'), ('es', 'sum'), ('amplexus', 'amplector'), ('teneros', 'tener'), ('hac', 'hic'), ('si', 'si'), ('quis', 'quis'), ('quaerit', 'quaero'), ('in', 'in'), ('urbe', 'urbs'), ('expectat', 'ex-pecto'), ('ceras', 'cero'), ('nulla', 'nullus'), ('puella', 'puella'), ('uiri', 'uir'), ('uibius', 'uibius'), ('restitutus', 'restituo'), ('hic', 'hic'), ('solus', 'solus'), ('dormiuit', 'dormio'), ('et', 'et'), ('urba

In [53]:
#occurrences of futuo

f3 = [item[0] for item in inscriptions_lemmas if item[1] == 'futuo']
print(set(f3))

{'futues', 'futuere', 'fututa', 'futui', 'futuit'}


In [54]:
#counting futuo

f3count = Counter(f3)
print(f3count)

Counter({'futui': 4, 'futuit': 4, 'fututa': 1, 'futues': 1, 'futuere': 1})


In [55]:
#occurrences of eo

eo3 = [item[0] for item in inscriptions_lemmas if item[1] == 'eo']
print(set(eo3))

{'i', 'ire', 'isse'}


In [56]:
#counting 'eo' 

eocount3 = Counter(eo3)
print(eocount3)

Counter({'i': 3, 'isse': 1, 'ire': 1})


In [67]:
#occurrences of pedico/paedico

ped3 = [item[0] for item in inscriptions_lemmas if item[1] == 'paedico']
print(set(ped3))

{'paedico'}


In [70]:
#counting 'pedico/paedico' 

pedcount3 = Counter(ped3)
print(pedcount3)

Counter({'paedico': 1})


In [75]:
#occurrences of fello

fel3 = [item[0] for item in inscriptions_lemmas if item[1] == 'fello']
print(set(fel3))

{'fellas', 'fellat'}


In [76]:
#counting 'fello' 

felcount3 = Counter(fel3)
print(felcount3)

Counter({'fellat': 4, 'fellas': 1})


In [84]:
#occurrences of habeo

hab3 = [item[0] for item in inscriptions_lemmas if item[1] == 'habeo']
print(set(hab3))

{'habeto', 'habeas'}


In [85]:
#counting 'habeo' 

habcount3 = Counter(hab3)
print(habcount3)

Counter({'habeas': 1, 'habeto': 1})


In [57]:
inscriptions_wordcount = len(inscriptions_lemmas)
print(inscriptions_wordcount)

1216


In [58]:
#graffiti table of occurrences of 'futuo'

inscriptionsfcount = OrderedDict([ ('futuo forms', ['futui', 'futuit', 'fututa', 'futues', 'futuere', ' ', 'total count', 'total tokens', '% of total']), 
                            ('occurrences', [4, 4, 1, 1, 1, '', 11, 1216, 0.90]) ] )
    
df3 = pd.DataFrame.from_dict(inscriptionsfcount)
print(df3)

    futuo forms occurrences
0         futui           4
1        futuit           4
2        fututa           1
3        futues           1
4       futuere           1
5                          
6   total count          11
7  total tokens        1216
8    % of total         0.9
