# Text analysis: word split, count and lemmatization

## Text functions

In [1]:
# Dependencies of the text module: re, simplemma, pandas
import importlib

# On the first run the local text module is imported; on every later run of
# this cell it is reloaded so that edits to the module file are picked up.
if "txt_mod" not in globals():
    import text_local_module as txt_mod
else:
    importlib.reload(txt_mod)


## PDF functions

In [2]:
# Dependency of the PDF module: pypdf
# Import the local PDF module on the first run, reload it on later runs
# (importlib is imported by the text-module cell above).
if "pdf_mod" not in globals():
    import pdf_local_module as pdf_mod
else:
    importlib.reload(pdf_mod)


## Use cases

### 1) Analyzing a text string

#### 1.1) Creating the text object

In [5]:
# Analyzer settings: declared language of the sample and verbosity switch.
input_language = "it"
verbose_option = True  # set to False to run the analyzer with verbose disabled

# Deliberately messy sample: irregular spacing, punctuation, line breaks and
# a couple of English words mixed into the Italian text.
input_text = "Io mi  andare andai  molto     prova  chiamo.Mamma ciao bello bello \n,. /tanto bello mi ciao!!! So happy,vado all'ippodromo!,ddd \n\n\n\nmi"

text_object = txt_mod.text_analyzer(input_text, input_language, verbose=verbose_option)
text_object.language_detect()

# Last expression of the cell: show the raw text stored in the analyzer.
text_object.text_raw


The language detected is it, with confidence 0.6923076923076923 (between 0 and 1).


"Io mi  andare andai  molto     prova  chiamo.Mamma ciao bello bello \n,. /tanto bello mi ciao!!! So happy,vado all'ippodromo!,ddd \n\n\n\nmi"

#### 1.2) Counting the words

In [6]:
# Count the words first, then print the result as a plain dictionary.
word_counts_dict = text_object.word_count(dict=True)
print("\nA dictionary with all counted words is given by:\n", word_counts_dict)
print()

# Report the occurrences of two single words.
for queried_word in ("andai", "test"):
    text_object.word_count_print(queried_word)


I need pandas. I'm executing 'import pandas as pd'.
Please wait: I am processing the text.
Please wait: I am splitting the words.
Please wait: I am counting the words.

A dictionary with all counted words is given by:
 {'molto': 1, 'mi': 3, 'io': 1, 'ciao': 2, 'vado': 1, 'prova': 1, 'chiamo': 1, 'mamma': 1, 'all': 1, 'ddd': 1, 'tanto': 1, 'bello': 3, 'so': 1, 'happy': 1, 'ippodromo': 1, 'andare': 1, 'andai': 1}

The word 'andai' occurs 1 times.
The word 'test' occurs 0 times.


##### 1.2.1) Displaying the counted words as a dataframe

In [7]:
# Without dict=True the word count comes back as a table; it is transposed
# so that the words run along the columns.
word_table = text_object.word_count()
word_table.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
Raw,molto,mi,io,ciao,vado,prova,chiamo,mamma,all,ddd,tanto,bello,so,happy,ippodromo,andare,andai
Raw Count,1,3,1,2,1,1,1,1,1,1,1,3,1,1,1,1,1


#### 1.3) Lemmatizing the words

In [8]:
# Lemmatize the counted words and show the result with one column per word.
lemma_table = text_object.lemmatize()
display(lemma_table.transpose())

Please wait: I am lemmatizing the text.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
Raw,molto,mi,io,ciao,vado,prova,chiamo,mamma,al,ddd,tanto,bello,so,happy,ippodromo,andare,andai
Lemma,molto,mi,io,ciao,andare,prova,chiamare,mamma,al,ddd,tangere,bello,sapere,happy,ippodromo,andare,andare
Raw Count,1,3,1,2,1,1,1,1,1,1,1,3,1,1,1,1,1


#### 1.4) Counting the lemmas

In [9]:
# Count the lemmas and show the result with one column per lemma.
lemma_count_table = text_object.lemma_count()
display(lemma_count_table.transpose())

Please wait: I am counting the lemmas.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
Lemma,al,andare,bello,chiamare,ciao,ddd,happy,io,ippodromo,mamma,mi,molto,prova,sapere,tangere
Lemma Count,1,3,3,1,2,1,1,1,1,1,3,1,1,1,1
Occurrences,[al],"[vado, andare, andai]",[bello],[chiamo],[ciao],[ddd],[happy],[io],[ippodromo],[mamma],[mi],[molto],[prova],[so],[tanto]


## 2) Analyzing a PDF

### 2.1) Creating a test PDF

In [57]:
import os
import textwrap

import lorem
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas

# Generate 1 + 10*3 blocks of dummy text, separated by blank lines, then split
# the whole text on blank lines to obtain a list of paragraphs.
dummy_text = "\n\n".join(lorem.text() for _ in range(1 + 10*3))
dummy_text = dummy_text.split("\n\n")

# Build the output path with os.path.join so that the cell also works on
# systems whose path separator is not the backslash.
output_dir = "Test_IO"
file_name = os.path.join(output_dir, "test.pdf")
pdf = canvas.Canvas(file_name, pagesize=A4)

# Page layout, in PDF points.
x_start = 100        # left margin of every line
y_start = 800        # vertical position of the first line on each page
line_spacing = 15    # vertical distance between consecutive lines
line_width = 60      # maximum number of characters per wrapped line
y_position = y_start
page_height = A4[1]
bottom_margin = 50   # a new page is started once the cursor reaches this height

# Wrap every paragraph into lines of at most line_width characters.
dummy_text_wrapped = [textwrap.wrap(text, width=line_width) for text in dummy_text]

for paragraph in dummy_text_wrapped:
    for line in paragraph:
        # Start a new page when the cursor has reached the bottom margin.
        if y_position <= bottom_margin:
            pdf.showPage()
            y_position = y_start
        pdf.drawString(x_start, y_position, line)
        y_position -= line_spacing
    # Leave one empty line between paragraphs.
    y_position -= line_spacing

try:
    # Make sure the target directory exists before writing the file.
    os.makedirs(output_dir, exist_ok=True)
    pdf.save()
except Exception as exc:
    print("I could not create the pdf. Error: ", exc)
else:
    print(f"I successfully created the PDF '{file_name}'")

I successfully created the PDF 'Test_IO\test.pdf'


### 2.1) Import text from a PDF

In [63]:
import os

# Pick the working directory by index (0: personal files, 1: test files).
directory = ["Personal_IO", "Test_IO"][1]

# Keep only the entries whose name ends with ".pdf" (case-insensitive) and
# sort them, so that the file chosen below does not depend on the arbitrary
# order returned by os.listdir.
pdf_file_list = sorted(
    entry for entry in os.listdir(directory) if entry.lower().endswith(".pdf")
)
print(pdf_file_list)

# Fail with an explicit message instead of an IndexError on an empty list.
if not pdf_file_list:
    raise FileNotFoundError(f"No PDF file found in directory '{directory}'.")

chosen_filename = pdf_file_list[0]
# os.path.join keeps the path valid on every operating system.
pdf_object = pdf_mod.pdf_analyzer(os.path.join(directory, chosen_filename))
print("The file",pdf_object.file_path, "has", pdf_object.n_pages,"page(s)")

# NOTE(review): presumably extracts the text from the first to the last page
# and merges it into a single string — confirm against pdf_local_module.
pages_example_text = pdf_object.extract(1,-1, merge=True)


['test.pdf']
The file Test_IO\test.pdf has 20 page(s)


#### 2.1.1) Lemmatizing the imported text

In [64]:
# Wrap the text extracted from the PDF in a text analyzer.
# NOTE(review): no language argument is passed here, unlike in the string
# example above — confirm that the module's default handling is what is wanted.
pages_example = txt_mod.text_analyzer(pages_example_text)

In [65]:
# Lemmatize the PDF text; as the last expression of the cell, the returned
# table is displayed as the cell output.
pages_example.lemmatize()

Please wait: I am processing the text.
Please wait: I am splitting the words.
Please wait: I am counting the words.
Please wait: I am lemmatizing the text.


Unnamed: 0,Raw,Lemma,Raw Count
0,etincidunt,etincidunt,230
1,aliquam,aliqui,241
2,ipsum,ipse,215
3,sed,sed,228
4,numquam,numquam,225
5,modi,modus,222
6,quisquam,quisquam,234
7,quaerat,quaero,210
8,dolorem,dolor,237
9,ut,ut,239


In [67]:
# Count the lemmas of the PDF text and rank them from most to least frequent.
pdf_lemma_counts = pages_example.lemma_count()
sorted_lemmas = pdf_lemma_counts.sort_values(by="Lemma Count", ascending=False)
display(sorted_lemmas)

# Export rules: every accented vowel is written as the plain vowel followed
# by an apostrophe.
replace_accents = {
    "à": "a'",
    "á": "a'",
    "é": "e'",
    "è": "e'",
    "ì": "i'",
    "í": "i'",
    "ó": "o'",
    "ò": "o'",
    "ù": "u'",
    "ú": "u'",
}


def replace_func_export(str):
    """Return the given string with the accented vowels replaced.

    Every key of ``replace_accents`` found in the string is substituted by
    the corresponding value; all other characters are left untouched.
    """
    converted = str
    for accented, ascii_form in replace_accents.items():
        converted = converted.replace(accented, ascii_form)
    return converted

import csv
import os

# Flatten each list of occurrences into a comma-separated string, then apply
# the accent replacement to both text columns before exporting.
sorted_lemmas["Occurrences"] = sorted_lemmas["Occurrences"].apply(",".join).apply(replace_func_export)
sorted_lemmas["Lemma"] = sorted_lemmas["Lemma"].apply(replace_func_export)

# The CSV is saved next to the PDF, under the same base name. splitext removes
# only the final extension, and os.path.join keeps the path portable.
csv_path = os.path.join(directory, os.path.splitext(chosen_filename)[0] + ".csv")
try:
    # QUOTE_NONNUMERIC: every non-numeric field is wrapped in quotechar.
    sorted_lemmas.to_csv(csv_path, index=False, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
    print("Data correctly saved.")
except Exception as exc:
    print("Problem saving the data: ", exc)

Unnamed: 0,Lemma,Lemma Count,Occurrences
4,dolor,743,"[dolorem, dolor, dolore]"
19,sum,449,"[sit, est]"
3,consector,253,[consectetur]
22,volo,244,[velit]
1,aliqui,241,[aliquam]
16,quiquia,240,[quiquia]
21,ut,239,[ut]
8,labor,235,[labore]
17,quisquam,234,[quisquam]
5,etincidunt,230,[etincidunt]


Data correctly saved.
