# Tina meet-up number 1: About Text Mining, 12/02/2021
A copy of this notebook can be downloaded from our public GitHub repository: <a href="https://github.com/ecalab-lux/Tina">https://github.com/ecalab-lux/Tina</a>

### The point of view proposed here is the engineer's: first try, then evaluate, and maybe later study the theoretical aspects... Does it work?

![Joke](https://imgs.xkcd.com/comics/the_general_problem.png)
(c) Xkcd https://xkcd.com/974/

# Use case number 1: automatic summaries with Gensim

In [None]:
# Download an ECA report (PDF) from the ECA website
import requests

REPORT_URL = 'https://www.eca.europa.eu/lists/ecadocuments/sr21_02/sr_education_in_emergencies_en.pdf'

# The context manager releases the connection even if an exception is raised,
# and the timeout keeps the notebook from hanging forever on a stalled server.
with requests.get(REPORT_URL, timeout=60) as response:
    print(response.status_code, response.reason, response.headers.get('Content-Type'))
    # Fail here on an HTTP error instead of handing an error page to the PDF parser later
    response.raise_for_status()
    report = response.content  # raw bytes of the downloaded PDF

In [None]:
# Turn the downloaded PDF bytes into plain text.
# Note: PDF is the worst possible format for extracting textual information,
# because it was born as a format for printers and human readers.

from io import BytesIO, StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

# In-memory sink that receives the text of every page
text_buffer = StringIO()

# Parse the PDF structure straight from the downloaded bytes
parser = PDFParser(BytesIO(report))
pdf_document = PDFDocument(parser)

# The converter writes the text it extracts into text_buffer;
# the interpreter feeds it the content of each page.
resource_manager = PDFResourceManager()
device = TextConverter(resource_manager, text_buffer, laparams=LAParams())
interpreter = PDFPageInterpreter(resource_manager, device)

for page in PDFPage.create_pages(pdf_document):
    interpreter.process_page(page)

# From here on report_text is the whole report as a single string
report_text = text_buffer.getvalue()

# Show a 500-character sample of the extracted text
print(report_text[3000:3500])

In [None]:
# Clean up the extracted text
import re

MIN_LINE_LENGTH = 30  # lines with fewer characters than this are discarded

# Replace every newline that is NOT preceded by a full stop with a space, so that
# only the newlines following a '.' remain as line breaks.
report_text = re.sub(r'([^.])\n', r'\1 ', report_text)

# Keep only the lines with at least MIN_LINE_LENGTH characters, each one terminated
# by a newline. str.join assembles the result in a single pass instead of growing
# a string by repeated concatenation inside a loop.
clean_text = "".join(
    t + '\n' for t in report_text.split('\n') if len(t) >= MIN_LINE_LENGTH
)

In [None]:
# Produce an automatic summary of the cleaned report text
# NOTE(review): the gensim.summarization module appears to have been removed in
# Gensim 4.x - confirm the installed Gensim version before running this cell.
from gensim.summarization import summarize # conda install gensim

SUMMARY_WORD_COUNT = 300  # number of words requested for the summary
summary = summarize(clean_text, word_count=SUMMARY_WORD_COUNT)
print(summary)

In [None]:
# It is now up to professional judgment to decide whether the automatic summary is useful or not
# Can it be used as evidence? Probably not
# Can it suggest whether the document is worth reading? Probably yes

# Use case number 2: POS (part-of-speech) tagging with spaCy

In [None]:
# Work on an excerpt of the cleaned report, selected by character offsets
excerpt_start, excerpt_end = 5244, 6126
partial_text = clean_text[excerpt_start:excerpt_end]
print(partial_text)

In [None]:
# Set up the spaCy natural-language pipeline for English
import spacy # http://spacy.io
from spacy import displacy

# See https://spacy.io/usage/models#languages for a list of all available models
nlp = spacy.load('en_core_web_lg')

# Run the pipeline over the excerpt
doc = nlp(partial_text)

# Draw the dependency parse of the first 36 tokens only
leading_tokens = doc[0:36]
displacy.render(leading_tokens, style="dep")

In [None]:
# Collect the lemma (base form) of every token tagged as a noun or a proper noun
pos_interesting_types = ['PROPN', 'NOUN']
ext_nouns = [token.lemma_ for token in doc if token.pos_ in pos_interesting_types]

# Show the parsed text first, then the nouns extracted from it
print(doc)
print()
print(ext_nouns)

In [None]:
# Collect the lemma of every verb, leaving out the modal ones
pos_interesting_types = ['VERB']
ext_verbs = [
    token.lemma_
    for token in doc
    # Tokens whose fine-grained tag is 'MD' (modal) are skipped
    if (token.pos_ in pos_interesting_types) and (token.tag_ != 'MD')
]

# Show the parsed text first, then the verbs extracted from it
print(doc)
print()
print(ext_verbs)

In [None]:
# Collect the text of each noun chunk (= a group of words that make sense together)
ext_chunks = [chunk.text for chunk in doc.noun_chunks]
print(ext_chunks)

In [None]:
# Conclusion: POS tagging can be useful as part of a processing pipeline; on its own it does not produce interesting results
# It is useful for producing better word clouds or statistical distributions of the text

# Use case number 3: text similarities

In [None]:
# Convert the report text into a table with one row per line
import pandas as pd
pd.set_option('display.max_colwidth', None)  # show the full content of each cell

# Build the table in a single step from the list of lines. DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0, and calling it once per row
# copies the whole table at every iteration.
# Empty strings (such as the one that split() yields after a trailing newline) are
# skipped so that the table holds only real lines of text.
text_table = pd.DataFrame(
    {'Line': [line for line in clean_text.split('\n') if line]}
)

text_table.sample(10)

In [None]:
# Attach to every line of the table the vector that spaCy computes for it
def _line_vector(line):
    """Return the vector produced by the ``nlp`` pipeline for *line*."""
    return nlp(line).vector


text_table['Vector'] = text_table['Line'].apply(_line_vector)
text_table.sample(3)

In [None]:
# Goal: for any given sentence, search for the "most similar" line in the original text
from sklearn.metrics.pairwise import cosine_similarity

# Alternative queries to try:
#sentence = "Cost monitoring"
#sentence = "Gender balance"
sentence = "Importance of primary schools"

# Vector of the query sentence, computed by the same pipeline used for the table
sentence_vec = nlp(sentence).vector

# Cosine similarity (= closeness in the vector space) between the sentence and every
# line of the table; the result has one row (the query) and one column per line.
similarities = cosine_similarity([sentence_vec], text_table['Vector'].to_list())

# argmax() returns the POSITION of the best match, so the line is fetched with the
# positional indexer (.iloc) rather than by index label (.loc); this stays correct
# even if the table index is not 0..n-1.
best_position = similarities.argmax()
print(text_table['Line'].iloc[best_position])

# The Stanford CoreNLP
## Live demo at https://corenlp.run/

In [None]:
# Open question: can we really satisfy the expectation of mining "textual meaning"?

In [None]:
# Open question: how to use statistical results with inevitable biases to derive audit findings?