In [41]:
# An NLP pipeline is the set of steps followed to build end-to-end NLP software

# 1. Data Acquisition
# 2. Text Preparation --> Clean Up, Basic Preprocessing, Advanced Preprocessing
# 3. Feature Engineering
# 4. Modelling --> Building and Evaluation
# 5. Deployment (Monitoring, Model Update)

# Giovana de Moura Dias Oliveira

In [42]:
# Standard library module for regular expressions
import re

poema = """
Ainda que mal pergunte, 
ainda que mal respondas; 
ainda que mal te entenda, 
ainda que mal repitas; 
ainda que mal insista, 
ainda que mal desculpes; 
ainda que mal me exprima, 
ainda que mal me julgues; 
ainda que mal me mostre, 
ainda que mal me vejas; """

# The r"" prefix marks a raw string literal: backslashes reach the regex
# engine untouched instead of being interpreted as Python escapes.
# findall returns every non-overlapping match as a list of strings.
re.compile(r"mal").findall(poema)

['mal', 'mal', 'mal', 'mal', 'mal', 'mal', 'mal', 'mal', 'mal', 'mal']

In [43]:
# Count the occurrences by walking the match iterator and tallying one per hit
occurencies = sum(1 for _ in re.finditer(r"mal", poema))
print(occurencies)

10


In [44]:
# Disjunction lets one position of a RE accept more than one possible character
# It is written between square brackets [], e.g. [Aa] matches 'A' or 'a'

In [45]:
# [Aa] is a character class: it accepts either 'A' or 'a' in that position,
# so the capitalised first verse and the lowercase ones are all matched.
sentence = re.compile(r'[Aa]inda que mal').findall(poema)
len(sentence)

10

In [46]:
# Character classes also accept ranges of letters or digits, such as [0-9]
tv = """Smart TV LED 32" S5400AF TCL FHD Android TV - A Melhor Escolha para Seu Entretenimento!
Voc√™ est√° em busca de uma Smart TV com qualidade de imagem incr√≠vel e funcionalidades modernas?
A Smart TV LED 32" S5400AF TCL FHD Android TV √© a escolha perfeita para transformar a sua sala 
em um centro de entretenimento! Com a tecnologia Full HD (FHD), essa TV LED 32" oferece cores vivas, 
defini√ß√£o n√≠tida e uma experi√™ncia de visualiza√ß√£o imersiva."""

# Count every digit character in the text (each digit is one match)
sum(1 for _ in re.finditer(r'[0-9]', tv))

14

In [47]:
# len() on a str returns its number of characters (spaces and newlines included)
len(tv)

442

In [48]:
# Negation: a '^' placed right after the opening '[' inverts the character
# class, so it matches any character that is NOT listed.

# Count every character that is neither 'A' nor 'a'
sum(1 for _ in re.finditer(r'[^Aa]', tv))

401

In [49]:
# Number of 'a'/'A' characters: total length minus everything the negated
# class [^Aa] matches.
minus = len(tv) - sum(1 for _ in re.finditer(r'[^Aa]', tv))
minus

41

In [50]:
# To search for more than one alternative at a time, separate them with '|'.
# The alternation must NOT be wrapped in [...]: r'[me|mal]' is a character
# class matching any single 'm', 'e', '|', 'a' or 'l' (that is what produced
# the earlier count of 90), whereas r'me|mal' matches the substring "me" or
# the substring "mal".
len(re.findall(r'me|mal', poema))

14

In [51]:
# Now, if you want something at the beginning or end of a line
# Use '^' (start) and '$' (end)
# Use \b to delimit a word --> \bAINDA QUE\b
 
# Now, the quantifiers, very useful for Twitter and social media
# --> use 'a+' to get one or more 'a's (ex: aaaa)
# --> use 'ah+!' to get (ahhh!, ahhhhh!)
# --> use 'objec?to' to get (objeto or objecto)
# --> use 'a{2,}' to get at least two 'a's
# --> use 'a{2,3}' to get from 2 to 3 'a's
# --> use 'a+?' for a lazy match: as few 'a's as possible (at least one)

# Using () to group
# --> ([Hh]a)+

In [52]:
amostra = """Hahahahhaha 
             Hahahahhahaha 
             Haha 
             HAHAHAH
             kkkk"""

# Laughter patterns: a run of k/K, or repeated "Ha"/"ha", or repeated "Hua"/"hua".
# Only the H may be uppercase, so "HAHAHAH" (capital A) is never matched, and a
# doubled "hh" ends one match and starts the next.
regex = re.compile(r'[Kk]+|([Hh]a)+|([Hh]ua)+')
iterador = regex.finditer(amostra)

# Each match object carries its own offsets and the text it matched
for ex in iterador:
    inicio, fim = ex.start(), ex.end()
    print(inicio, fim, ex.group())

0 6 Hahaha
7 11 haha
26 32 Hahaha
33 39 hahaha
54 58 Haha
94 98 kkkk


In [53]:
texto = """Reconhe√ßo que tomei decis√µes insatisfat√≥rias recentemente, mas posso lhe 
garantir com absoluta certeza que meu trabalho voltar√° ao normal."""

# sub() swaps every match for the replacement string.
# Here each listed punctuation mark or line break is replaced by the empty
# string '', i.e. it is simply deleted.
print(re.compile(r'[,;.?!\n]').sub('', texto))

Reconhe√ßo que tomei decis√µes insatisfat√≥rias recentemente mas posso lhe garantir com absoluta certeza que meu trabalho voltar√° ao normal


In [54]:
# Take a CEP and swap its two groups of digits

cep = '02125-000'

# Group 1 captures the five digits before the hyphen, group 2 the three after
# it; the replacement template writes them back in the opposite order.
re.compile(r'([0-9]{5})-([0-9]{3})').sub(r'\2-\1', cep)

'000-02125'

In [55]:
# Building a function called 'busca'. Two parameters: 'query' and 'documentos'
# --> The function must return every text in 'documentos' that contains all the
# words from 'query'
# The 'resultado' variable receives the matches

def busca(query, documentos):
    """Return the documents that contain every term in ``query``.

    Matching is case-insensitive and whole-term: a term only matches when it
    is not glued to a word character on either side, so "program" is not
    found inside "programar".

    Args:
        query: iterable of search terms (strings).
        documentos: iterable of document texts (strings).

    Returns:
        A list with the documents, in their original order, in which all the
        terms were found. An empty ``query`` returns every document.
    """
    # Compile each term once instead of once per document. Lookarounds replace
    # \b because \b needs a word character on one of its sides, so a term that
    # starts or ends with a non-word character (e.g. "c++") could not match
    # before a space or at the end of the text.
    padroes = [
        re.compile(rf'(?<!\w){re.escape(palavra)}(?!\w)', re.IGNORECASE)
        for palavra in query
    ]

    # all() stops at the first missing term, like the original early break.
    return [
        doc for doc in documentos
        if all(padrao.search(doc) for padrao in padroes)
    ]
                     




# re.search --> finds the first occurrence
# re.IGNORECASE --> well, ignore the case :)

    


In [56]:
# Exercising busca() on a small corpus

# Every term listed here must be present for a document to be returned
query = ["python", "programar"]

documentos =  ["Hoje √© um bom dia para estudar programa√ß√£o.",
    "Programar em Python pode ser divertido.",
    "Estudar express√µes regulares ajuda muito.",
    "Python √© uma linguagem poderosa.",
    "Aprender √© um processo cont√≠nuo."
]

resultado = busca(query, documentos)

# Print a header followed by one line per matching document
print("üìÑ Documentos que combinam com a busca:")
for doc in resultado:
    print('--> ', doc)

üìÑ Documentos que combinam com a busca:
-->  Programar em Python pode ser divertido.


In [57]:
# For this exercise, we are building part of a web crawler
# --> The function called 'extrair' must receive an HTML page and return a list of
# the textual content of its paragraphs.

def extrair(pagina):
    """Return the inner content of every <p> element found in ``pagina``.

    This is a plain regex scan, not an HTML parser. Tag names are matched
    case-insensitively, and the opening tag may carry attributes
    (``<p class="x">``).

    Args:
        pagina: the HTML source as a string.

    Returns:
        A list of strings, one per paragraph, in document order. The list is
        empty when no paragraph is found.
    """
    # <p(?:\s[^>]*)?>  opening tag, optionally followed by attributes; the
    #                  mandatory whitespace keeps <pre>, <param>, ... out
    # (.*?)            lazy capture, so each match stops at the nearest </p>
    # </p\s*>          closing tag, tolerating whitespace before '>'
    # DOTALL lets '.' cross line breaks inside a paragraph.
    return re.findall(
        r'<p(?:\s[^>]*)?>(.*?)</p\s*>',
        pagina,
        re.DOTALL | re.IGNORECASE,
    )

# re.DOTALL --> lets '.' match newlines too, so a paragraph can span multiple lines
# (.*?) --> captures the shortest excerpt possible (lazy match)
# .* --> any character, zero or more times

In [58]:
# Exercising extrair() on a small inline HTML page

pagina = """
                <html>
                <head><title>Exemplo</title></head>
                <body>
                <p>Ol√°, LIG948B</p>
                <p>Este √© outro par√°grafo!</p>
                </body>
                </html>
                """

# Prints the list returned by extrair(): one entry per <p> element
print(extrair(pagina))


['Ol√°, LIG948B', 'Este √© outro par√°grafo!']
