<img src="https://github.com/hernancontigiani/ceia_memorias_especializacion/raw/master/Figures/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural
## Preprocesamiento con NLTK y Spacy


In [1]:
import json
import string
import random 

import numpy as np

### Datos

In [2]:
simple_text = "if she leaves now she might miss something important!"

In [3]:
large_text = "Patients who in late middle age have smoked 20 cigarettes a day since their teens constitute an at-risk group. One thing they’re clearly at risk for is the acute sense of guilt that a clinician can incite, which immediately makes a consultation tense."

### 1 - Preprocesamiento con NLTK
- Cada documento transformarlo en una lista de términos
- Armar un vector de términos no repetidos de todos los documentos

In [4]:
import nltk
from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords

# Descargar tokenizador punkt
nltk.download("punkt")
# Descargar diccionario de inglés
nltk.download("wordnet")
# Descargar diccionario de stopwords
nltk.download('stopwords')
# Para usar NLTK 3.6.6 o superior es necesario instalar OMW 1.4 
# (Open Multilingual WordNet)
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/flor/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /Users/flor/nltk_data...
[nltk_data] Downloading package stopwords to /Users/flor/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /Users/flor/nltk_data...


True

In [5]:
simple_text

'if she leaves now she might miss something important!'

In [6]:
# Crear el derivador
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import *
p_stemmer = PorterStemmer()

In [7]:
# Crear el lematizador
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [8]:
# Extraer los tokens de un doc
tokens = word_tokenize(simple_text)
print("Tokens:", tokens)

Tokens: ['if', 'she', 'leaves', 'now', 'she', 'might', 'miss', 'something', 'important', '!']


In [9]:
# Transformar los tokens a sus respectivas palabras derivadas
# Stemming
nltk_stemedList = []
for word in tokens:
    nltk_stemedList.append(p_stemmer.stem(word))
print("Stemming:", nltk_stemedList)

Stemming: ['if', 'she', 'leav', 'now', 'she', 'might', 'miss', 'someth', 'import', '!']


In [10]:
# Transformar los tokens a sus respectivas palabras raiz
# Lemmatization
nltk_lemmaList = []
for word in tokens:
    nltk_lemmaList.append(lemmatizer.lemmatize(word))
print("Lemmatization:", nltk_lemmaList)

Lemmatization: ['if', 'she', 'leaf', 'now', 'she', 'might', 'miss', 'something', 'important', '!']


In [11]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
# Quitar los signos de puntuacion
nltk_punctuation = [w for w in nltk_lemmaList if w not in string.punctuation]
print("Punctuation filter:", nltk_punctuation)

Punctuation filter: ['if', 'she', 'leaf', 'now', 'she', 'might', 'miss', 'something', 'important']


In [13]:
nltk_stop_words = set(stopwords.words("english"))
len(nltk_stop_words)

179

In [14]:
# Stop words
nltk_stop_words = set(stopwords.words("english"))
filtered_sentence = [w for w in nltk_punctuation if w not in nltk_stop_words]
print("Stop words filter:", filtered_sentence)

Stop words filter: ['leaf', 'might', 'miss', 'something', 'important']


### 2 - Proceso completo con NLTK
Tokenization → Lemmatization → Remove stopwords → Remove punctuation

In [15]:
def nltk_process(text):
    # Tokenization
    nltk_tokenList = word_tokenize(text)
      
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    nltk_lemmaList = []
    for word in nltk_tokenList:
        nltk_lemmaList.append(lemmatizer.lemmatize(word))
    
    print("Lemmatization")
    print(nltk_lemmaList)

    # Stop words
    nltk_stop_words = set(stopwords.words("english"))
    filtered_sentence = [w for w in nltk_lemmaList if w not in nltk_stop_words]

    # Filter Punctuation
    filtered_sentence = [w for w in filtered_sentence if w not in string.punctuation]

    print(" ")
    print("Remove stopword & Punctuation")
    print(filtered_sentence)
    return filtered_sentence

In [16]:
nltk_text = nltk_process(large_text)
print("Text len:", len(nltk_text))

Lemmatization
['Patients', 'who', 'in', 'late', 'middle', 'age', 'have', 'smoked', '20', 'cigarette', 'a', 'day', 'since', 'their', 'teen', 'constitute', 'an', 'at-risk', 'group', '.', 'One', 'thing', 'they', '’', 're', 'clearly', 'at', 'risk', 'for', 'is', 'the', 'acute', 'sense', 'of', 'guilt', 'that', 'a', 'clinician', 'can', 'incite', ',', 'which', 'immediately', 'make', 'a', 'consultation', 'tense', '.']
 
Remove stopword & Punctuation
['Patients', 'late', 'middle', 'age', 'smoked', '20', 'cigarette', 'day', 'since', 'teen', 'constitute', 'at-risk', 'group', 'One', 'thing', '’', 'clearly', 'risk', 'acute', 'sense', 'guilt', 'clinician', 'incite', 'immediately', 'make', 'consultation', 'tense']
Text len: 27


### 3 - Proceso completo con spaCy
Tokenization → Lemmatization → Remove stopwords → Remove punctuation

In [20]:
# !pip3 install spacy
# !python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m


Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [21]:
import spacy
# Cargar pipeline de preprocesamiento de inglés
nlp = spacy.load('en_core_web_sm')

def spacy_process(text):
    doc = nlp(text)
    
    # Tokenization & lemmatization
    lemma_list = []
    for token in doc:
        lemma_list.append(token.lemma_)
    print("Tokenize+Lemmatize:")
    print(lemma_list)
    
    # Stop words
    filtered_sentence =[]
    for word in lemma_list:
        # word es un string, para recuperar la información de los objetos de SpaCy
        # necesitamos usar el string para pasar a un lexema, el objeto de SpaCy
        # que para cada término contiene la información del preprocesamiento
        # (se podría también directamente filtrar stopwords en el paso de lematización)
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            filtered_sentence.append(word) 
    
    # Filter punctuation
    filtered_sentence = [w for w in filtered_sentence if w not in string.punctuation]

    print(" ")
    print("Remove stopword & punctuation: ")
    print(filtered_sentence)
    return filtered_sentence

In [22]:
spacy_text = spacy_process(large_text)
print("Text len:", len(nltk_text))

Tokenize+Lemmatize:
['patient', 'who', 'in', 'late', 'middle', 'age', 'have', 'smoke', '20', 'cigarette', 'a', 'day', 'since', 'their', 'teen', 'constitute', 'an', 'at', '-', 'risk', 'group', '.', 'one', 'thing', 'they', '’re', 'clearly', 'at', 'risk', 'for', 'be', 'the', 'acute', 'sense', 'of', 'guilt', 'that', 'a', 'clinician', 'can', 'incite', ',', 'which', 'immediately', 'make', 'a', 'consultation', 'tense', '.']
 
Remove stopword & punctuation: 
['patient', 'late', 'middle', 'age', 'smoke', '20', 'cigarette', 'day', 'teen', 'constitute', 'risk', 'group', 'thing', 'clearly', 'risk', 'acute', 'sense', 'guilt', 'clinician', 'incite', 'immediately', 'consultation', 'tense']
Text len: 27


### 4 - Conclusiones
- NLTK no pasa a minúsculas el texto por su cuenta
- spacy algunas palabras las reemplaza por su Tag (como "'")
- spacy descompone palabras

In [24]:
# !pip3 install prettytable

Collecting prettytable
  Downloading prettytable-3.4.1-py3-none-any.whl (26 kB)
Installing collected packages: prettytable
Successfully installed prettytable-3.4.1


In [25]:
from prettytable import PrettyTable
table = PrettyTable(['NLTK', 'spaCy'])
for nltk_word, spacy_word in zip(nltk_text, spacy_text):
    table.add_row([nltk_word, spacy_word])
print(table)

+------------+--------------+
|    NLTK    |    spaCy     |
+------------+--------------+
|  Patients  |   patient    |
|    late    |     late     |
|   middle   |    middle    |
|    age     |     age      |
|   smoked   |    smoke     |
|     20     |      20      |
| cigarette  |  cigarette   |
|    day     |     day      |
|   since    |     teen     |
|    teen    |  constitute  |
| constitute |     risk     |
|  at-risk   |    group     |
|   group    |    thing     |
|    One     |   clearly    |
|   thing    |     risk     |
|     ’      |    acute     |
|  clearly   |    sense     |
|    risk    |    guilt     |
|   acute    |  clinician   |
|   sense    |    incite    |
|   guilt    | immediately  |
| clinician  | consultation |
|   incite   |    tense     |
+------------+--------------+
