# Libraries

In [None]:
import re # import re - a module that provides support for regular expressions
import nltk
import spacy
from nltk import SnowballStemmer
nltk.download('punkt')
nltk.download('stopwords')
!python -m spacy download es_core_news_sm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting es-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# Read Text

In [1]:
# text in English
text1 = """Hello Everybody. Welcome to the NLP class, hope you enjoy it a lot. Remember to do all your activities and assign homework. Have a nice day!"""
text2 = "Ethics are built right into the ideals and objectives of the United Nations "
text3 = "To be or not to be"
text4 = '"Ethics are built right into the ideals and objectives of the United Nations" \
#UNSG #UN @NY Society for Ethical Culture bit.ly/2guVelr'

In [None]:
# texto en Español
texto1 = """Hola a todos. Bienvenidos a la clase de NLP, ojalá disfruten la mataeria. Recuerden realizar todas las actividades y hacer los ejercicios propuestos. Que pasen un feliz día!"""

# Working with Text
## Text Cleaning

In [None]:
print(len(text1))
print(len(text2))
print(len(text3))
print(len(text4))
print(len(texto1))

140
76
18
134
173


In [None]:
# Return a list of the words, separating by ' '.
text_split1 = text1.split(' ')
len(text_split1)

26

In [None]:
text_split2 = text2.split(' ')
len(text_split2)

14

In [None]:
text_split3 = text3.split(' ')
len(text_split3)

6

In [None]:
text_split4 = text4.split(' ')
len(text_split4)

21

In [None]:
texto1_split = texto1.split(' ')
len(texto1_split)

28

In [None]:
# List comprehension allows us to find specific words:
# Capitalized words
[w for w in text_split1 if w.istitle()]

['Hello', 'Everybody.', 'Welcome', 'Remember', 'Have']

In [None]:
# Words that end in 's'
[w for w in text_split2 if w.endswith('s')]

['Ethics', 'ideals', 'objectives', 'Nations']

In [None]:
len(text_split3)

6

In [None]:
# Set of words
set(text_split3)

{'To', 'be', 'not', 'or', 'to'}

In [None]:
len(set([w.lower() for w in text_split3]))

4

In [None]:
# Finding hashtags:
[w for w in text_split4 if w.startswith('@')]

['@NY']

In [None]:
[w for w in text_split4 if re.search("#", w)]

['#UNSG', '#UN']

In [None]:
# Words that are greater than 3 letters long
[w for w in text_split2 if len(w) > 3]

['Ethics',
 'built',
 'right',
 'into',
 'ideals',
 'objectives',
 'United',
 'Nations']

# Tokens

In [None]:
# tokenize using sentences
sentence = nltk.sent_tokenize(text1)
for sen in sentence:
    print(sen)

Hello Everybody.
Welcome to the NLP class, hope you enjoy it a lot.
Remember to do all your activities and assign homework.
Have a nice day!


In [None]:
# tokenize using words
word = nltk.word_tokenize(text1)
for w in word:
    print(w)

Hello
Everybody
.
Welcome
to
the
NLP
class
,
hope
you
enjoy
it
a
lot
.
Remember
to
do
all
your
activities
and
assign
homework
.
Have
a
nice
day
!


In [None]:
# tokenize using regular expressions
word_regexp = nltk.regexp_tokenize(text1, "[\w']+")
for w in word_regexp:
    print(w)

Hello
Everybody
Welcome
to
the
NLP
class
hope
you
enjoy
it
a
lot
Remember
to
do
all
your
activities
and
assign
homework
Have
a
nice
day


In [None]:
token_espanol = nltk.data.load('tokenizers/punkt/spanish.pickle')
enunciado = token_espanol.tokenize(texto1)
for sen in enunciado:
    print(sen)

Hola a todos.
Bienvenidos a la clase de NLP, ojalá disfruten la mataeria.
Recuerden realizar todas las actividades y hacer los ejercicios propuestos.
Que pasen un feliz día!


In [None]:
tokens_palabras = nltk.word_tokenize(texto1)
for w in tokens_palabras:
    print(w)

Hola
a
todos
.
Bienvenidos
a
la
clase
de
NLP
,
ojalá
disfruten
la
mataeria
.
Recuerden
realizar
todas
las
actividades
y
hacer
los
ejercicios
propuestos
.
Que
pasen
un
feliz
día
!


# Stop words

In [None]:
sw_english = set(nltk.corpus.stopwords.words('english'))
print(sw_english)

{'her', "aren't", 'won', "weren't", 'she', 'for', 'hers', 'up', 'by', 'aren', 'if', 'out', 'at', 't', 'ourselves', 'him', 'than', "you're", 'having', 'in', 'from', 'under', 's', 'whom', 'below', 'hasn', 'ours', "don't", 'll', 'has', 'now', 'are', 'against', 'all', "didn't", 'on', 'as', 'yourself', 'more', "needn't", 'any', 'so', 'such', 'some', 'to', "you've", "hasn't", 'myself', 'until', "it's", 'above', 'because', 'and', 'nor', "wouldn't", 'most', 've', 'am', 'few', 'there', 'isn', 'my', 'o', 'again', 'an', 'what', 'itself', 'is', 'did', 'm', 'herself', 'himself', "shan't", 'about', 'i', 'with', 'down', 'very', 'had', "doesn't", 'we', 'be', 'both', 'not', 'our', "you'll", 'd', "haven't", 'over', 'or', 'was', 'should', 'during', "that'll", "mightn't", 'this', 'too', "should've", 'shan', 'yourselves', 'only', 'does', 'me', 'once', 'the', 'themselves', 'each', 'off', 'ma', 'just', 'into', 'how', 'weren', 'they', 'through', 'don', 'his', 'can', 'haven', 'these', 'being', 'you', 'your', '

In [None]:
sw_espanol = set(nltk.corpus.stopwords.words('spanish'))
print(sw_espanol)

{'nada', 'esta', 'hubiese', 'desde', 'fueron', 'ese', 'tuyo', 'habiendo', 'tuviésemos', 'habrás', 'estaremos', 'tenemos', 'vuestro', 'hay', 'tuvierais', 'tenidas', 'e', 'este', 'tuvimos', 'otras', 'su', 'estados', 'estadas', 'estos', 'hubiéramos', 'seríamos', 'muchos', 'sois', 'sentida', 'hayamos', 'tendrían', 'yo', 'seamos', 'serás', 'sí', 'fuiste', 'estuvo', 'donde', 'habían', 'estuvimos', 'qué', 'mía', 'sería', 'fuéramos', 'suya', 'fueran', 'las', 'del', 'hubieron', 'que', 'unos', 'tendríais', 'estés', 'serías', 'otra', 'tuyos', 'somos', 'también', 'quien', 'he', 'habrías', 'fue', 'estuvierais', 'estuve', 'esa', 'suyos', 'no', 'habíais', 'tendréis', 'habrían', 'se', 'estarían', 'están', 'estoy', 'hubierais', 'fuisteis', 'estuviéramos', 'otros', 'hubo', 'tienes', 'tendremos', 'esto', 'vuestras', 'estas', 'vosotros', 'tenida', 'habéis', 'esos', 'mi', 'habré', 'tuvieron', 'algo', 'tuyas', 'estaré', 'él', 'seré', 'vuestra', 'nosotros', 'tuviesen', 'algunos', 'hubieran', 'te', 'o', 'esta

In [None]:
for w in word:
    if w.lower() not in sw_english:
        print("Is not a SW = ", w)

Is not a SW =  Hello
Is not a SW =  Everybody
Is not a SW =  .
Is not a SW =  Welcome
Is not a SW =  NLP
Is not a SW =  class
Is not a SW =  ,
Is not a SW =  hope
Is not a SW =  enjoy
Is not a SW =  lot
Is not a SW =  .
Is not a SW =  Remember
Is not a SW =  activities
Is not a SW =  assign
Is not a SW =  homework
Is not a SW =  .
Is not a SW =  nice
Is not a SW =  day
Is not a SW =  !


In [None]:
for palabra in tokens_palabras:
    if palabra.lower() not in sw_espanol:
        print("No es SW = ", palabra)

No es SW =  Hola
No es SW =  .
No es SW =  Bienvenidos
No es SW =  clase
No es SW =  NLP
No es SW =  ,
No es SW =  ojalá
No es SW =  disfruten
No es SW =  mataeria
No es SW =  .
No es SW =  Recuerden
No es SW =  realizar
No es SW =  todas
No es SW =  actividades
No es SW =  hacer
No es SW =  ejercicios
No es SW =  propuestos
No es SW =  .
No es SW =  pasen
No es SW =  feliz
No es SW =  día
No es SW =  !


## Homework

Remove punctuation marks

In [None]:
my_list = {'a', 'b', 'c'}
my_list.update(['d', 'e'])
print(my_list)

{'a', 'd', 'e', 'b', 'c'}


# Punctuation marks

In [None]:
for w in word:
    if w.lower() not in sw_english and w.isalnum():
        print("Is not a SW = ", w)

Is not a SW =  Hello
Is not a SW =  Everybody
Is not a SW =  Welcome
Is not a SW =  NLP
Is not a SW =  class
Is not a SW =  hope
Is not a SW =  enjoy
Is not a SW =  lot
Is not a SW =  Remember
Is not a SW =  activities
Is not a SW =  assign
Is not a SW =  homework
Is not a SW =  nice
Is not a SW =  day


In [None]:
for palabra in tokens_palabras:
    if palabra.lower() not in sw_espanol and palabra.isalnum():
        print("No es SW = ", palabra)

No es SW =  Hola
No es SW =  Bienvenidos
No es SW =  clase
No es SW =  NLP
No es SW =  ojalá
No es SW =  disfruten
No es SW =  mataeria
No es SW =  Recuerden
No es SW =  realizar
No es SW =  todas
No es SW =  actividades
No es SW =  hacer
No es SW =  ejercicios
No es SW =  propuestos
No es SW =  pasen
No es SW =  feliz
No es SW =  día


In [None]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
for w in word:
    if w.lower() not in sw_english and w.lower() not in string.punctuation:
        print("Is not a SW = ", w)

Is not a SW =  Hello
Is not a SW =  Everybody
Is not a SW =  Welcome
Is not a SW =  NLP
Is not a SW =  class
Is not a SW =  hope
Is not a SW =  enjoy
Is not a SW =  lot
Is not a SW =  Remember
Is not a SW =  activities
Is not a SW =  assign
Is not a SW =  homework
Is not a SW =  nice
Is not a SW =  day


In [None]:
# update sw list with punctuation marks
sw_english.update(['.', ',', ':',';','?','!','(',')','[',']','{','}'])
print(sw_english)

{'m', 'will', 'then', 'whom', 'such', 'my', 'of', 'him', 'this', 'than', 'at', 'don', 'further', 'doesn', 'other', 'out', 'how', "doesn't", 'above', 'himself', 'from', 'the', "hasn't", '.', 'here', 'was', 'do', "she's", 'y', 'i', 'didn', 'if', 'what', 'nor', "mustn't", 'while', 'o', "wouldn't", 'in', 'hadn', 'against', "hadn't", 'been', 'she', 'below', 'weren', "aren't", 'had', "couldn't", 'themselves', 'an', 'off', 'very', 'into', 'ain', 'only', 'because', 'd', 'hasn', 'wasn', 'some', 'not', 'for', 'just', 'is', 'our', 're', 'as', 'up', 'both', 'no', 'am', 'myself', 'them', 'any', 'isn', 'once', 'during', 'down', "needn't", 'again', 'can', 'hers', 'haven', 'its', "isn't", 'where', 'on', "that'll", "shan't", 'until', 'did', 'each', 't', 'a', "you're", 'why', 'those', 'needn', "won't", 'theirs', 'their', 'when', 'more', ':', 'itself', 'your', "wasn't", "shouldn't", 'mightn', 'these', 'that', 'yourself', 'herself', "haven't", 'does', 'yourselves', 'too', "don't", 'couldn', 'ma', 'shouldn

In [None]:
for w in word:
    if w.lower() not in sw_english:
        print("Is not a SW = ", w)

Is not a SW =  Hello
Is not a SW =  Everybody
Is not a SW =  Welcome
Is not a SW =  NLP
Is not a SW =  class
Is not a SW =  hope
Is not a SW =  enjoy
Is not a SW =  lot
Is not a SW =  Remember
Is not a SW =  activities
Is not a SW =  assign
Is not a SW =  homework
Is not a SW =  nice
Is not a SW =  day


In [None]:
sw_espanol.update(['.', ',', ':',';','?','!','(',')','[',']','{','}'])
print(sw_espanol)

{'pero', 'fueras', 'este', 'también', 'por', 'hubisteis', 'seríamos', 'mi', 'tendremos', 'serías', 'hayan', 'muy', 'hubieses', 'estábamos', 'tuvieseis', 'algunos', 'estados', 'tuviste', 'estuviste', 'sobre', 'durante', 'hayas', 'otra', 'están', 'estaríais', 'hubiéramos', 'eres', 'tendrán', 'unos', 'ellas', 'mío', 'estuviésemos', 'tus', 'ese', 'nos', 'tu', 'estuvimos', 'fueron', 'habéis', 'fueran', 'tanto', 'tuyos', ':', 'sí', 'contra', 'estuvieseis', 'tienen', 'otras', 'esa', 'quienes', 'seré', 'habíamos', 'cual', 'nuestros', 'tenida', 'nosotros', 'habré', 'habrían', 'estos', '!', 'he', 'nosotras', 'habrán', 'hayamos', 'estás', 'estaban', 'tuyas', 'teníais', 'habrías', 'me', '(', 'más', 'estuvierais', 'estuviera', 'hayáis', 'estuvo', 'tenido', '{', ';', 'estaremos', 'sentida', 'entre', '[', 'estada', 'ni', 'habrás', 'tuviese', 'hubieseis', 'tenía', 'sintiendo', 'se', 'somos', 'fuera', 'tienes', 'tendría', 'seas', 'suya', 'estamos', 'los', 'eran', 'tenga', 'y', 'vuestro', 'todo', 'había

In [None]:
for palabra in tokens_palabras:
    if palabra.lower() not in sw_espanol:
        print("No es SW = ", palabra)

No es SW =  Hola
No es SW =  Bienvenidos
No es SW =  clase
No es SW =  NLP
No es SW =  ojalá
No es SW =  disfruten
No es SW =  mataeria
No es SW =  Recuerden
No es SW =  realizar
No es SW =  todas
No es SW =  actividades
No es SW =  hacer
No es SW =  ejercicios
No es SW =  propuestos
No es SW =  pasen
No es SW =  feliz
No es SW =  día


# Stemming

In [None]:
stemmer = SnowballStemmer('english')
for w in word:
    print(stemmer.stem(w))

hello
everybodi
.
welcom
to
the
nlp
class
,
hope
you
enjoy
it
a
lot
.
rememb
to
do
all
your
activ
and
assign
homework
.
have
a
nice
day
!


In [None]:
stemmer_espanol = SnowballStemmer('spanish')
for palabra in tokens_palabras:
    print(stemmer_espanol.stem(palabra))

hol
a
tod
.
bienven
a
la
clas
de
nlp
,
ojal
disfrut
la
mataeri
.
recuerd
realiz
tod
las
activ
y
hac
los
ejercici
propuest
.
que
pas
un
feliz
dia
!


# Lemmatization

In [None]:
nlp_english = spacy.load('en_core_web_sm')
for w in word:
    doc = nlp_english(w)
    for token in doc:
        print(token.lemma_)

hello
everybody
.
welcome
to
the
NLP
class
,
hope
you
enjoy
it
a
lot
.
remember
to
do
all
your
activity
and
assign
homework
.
have
a
nice
day
!


In [None]:
nlp_espanol = spacy.load('es_core_news_sm')
for palabra in tokens_palabras:
    doc = nlp_espanol(palabra)
    for token in doc:
        print(token.lemma_)

Hola
a
todo
.
bienvenido
a
el
clase
de
NLP
,
ojalá
disfrutir
el
mataerio
.
recuerdir
realizar
todo
el
actividad
y
hacer
el
ejercicio
propuesto
.
que
pasen
uno
feliz
día
!


# Contractions

In [None]:
import pandas as pd
!pip install contractions
import contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K

In [None]:
# text in English
text5 = "I'm happy he's here. I can't wait to start!"
text6 = "She's so beautiful, I'm starting to feel in love!"
text7 = "What's happening, is there something wrong?"

In [None]:
df = pd.DataFrame({'text': [text5, text6, text7]})
df

Unnamed: 0,text
0,I'm happy he's here. I can't wait to start!
1,"She's so beautiful, I'm starting to feel in love!"
2,"What's happening, is there something wrong?"


In [None]:
print(contractions.fix(text5))
print(contractions.fix(text6))
print(contractions.fix(text7))

I am happy he is here. I cannot wait to start!
She is so beautiful, I am starting to feel in love!
What is happening, is there something wrong?


In [None]:
for i in range(len(df)):
  df.loc[i, 'text'] = contractions.fix(df.loc[i, 'text'])
df

Unnamed: 0,text
0,I am happy he is here. I cannot wait to start!
1,"She is so beautiful, I am starting to feel in ..."
2,"What is happening, is there something wrong?"


In [None]:
contractions_hedict_en = {"can't": "cannot", "I'm": "I am", "he's": "he is", "she's": "she is", "it's": "it is", "what's": "what is", "I'll" : "I will"}

In [None]:
for contraction, expansion in contractions_dict_en.items():
    text5 = text5.replace(contraction, expansion)
    text6 = text6.replace(contraction, expansion)
    text7 = text7.replace(contraction, expansion)
print(text5)
print(text6)
print(text7)

I am happy he is here. I can't wait to start!
She is so beautiful, I am starting to feel in love!
What's happening, is there something wrong?


In [None]:
for i in range(len(df)):
    for contraction, expansion in contractions_dict_en.items():
        df.loc[i, 'text'] = df.loc[i, 'text'].replace(contraction, expansion)
df

Unnamed: 0,text
0,I am happy he is here. I can't wait to start!
1,"She is so beautiful, I am starting to feel in ..."
2,"What's happening, is there something wrong?"


In [None]:
# texto en Español
texto2 = "Vamos corriendo a jugar pa'l parque."
texto3 = "Tenemos que estudiar pa'que nos vaya bien en el examen."
texto4 = "El elvador va pa'rriba o pa'bajo?"

In [None]:
contractions_dict_es = {"pa'l": "para el", "pa'que": "para que", "pa'rriba": "para arriba", "pa'bajo": "para abajo"}

In [None]:
for contraction, expansion in contractions_dict_es.items():
    texto2 = texto2.replace(contraction, expansion)
    texto3 = texto3.replace(contraction, expansion)
    texto4 = texto4.replace(contraction, expansion)
print(texto2)
print(texto3)
print(texto4)

Vamos corriendo a jugar para el parque.
Tenemos que estudiar para que nos vaya bien en el examen.
El elvador va para arriba o para abajo?


# Slangs and idioms

## Example 1

In [None]:
%pip install englishidioms
from englishidioms import find_idioms

Collecting englishidioms
  Downloading englishidioms-0.1.0-py3-none-any.whl.metadata (11 kB)
Downloading englishidioms-0.1.0-py3-none-any.whl (30.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.9/30.9 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: englishidioms
Successfully installed englishidioms-0.1.0


In [None]:
# text in English
text8 = "your package is in the last mile"

In [None]:
results = find_idioms(text8, limit=2)
for i in results:
    print(i)

{'phrase': 'pack someone or something in†', 'definition': 'to press or push someone or something into something; to manage to get a lot of things or people into a place. _ The boys packed a lot of kids into a telephone booth as a gag. _ They packed in a lot of kids.'}
{'phrase': 'pack something in something', 'definition': 'to surround or enclose something in something. _ They packed his wounded hand in ice, then took him to the hospital. _Pack the vase in shredded paper before you close the box.'}


## Example 2

In [None]:
# texto en Español
texto5 = "Tu qué, te crees el muy muy?"

In [None]:
slangs_dict_es = {"muy muy": "muy bueno"}

In [None]:
for slang, phrase in slangs_dict_es.items():
    texto5 = texto5.replace(slang, phrase)
print(texto5)

Tu qué, te crees el muy bueno?


In [None]:
texto6 = "Hizimos poco tiempo en el camino, en realidad estábamos muy muy cerca!"

In [None]:
for slang, phrase in slangs_dict_es.items():
    texto6 = texto6.replace(slang, phrase)
print(texto6)

Hizimos poco tiempo en el camino, en realidad estábamos muy bueno cerca!
