# AS-01 Pré-processamento Textual
Assignment 01 da matéria de tópicos 01.
 - **Aluno**: Gustavo Martins Lopes da Costa
 - **Matrícula**: 690773


In [1]:
import re
import unicodedata
import nltk
import gensim
import spacy
import nltk.tokenize as tokenizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


from keras.preprocessing.text import Tokenizer
from textblob.tokenizers import WordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gumar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gumar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gumar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Obtendo o Dataset

In [2]:
DATA_URL = "datasets\Shakespeare.txt"

In [3]:
with open(DATA_URL, "r") as dataset_file:
  text = dataset_file.read()

In [4]:
text



In [5]:
def save_token_list(token_list, filename):
  with open(filename, "w") as result_file:
    for token in token_list:
      result_file.write(token + "\n")

## 1. Normalização
---

Lower Case Reduction

In [6]:
normalized_text = text.lower()
normalized_text



Accent and diacritic removal

In [7]:
def remove_accents(input_text):
  nfkd_form = unicodedata.normalize('NFKD', input_text)
  return u"".join([char for char in nfkd_form if not unicodedata.combining(char)])

In [8]:
normalized_text = remove_accents(normalized_text)
normalized_text



Canonicalizing of acronyms, currency, date and hyphenated words

In [9]:
normalized_text = re.sub('\.(?!(\$[^. ])\d)', '', normalized_text)
normalized_text



Punctuation and special character removal

In [10]:
normalized_text = re.sub('(?<!\d)[.,;!?\'\(\)#:-](?!\d)', '', normalized_text)
# Remoção de espaços extras
normalized_text = re.sub(' +', ' ', normalized_text)
normalized_text



Salvando o resultado

In [11]:
with open("Shakespeare_Normalized.txt", "w") as normalization_file:
  normalization_file.write(normalized_text)

## 2 - Tokenização
---

In [12]:
def save_tokenization(token_list, number):
  with open(f"Shakespeare_Normalized_Tokenized_{number}.txt", "w") as tokenization_file:
    for token in token_list:
      tokenization_file.write(token + "\n")

White Space Tokenization

In [13]:
Shakespeare_Normalized_Tokenized01 = normalized_text.split()
Shakespeare_Normalized_Tokenized01
save_token_list(Shakespeare_Normalized_Tokenized01,"Shakespeare_Normalized_Tokenized_01.txt")

NLTK: Word Tokenizer

In [14]:
Shakespeare_Normalized_Tokenized02 = tokenizer.word_tokenize(normalized_text)
Shakespeare_Normalized_Tokenized02
save_token_list(Shakespeare_Normalized_Tokenized02,"Shakespeare_Normalized_Tokenized_02.txt")

NLTK: Tree Bank Tokenizer

In [15]:
Shakespeare_Normalized_Tokenized03 = tokenizer.TreebankWordTokenizer().tokenize(normalized_text)
Shakespeare_Normalized_Tokenized03
save_token_list(Shakespeare_Normalized_Tokenized03,"Shakespeare_Normalized_Tokenized_03.txt")

NLTK: Word Punctuation Tokenizer

In [16]:
Shakespeare_Normalized_Tokenized04 = tokenizer.WordPunctTokenizer().tokenize(normalized_text)
Shakespeare_Normalized_Tokenized04
save_token_list(Shakespeare_Normalized_Tokenized04,"Shakespeare_Normalized_Tokenized_04.txt")

NLTK: Tweet Tokenizer

In [17]:
Shakespeare_Normalized_Tokenized05 = tokenizer.TweetTokenizer().tokenize(normalized_text)
Shakespeare_Normalized_Tokenized05
save_token_list(Shakespeare_Normalized_Tokenized05,"Shakespeare_Normalized_Tokenized_05.txt")

NLTK: MWE Tokenizer

In [18]:
Shakespeare_Normalized_Tokenized06 = tokenizer.MWETokenizer().tokenize(normalized_text)
Shakespeare_Normalized_Tokenized06
save_token_list(Shakespeare_Normalized_Tokenized06,"Shakespeare_Normalized_Tokenized_06.txt")

TextBlob Word Tokenizer

In [19]:
textblob_tokenizer = WordTokenizer()
Shakespeare_Normalized_Tokenized07  = textblob_tokenizer.tokenize(normalized_text)
Shakespeare_Normalized_Tokenized07
save_token_list(Shakespeare_Normalized_Tokenized07,"Shakespeare_Normalized_Tokenized_07.txt")

spaCy Tokenizer

In [20]:

spaCy_tokenizer = spacy.load("en_core_web_sm")
tokenized = spaCy_tokenizer(normalized_text)
Shakespeare_Normalized_Tokenized08 = [token.text for token in tokenized]
Shakespeare_Normalized_Tokenized08
save_token_list(Shakespeare_Normalized_Tokenized08,"Shakespeare_Normalized_Tokenized_08.txt")

Gensim Word Tokenizer

In [21]:
Shakespeare_Normalized_Tokenized09 = list(gensim.utils.tokenize(normalized_text))
Shakespeare_Normalized_Tokenized09
save_token_list(Shakespeare_Normalized_Tokenized09,"Shakespeare_Normalized_Tokenized_09.txt")

Keras Tokenization

In [22]:
# Transformação do texto para input no tokenizer
text_list = [normalized_text]

# Setup do tokenizer
tokenizer = Tokenizer(num_words = 100)
tokenizer.fit_on_texts(text_list)

Shakespeare_Normalized_Tokenized10 = list(tokenizer.word_index.keys())
Shakespeare_Normalized_Tokenized10
save_token_list(Shakespeare_Normalized_Tokenized10,"Shakespeare_Normalized_Tokenized_10.txt")

## Stop-words Removal
---

In [23]:
sw = stopwords.words('english')
Shakespeare_Normalized_Tokenized_StopWord = [
    word for word in Shakespeare_Normalized_Tokenized02 if not word in sw
]

Shakespeare_Normalized_Tokenized_StopWord

['poor',
 'contempt',
 'claimd',
 'thou',
 'slept',
 'faithful',
 'may',
 'contrive',
 'father',
 'defeated',
 'queen',
 'flesh',
 'broke',
 'puttance',
 'expedition',
 'house',
 'ever',
 'lament',
 'stomach',
 'butly',
 'fury',
 'knowing',
 'everything',
 'grew',
 'daily',
 'ever',
 'great',
 'strength',
 'thought',
 'bright',
 'buds',
 'mine',
 'biondello',
 'marry',
 'may',
 'pray',
 'patience',
 'king',
 'lear',
 'instant',
 'common',
 'maid',
 'may',
 'less',
 'brave',
 'gentleman',
 'joiner',
 'finds',
 'us',
 'wax',
 'owe',
 'full',
 'presence',
 'fooder',
 'staves',
 'remorsed',
 'bridals',
 'man',
 'grace',
 'every',
 'business',
 'tongue',
 'thinking',
 'contends',
 'hath',
 'respected',
 'thee',
 'biron',
 'left',
 'thee',
 'ill',
 'die',
 'blessed',
 'reasonable',
 'nature',
 'honour',
 'bosom',
 'safe',
 'others',
 'speedybirth',
 'bill',
 'forestem',
 'richard',
 'heart',
 'questiond',
 'enough',
 'partier',
 'forth',
 'obsers',
 'dpunishd',
 'hate',
 'restraints',
 'woul

In [24]:
save_token_list(Shakespeare_Normalized_Tokenized_StopWord, "Shakespeare_Normalized_Tokenized_StopWord.txt")

## Text Lemmatization
---

In [25]:
lemmatizer = WordNetLemmatizer()
Shakespeare_Normalized_Tokenized_StopWord_Lemmatized = [
    lemmatizer.lemmatize(word) for word in Shakespeare_Normalized_Tokenized_StopWord
]
Shakespeare_Normalized_Tokenized_StopWord_Lemmatized

['poor',
 'contempt',
 'claimd',
 'thou',
 'slept',
 'faithful',
 'may',
 'contrive',
 'father',
 'defeated',
 'queen',
 'flesh',
 'broke',
 'puttance',
 'expedition',
 'house',
 'ever',
 'lament',
 'stomach',
 'butly',
 'fury',
 'knowing',
 'everything',
 'grew',
 'daily',
 'ever',
 'great',
 'strength',
 'thought',
 'bright',
 'bud',
 'mine',
 'biondello',
 'marry',
 'may',
 'pray',
 'patience',
 'king',
 'lear',
 'instant',
 'common',
 'maid',
 'may',
 'le',
 'brave',
 'gentleman',
 'joiner',
 'find',
 'u',
 'wax',
 'owe',
 'full',
 'presence',
 'fooder',
 'stave',
 'remorsed',
 'bridal',
 'man',
 'grace',
 'every',
 'business',
 'tongue',
 'thinking',
 'contends',
 'hath',
 'respected',
 'thee',
 'biron',
 'left',
 'thee',
 'ill',
 'die',
 'blessed',
 'reasonable',
 'nature',
 'honour',
 'bosom',
 'safe',
 'others',
 'speedybirth',
 'bill',
 'forestem',
 'richard',
 'heart',
 'questiond',
 'enough',
 'partier',
 'forth',
 'obsers',
 'dpunishd',
 'hate',
 'restraint',
 'would',
 'go

In [26]:
save_token_list(Shakespeare_Normalized_Tokenized_StopWord_Lemmatized, "Shakespeare_Normalized_Tokenized_StopWord_Lemmatized.txt")

## Text Stemming
---

Porter Stemmer

In [27]:
stemmer = PorterStemmer()
Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming01 =  [
    stemmer.stem(word) for word in
    Shakespeare_Normalized_Tokenized_StopWord_Lemmatized
]
Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming01

['poor',
 'contempt',
 'claimd',
 'thou',
 'slept',
 'faith',
 'may',
 'contriv',
 'father',
 'defeat',
 'queen',
 'flesh',
 'broke',
 'puttanc',
 'expedit',
 'hous',
 'ever',
 'lament',
 'stomach',
 'butli',
 'furi',
 'know',
 'everyth',
 'grew',
 'daili',
 'ever',
 'great',
 'strength',
 'thought',
 'bright',
 'bud',
 'mine',
 'biondello',
 'marri',
 'may',
 'pray',
 'patienc',
 'king',
 'lear',
 'instant',
 'common',
 'maid',
 'may',
 'le',
 'brave',
 'gentleman',
 'joiner',
 'find',
 'u',
 'wax',
 'owe',
 'full',
 'presenc',
 'fooder',
 'stave',
 'remors',
 'bridal',
 'man',
 'grace',
 'everi',
 'busi',
 'tongu',
 'think',
 'contend',
 'hath',
 'respect',
 'thee',
 'biron',
 'left',
 'thee',
 'ill',
 'die',
 'bless',
 'reason',
 'natur',
 'honour',
 'bosom',
 'safe',
 'other',
 'speedybirth',
 'bill',
 'forestem',
 'richard',
 'heart',
 'questiond',
 'enough',
 'partier',
 'forth',
 'obser',
 'dpunishd',
 'hate',
 'restraint',
 'would',
 'got',
 'partli',
 'autolycu',
 'hath',
 'sa

In [28]:
save_token_list(Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming01, "Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming_01.txt")

Snowball Stemmer

In [29]:
stemmer = SnowballStemmer('english')
Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming02 = [
    stemmer.stem(word) for word in
    Shakespeare_Normalized_Tokenized_StopWord_Lemmatized
]

Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming02

['poor',
 'contempt',
 'claimd',
 'thou',
 'slept',
 'faith',
 'may',
 'contriv',
 'father',
 'defeat',
 'queen',
 'flesh',
 'broke',
 'puttanc',
 'expedit',
 'hous',
 'ever',
 'lament',
 'stomach',
 'but',
 'furi',
 'know',
 'everyth',
 'grew',
 'daili',
 'ever',
 'great',
 'strength',
 'thought',
 'bright',
 'bud',
 'mine',
 'biondello',
 'marri',
 'may',
 'pray',
 'patienc',
 'king',
 'lear',
 'instant',
 'common',
 'maid',
 'may',
 'le',
 'brave',
 'gentleman',
 'joiner',
 'find',
 'u',
 'wax',
 'owe',
 'full',
 'presenc',
 'fooder',
 'stave',
 'remors',
 'bridal',
 'man',
 'grace',
 'everi',
 'busi',
 'tongu',
 'think',
 'contend',
 'hath',
 'respect',
 'thee',
 'biron',
 'left',
 'thee',
 'ill',
 'die',
 'bless',
 'reason',
 'natur',
 'honour',
 'bosom',
 'safe',
 'other',
 'speedybirth',
 'bill',
 'forestem',
 'richard',
 'heart',
 'questiond',
 'enough',
 'partier',
 'forth',
 'obser',
 'dpunishd',
 'hate',
 'restraint',
 'would',
 'got',
 'part',
 'autolycus',
 'hath',
 'sat',

In [30]:
save_token_list(Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming02, "Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming_02.txt")

## Análise do Vocabulário

In [31]:
import pandas as pd

### Análise do lemmatizador

In [32]:
# Obtendo os tokens
lem = Shakespeare_Normalized_Tokenized_StopWord_Lemmatized
# Obtendo as ocorrências
token_ocurrences_lem = [text.count(token) for token in lem]
# Obtendo os tamanhos de cada token
token_lengths_lem = [len(token) for token in lem]

In [33]:
# Passando os dados para o formato de lista
data_list_lem = [[tok, occ, len] for tok, occ, len in zip(lem, token_ocurrences_lem, token_lengths_lem)]
data_list_lem

[['poor', 19, 4],
 ['contempt', 4, 8],
 ['claimd', 0, 6],
 ['thou', 143, 4],
 ['slept', 2, 5],
 ['faithful', 2, 8],
 ['may', 27, 3],
 ['contrive', 1, 8],
 ['father', 23, 6],
 ['defeated', 1, 8],
 ['queen', 19, 5],
 ['flesh', 5, 5],
 ['broke', 4, 5],
 ['puttance', 1, 8],
 ['expedition', 2, 10],
 ['house', 15, 5],
 ['ever', 75, 4],
 ['lament', 4, 6],
 ['stomach', 3, 7],
 ['butly', 0, 5],
 ['fury', 3, 4],
 ['knowing', 2, 7],
 ['everything', 1, 10],
 ['grew', 0, 4],
 ['daily', 1, 5],
 ['ever', 75, 4],
 ['great', 31, 5],
 ['strength', 6, 8],
 ['thought', 19, 7],
 ['bright', 2, 6],
 ['bud', 2, 3],
 ['mine', 22, 4],
 ['biondello', 0, 9],
 ['marry', 4, 5],
 ['may', 27, 3],
 ['pray', 18, 4],
 ['patience', 3, 8],
 ['king', 52, 4],
 ['lear', 3, 4],
 ['instant', 1, 7],
 ['common', 4, 6],
 ['maid', 9, 4],
 ['may', 27, 3],
 ['le', 398, 2],
 ['brave', 4, 5],
 ['gentleman', 7, 9],
 ['joiner', 1, 6],
 ['find', 11, 4],
 ['u', 2504, 1],
 ['wax', 2, 3],
 ['owe', 30, 3],
 ['full', 13, 4],
 ['presence', 1, 

In [34]:
# Transformando em dataframe
df_lemmatizer = pd.DataFrame(
  data_list_lem,
  columns = ['Token', 'Ocurrence', 'Length']
)

df_lemmatizer

Unnamed: 0,Token,Ocurrence,Length
0,poor,19,4
1,contempt,4,8
2,claimd,0,6
3,thou,143,4
4,slept,2,5
...,...,...,...
9512,ill,171,3
9513,take,32,4
9514,bloody,15,6
9515,back,11,4


In [35]:
token_quantity_lem = df_lemmatizer['Token'].count()
mean_length_lem = df_lemmatizer['Length'].mean()
mean_occurence_lem = df_lemmatizer['Ocurrence'].mean()
with open("Shakespeare_Vocabulary_Analysis.txt", "w") as analysis_file:
  analysis_file.write(f'Quantidade de tokens para o lemmatizador: {token_quantity_lem}\n')
  analysis_file.write(f'Tamanho médio dos tokens para o lemmatizador: {mean_length_lem}\n')
  analysis_file.write(f'Ocorrência média de tokens para o lemmatizador: {mean_occurence_lem}\n\n')

### Análise do stemming 01Porter Stemmer

In [36]:
# Obtendo os tokens
stem_01 = Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming01
# Obtendo as ocorrências
token_ocurrences_01 = [text.count(token) for token in stem_01]
# Obtendo os tamanhos de cada token
token_lengths_01 = [len(token) for token in stem_01]

In [37]:
# Passando os dados para o formato de lista
data_list_01 = [[tok, occ, len] for tok, occ, len in zip(stem_01, token_ocurrences_01, token_lengths_01)]

Transformando os dados em dataframe do pandas

In [38]:
# Transformando em dataframe
df_stemming_01 = pd.DataFrame(
  data_list_01,
  columns = ['Token', 'Ocurrence', 'Length']
)
df_stemming_01.to_csv("Shakespeare_Vocabulary_Porter.csv", index=False)

df_stemming_01

Unnamed: 0,Token,Ocurrence,Length
0,poor,19,4
1,contempt,4,8
2,claimd,0,6
3,thou,143,4
4,slept,2,5
...,...,...,...
9512,ill,171,3
9513,take,32,4
9514,bloodi,0,6
9515,back,11,4


In [39]:
token_quantity_01 = df_stemming_01['Token'].count()
mean_length_01 = df_stemming_01['Length'].mean()
mean_occurence_01 = df_stemming_01['Ocurrence'].mean()
with open("Shakespeare_Vocabulary_Analysis.txt", "a") as analysis_file:
  analysis_file.write(f'\nQuantidade de tokens para o stemming 01: {token_quantity_01}')
  analysis_file.write(f'\nTamanho médio dos tokens para o stemming 01: {mean_length_01}')
  analysis_file.write(f'\nOcorrência média de tokens para o stemming 01: {mean_occurence_01}\n\n')

### Análise dos dados para o Stemming 02 Snowball Stemmer

In [40]:
# Obtendo os tokens
stem_02 = Shakespeare_Normalized_Tokenized_StopWord_Lemmatized_Stemming02
# Obtendo as ocorrências
token_ocurrences_02 = [text.count(token) for token in stem_02]
# Obtendo os tamanhos de cada token
token_lengths_02 = [len(token) for token in stem_02]

In [41]:
# Passando os dados para o formato de lista
data_list_02 = [[tok, occ, len] for tok, occ, len in zip(stem_02, token_ocurrences_02, token_lengths_02)]

In [42]:
# Transformando em dataframe
df_stemming_02 = pd.DataFrame(
  data_list_02,
  columns = ['Token', 'Ocurrence', 'Length']
)
df_stemming_02.to_csv("Shakespeare_Vocabulary_Snowball.csv", index=False)

In [43]:
df_stemming_02

Unnamed: 0,Token,Ocurrence,Length
0,poor,19,4
1,contempt,4,8
2,claimd,0,6
3,thou,143,4
4,slept,2,5
...,...,...,...
9512,ill,171,3
9513,take,32,4
9514,bloodi,0,6
9515,back,11,4


In [44]:
token_quantity_02 = df_stemming_02['Token'].count()
mean_length_02 = df_stemming_02['Length'].mean()
mean_occurence_02 = df_stemming_02['Ocurrence'].mean()

with open("Shakespeare_Vocabulary_Analysis.txt", "a") as analysis_file:
  analysis_file.write(f'\nQuantidade de tokens para o stemming 02: {token_quantity_02}')
  analysis_file.write(f'\nTamanho médio dos tokens para o stemming 02: {mean_length_02}')
  analysis_file.write(f'\nOcorrência média de tokens para o stemming 02: {mean_occurence_02}\n\n')