# Data manipulation

### Libraries

In [43]:
# Libraries
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Required NLTK resources.
# Each resource is requested by name: a bare nltk.download() call opens the
# interactive downloader and stalls "Restart Kernel -> Run All".
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Raises LookupError right here if the punkt tokenizer data is still missing
nltk.data.find('tokenizers/punkt')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dbora\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dbora\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dbora\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


FileSystemPathPointer('C:\\Users\\dbora\\nltk_data\\tokenizers\\punkt')

In [44]:
import nltk
from nltk.tokenize import word_tokenize

# Sample sentence used to smoke-test tokenization
test_text = "Este é um teste simples para verificar se o punkt está funcionando."

# Try to tokenize the sentence; report a LookupError instead of letting it propagate
try:
    tokens = word_tokenize(test_text)
except LookupError as lookup_error:
    print("Erro de Lookup:", str(lookup_error))
else:
    print("Tokenização bem-sucedida:", tokens)

Tokenização bem-sucedida: ['Este', 'é', 'um', 'teste', 'simples', 'para', 'verificar', 'se', 'o', 'punkt', 'está', 'funcionando', '.']


### Data collection

In [45]:
# Sample corpus: four short Portuguese sentences processed by the cells below
texts = [
    "O gato está no telhado.",
    "A chuva cai sem parar.",
    "Gosto de assistir filmes nos finais de semana.",
    "Ele está estudando para as provas finais.",
]

### Data processing

In [46]:
# Function to remove stopwords and non-alphabetic characters and return a list of clean words
_STOP_WORDS_CACHE = {}  # language name -> frozenset of stopwords, filled on first use


def _load_stop_words(language):
    """Returns the NLTK stopword set for `language`, reading the corpus only on first use."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def clean_text(text):
    """
    Lowercases and tokenizes the input text, then drops Portuguese stopwords and
    every token that is not purely alphabetic.

    Parameters:
    text (str): The input text to be cleaned.

    Returns:
    list: The remaining lowercase tokens, in their original order.
    """
    # Cached lookup: the stopword corpus is not re-read on every call
    stop_words = _load_stop_words('portuguese')
    # str.isalpha() rejects punctuation, digits and tokens mixing letters with other characters
    return [
        word
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words
    ]

In [47]:
# Function to return a list of texts as str
def clean_texts(texts):
    """
    Runs clean_text on every text and joins each resulting word list into a
    single space-separated string.

    Parameters:
    texts (list of str): The texts to be cleaned.

    Returns:
    list of str: One cleaned string per input text, in the same order.
    """
    cleaned = []
    for text in texts:
        cleaned.append(" ".join(clean_text(text)))
    return cleaned

# Clean every sample text
cleaned_texts = clean_texts(texts)

# Label on the first line, cleaned list on the second
print("Cleaned texts:", cleaned_texts, sep="\n")

Cleaned texts:
['gato telhado', 'chuva cai parar', 'gosto assistir filmes finais semana', 'estudando provas finais']


### Lemmatization

In [42]:
# Function that returns lemmatized words
def lemmatize_words(words):
    """
    Lemmatizes each word with a newly created NLTK WordNetLemmatizer.

    NOTE(review): WordNet is an English lexical resource, while the sample texts
    in this notebook are Portuguese and come out of this step unchanged --
    confirm this is the intended lemmatizer.

    Parameters:
    words (list of str): The words to be lemmatized.

    Returns:
    list of str: One lemma per input word, in the same order.
    """
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, words))


In [None]:
# Function that applies lemmatization and then joins the lemmatized words back into a str
def lemmatize_texts(texts):
    """
    Splits each text on whitespace, lemmatizes the words with lemmatize_words
    and joins them back into a single space-separated string.

    Parameters:
    texts (list of str): The texts to be lemmatized.

    Returns:
    list of str: One lemmatized string per input text, in the same order.
    """
    lemmatized = []
    for text in texts:
        words = text.split()
        lemmatized.append(" ".join(lemmatize_words(words)))
    return lemmatized

# Lemmatize the cleaned texts
lemmatized_texts = lemmatize_texts(cleaned_texts)

# Label on the first line, lemmatized list on the second
print("Lemmatized texts:", lemmatized_texts, sep="\n")


Lemmatized texts:
['gato telhado', 'chuva cai parar', 'gosto assistir filmes finais semana', 'estudando provas finais']


### Vectorization

TF-IDF

In [16]:
# Function that creates TF-IDF vectors
def create_tfidf_vectors(texts):
    """
    Fits a new TfidfVectorizer (default settings) on the texts and transforms them.

    Parameters:
    texts (list of str): The texts to be turned into TF-IDF vectors.

    Returns:
    tuple: (matrix returned by fit_transform, the fitted TfidfVectorizer).
    """
    vectorizer = TfidfVectorizer()
    return vectorizer.fit_transform(texts), vectorizer

# Build the TF-IDF matrix and keep the fitted vectorizer for later feature-name lookup.
# NOTE: lemmatized_texts is created by the lemmatization cell; running this cell
# before that one raises NameError.
tfidf_vectors, tfidf_vectorizer = create_tfidf_vectors(lemmatized_texts)


NameError: name 'lemmatized_texts' is not defined

In [None]:
# Feature names reported by the fitted TF-IDF vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()
# Array form of the TF-IDF matrix, so rows can be indexed directly
dense_matrix = tfidf_vectors.toarray()

# For each text, print only the terms whose value is positive
for doc_number in range(1, len(lemmatized_texts) + 1):
    print(f"\nText {doc_number}:")
    row = dense_matrix[doc_number - 1]
    for term, weight in zip(feature_names, row):
        if weight > 0:
            print(f"{term}: {weight}")


Text 1:
gato: 0.7071067811865476
telhado: 0.7071067811865476

Text 2:
cai: 0.5773502691896257
chuva: 0.5773502691896257
parar: 0.5773502691896257

Text 3:
assistir: 0.4651619335222394
filmes: 0.4651619335222394
finais: 0.3667390112974172
gosto: 0.4651619335222394
semana: 0.4651619335222394

Text 4:
estudando: 0.6176143709756019
finais: 0.48693426407352264
provas: 0.6176143709756019


Bag of Words

In [None]:
# Function that creates Bag of Words vectors
def create_bow_vectors(texts):
    """
    Fits a new CountVectorizer (default settings) on the texts and transforms them.

    Parameters:
    texts (list of str): The texts to be turned into Bag of Words (BoW) vectors.

    Returns:
    tuple: (matrix returned by fit_transform, the fitted CountVectorizer).
    """
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(texts), vectorizer

# Build the BoW matrix and keep the fitted vectorizer for later feature-name lookup.
# NOTE: lemmatized_texts is created by the lemmatization cell, which must run first.
bow_vectors, bow_vectorizer = create_bow_vectors(lemmatized_texts)


In [None]:
# Display the feature names reported by the fitted BoW vectorizer.
# NOTE: this rebinds feature_names, replacing the TF-IDF feature names assigned earlier.
feature_names = bow_vectorizer.get_feature_names_out()
print("Feature names:", feature_names)

Feature names: ['assistir' 'cai' 'chuva' 'estudando' 'filmes' 'finais' 'gato' 'gosto'
 'parar' 'provas' 'semana' 'telhado']


In [None]:
# Convert the BoW matrix to an array and show it under a label.
# NOTE: this rebinds dense_matrix, replacing the TF-IDF array assigned earlier.
dense_matrix = bow_vectors.toarray()
print("Dense matrix:", dense_matrix, sep="\n")

Dense matrix:
[[0 0 0 0 0 0 1 0 0 0 0 1]
 [0 1 1 0 0 0 0 0 1 0 0 0]
 [1 0 0 0 1 1 0 1 0 0 1 0]
 [0 0 0 1 0 1 0 0 0 1 0 0]]
