# Estudo NLTK e Gensim

Notebook baseado no estudo do Homero

In [1]:
import string
import numpy as np
import pandas as pd

# NLTK
import nltk
from nltk.corpus import movie_reviews, stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

# Gemsin
import gensim
from gensim.models import Word2Vec

# SkLearn
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# XBGBoost
import xgboost as xgb
import tensorflow as tf

In [2]:
print("nltk.__version__", nltk.__version__)
print("sklearn.__version__", sklearn.__version__)
print("gensim.__version__", gensim.__version__)
print("xgboost.__version__", xgb.__version__)
print("tensorflow.__version__", tf.__version__)

nltk.__version__ 3.7
sklearn.__version__ 1.0.2
gensim.__version__ 4.3.0
xgboost.__version__ 1.7.5
tensorflow.__version__ 2.12.0


In [3]:
# download das bases do NLTK que serão utilizadas
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("words")
nltk.download('movie_reviews')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eric\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Eric\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Eric\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Eric\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Eric\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [4]:
stemmer = SnowballStemmer("english")

In [5]:
# Lista para armazenar documentos e labels
documents = []

# Iterar sobre as categorias
for category in movie_reviews.categories():
  for fileid in movie_reviews.fileids(category):
    # Obter o texto bruto do arquivo e a categoria
    doc = movie_reviews.raw(fileid)
    label = category
    # Adicionar o documento e label à lista
    documents.append((doc, label))

In [6]:
def pre_processed_docs(documents, remove_stopwords=True):
    # Pré-processamento
    stop_words = set(stopwords.words('english'))

    processed_docs = []
    processed_sentences = []
  
    for doc, label in documents:
        # Tokenização e conversão para minúsculas
        text = doc.lower()
        text = doc
        tokens = word_tokenize(text)

        # Remoção de pontuação
        tokens = [token for token in tokens if token not in string.punctuation]

        # Remoção de stopwords
        if remove_stopwords:
            tokens = [token for token in tokens if token not in stop_words]

        # Lematização
        # tokens = [lemmatizer.lemmatize(token) for token in tokens]

        # Stemmização
        tokens = [stemmer.stem(token) for token in tokens]
    
        # Adicionar documento processado e rótulo à lista
        processed_docs.append((tokens, label))
        processed_sentences.append(tokens)

    return processed_docs, processed_sentences 

In [None]:
processed_docs, processed_sentences = pre_processed_docs(documents)
processed_docs_complete, processed_sentences_complete = pre_processed_docs(documents, False)


In [None]:
labels = [label for doc, label in processed_docs]
print(np.unique(labels))

In [None]:
print(labels.count("neg"), labels.count("pos"))

In [None]:
# Análise documentos quando stopwords são removidas
tokens = [doc for doc, label in processed_docs]
tokens = [token for token in tokens for token in token]
frequencia = nltk.FreqDist(tokens)

print("Total de tokens:", len(tokens))
print("Total de tokens únicos:", len(set(tokens)))
print("Tokens mais comuns:", frequencia.most_common(50))


frequencia.plot(30)

In [None]:
# Análise documentos quando stopwords não são removidas
tokens = [doc for doc, label in processed_docs_complete]
tokens = [token for token in tokens for token in token]
frequencia = nltk.FreqDist(tokens)

print("Total de tokens:", len(tokens))
print("Total de tokens únicos:", len(set(tokens)))
print("Tokens mais comuns:", frequencia.most_common(50))


frequencia.plot(30)

## Utilização do Gensim para processamento dos dados

Pelo que eu estudei, o melhor é passar para o Word2Vec as frases inteiras ao invés de palavras.

In [None]:
model = Word2Vec(processed_sentences)
model_complete = Word2Vec(processed_sentences_complete)

In [None]:
# gravando o modelo
# desta maneira, posso acelerar o processamento, se necessário
model.save("model.bin")
model_complete.save("model_complete.bin")

Eu reparei que muitas palavras terminadas na letra "e" estão sem a letra "e".
Veja movie ou storie logo na primeira linha.
Preciso entender qual transformação está retirando o "e" final

In [None]:
# summarize vocabulary
words = list(model.wv.index_to_key)
print(words)

In [None]:
# access vector for one word
model.wv['movi']

In [None]:
# verifica a similaridade
model.wv.similarity("film" , "movi")

In [None]:
model.wv.most_similar(positive=["film"])

In [None]:
# quantidade mínima de palavras
tamanho_avaliacao = []
for words, label in documents:
    words_in_model = [word for word in words if word in model.wv]
    tamanho_avaliacao.append(len(words_in_model))


In [None]:
int(np.mean(tamanho_avaliacao))

In [None]:
# Convert text to feature vectors using Word2Vec embeddings
def text_to_features(documents, model, feature_size=0):

    # Preparar dados para o classificador
    X = []
    y = []
    
    label_map = {'neg': 0, 'pos': 1}

    for words, label in documents:
        word_vectors = [model.wv[word] for word in words if word in model.wv]
    
        if len(word_vectors) > 0:
            if feature_size == 0:
                document_vector = np.mean(word_vectors, axis=0)
            else:
                if len(word_vectors) > feature_size:
                    #trunca o word_vectors para o tamanho do vector_size
                    document_vector_tmp = word_vectors[:feature_size-1]
                else:
                    document_vector_tmp = word_vectors
                    #adiciona entradas 0 para completar o tamanho do vector_size
                    for i in range(len(word_vectors),feature_size):
                        document_vector_tmp.append([0]*model.wv.vector_size)

                new_shape = (document_vector_tmp[0], document_vector_tmp[1] * document_vector_tmp[2])

                # Reshape the array
                # transforma de 2D em uma lista flat
                document_vector = [item for sublist in document_vector_tmp for item in sublist]
                        
            X.append(document_vector)
            y.append(label_map[label])

    return X, y


In [None]:
def treina_avalia_modelo(document, model, feature_size=0):
    X , y = text_to_features(processed_docs, model, feature_size)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
    xgb_classifier = xgb.XGBClassifier()
    xgb_classifier.fit(X_train, y_train)
    
    # Make predictions
    y_pred = xgb_classifier.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy}')

    # Print a classification report for more detailed evaluation
    print(classification_report(y_test, y_pred))
    
    return xgb_classifier

In [None]:
print("Avaliando modelo sem stop words e utilizando a média no Word2Vec")
treina_avalia_modelo(processed_docs, model)

In [132]:
print("Avaliando modelo sem stop words e utilizando as primeiras 1582 entradas do Word2Vec como features")
treina_avalia_modelo(processed_docs, model, 10)

Avaliando modelo sem stop words e utilizando as primeiras 1582 entradas do Word2Vec como features


AttributeError: 'list' object has no attribute 'reshape'

In [129]:
import numpy as np

# Example 3D array
original_array = np.array([[[1, 2],
                            [3, 4]],
                           
                           [[5, 6],
                            [7, 8]],
                           
                           [[9, 10],
                            [11, 12]]])

# Get the original shape
original_shape = original_array.shape

# Calculate the new shape by combining the second and third dimensions
new_shape = (original_shape[0], original_shape[1] * original_shape[2])

# Reshape the array
reshaped_array = original_array.reshape(new_shape)

print("Original Shape:")
print(original_shape)
print("Reshaped Array:")
print(reshaped_array)

Original Shape:
(3, 2, 2)
Reshaped Array:
[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]


In [130]:
new_shape

(3, 4)