In [63]:
import pandas as pd
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import re

# Download the NLTK data packages this notebook relies on. The pairing with the
# calls below is the usual one (tokenizer model for word_tokenize, stop-word
# lists, the WordNet database for the lemmatizer, and the perceptron model for
# nltk.pos_tag) — NOTE(review): confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Last expression of the cell: its return value is what the cell displays.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gustavohroos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gustavohroos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gustavohroos/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/gustavohroos/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [64]:
# Read the tab-separated file 'Lorde.tsv' (path relative to the notebook's working directory).
dados = pd.read_csv('Lorde.tsv', sep='\t')
# Drop the 'Unnamed: 0' column (presumably an index saved with the file — TODO confirm).
# `drop` returns a new frame, so `dados` keeps the raw data.
dados_limpos = dados.drop(columns=['Unnamed: 0'])

In [65]:
# Preprocessing
def preparar_texto(texto):
    """Convert one raw text into a list of lower-case, lemmatized tokens.

    Pipeline:
      1. Remove every character that is not an ASCII letter or whitespace,
         then lower-case the result.
      2. Tokenize with ``word_tokenize``.
      3. Drop English stop words.
      4. Lemmatize each remaining token, using a POS tag computed for the
         word in isolation to choose the WordNet part of speech (noun when
         the tag is not adjective/noun/verb/adverb).

    Args:
        texto: The raw text. Any non-string value (e.g. ``None`` or the
            ``NaN`` pandas uses for missing cells) is treated as empty.

    Returns:
        list[str]: The lemmatized tokens, in their original order. Empty
        when there is nothing left after cleaning.
    """
    # Missing values have no text to process; return an empty document
    # instead of letting re.sub raise a TypeError.
    if not isinstance(texto, str):
        return []

    # NOTE(review): apostrophes are removed here, so contractions collapse to
    # forms such as "im" / "youre" / "wont" that are not in the stop-word
    # list and therefore survive filtering. Kept as-is to preserve the
    # existing vocabulary.
    texto = re.sub(r'[^a-zA-Z\s]', '', texto).lower()
    if not texto.strip():
        # Nothing but whitespace remains; skip tokenizer/tagger work.
        return []

    tokens = word_tokenize(texto)

    # Fetch the stop-word list once and use a set, rather than calling
    # stopwords.words() and scanning a list for every token.
    palavras_vazias = set(stopwords.words('english'))
    tokens_limpos = [palavra for palavra in tokens if palavra not in palavras_vazias]
    if not tokens_limpos:
        return []

    # First letter of the Penn Treebank tag -> WordNet part of speech.
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    # Tag every word as its own one-token "sentence": this yields the same
    # tags as calling nltk.pos_tag([word]) per word, but goes through
    # pos_tag_sents, which NLTK documents as the efficient way to tag many
    # sentences in one call.
    sentencas_etiquetadas = nltk.pos_tag_sents([[palavra] for palavra in tokens_limpos])

    lemmatizer = WordNetLemmatizer()
    tokens_lematizados = []
    for sentenca in sentencas_etiquetadas:
        palavra, etiqueta = sentenca[0]
        pos = tag_dict.get(etiqueta[0].upper(), wordnet.NOUN)
        tokens_lematizados.append(lemmatizer.lemmatize(palavra, pos))

    return tokens_lematizados

# Run the preprocessing on every value of 'Corpus' and store the resulting
# token lists in a new 'Texto_Preparado' column.
dados_limpos['Texto_Preparado'] = dados_limpos['Corpus'].apply(preparar_texto)
# Display the raw text next to its token list for the first rows as a sanity check.
dados_limpos[['Corpus', 'Texto_Preparado']].head()


Unnamed: 0,Corpus,Texto_Preparado
0,I'm a liability I'm a liability Much for me Yo...,"[im, liability, im, liability, much, youre, li..."
1,"You asked if I was feeling it, I’m psycho high...","[ask, feel, im, psycho, high, know, wont, reme..."
2,(Go back and tell it) Please could you be tend...,"[go, back, tell, please, could, tender, sit, c..."
3,"Night, midnight, lose my mind Night, midnight,...","[night, midnight, lose, mind, night, midnight,..."
4,"In my head, I play a supercut of us All the ma...","[head, play, supercut, u, magic, give, love, l..."


In [66]:
# Each row's token list is used as one training "sentence".
sentencas = dados_limpos['Texto_Preparado'].tolist()

# Train the model.
# Reproducibility: gensim documents that a deterministic run needs a fixed
# `seed` AND a single worker thread (multi-threaded training introduces
# ordering jitter), so workers is 1 here and the seed is spelled out
# (1 is gensim's default). Across interpreter launches PYTHONHASHSEED must
# also be fixed — NOTE(review): set it in the kernel environment if exact
# run-to-run equality is required.
# min_count=1 keeps every token, including words that occur only once.
modelo_word2vec = Word2Vec(
    sentences=sentencas,
    vector_size=100,
    window=5,
    min_count=1,
    seed=1,
    workers=1,
)

In [67]:
# Show the 10 words most similar to "feeling".
# The lookup raises KeyError if 'feeling' is not in the trained vocabulary,
# which depends on how the lemmatizer handled each occurrence of the word.
palavras_similares = modelo_word2vec.wv.most_similar('feeling', topn=10)

palavras_similares

[('though', 0.3209531307220459),
 ('different', 0.30907854437828064),
 ('psycho', 0.2959253489971161),
 ('well', 0.29396384954452515),
 ('tie', 0.286493718624115),
 ('send', 0.2776780128479004),
 ('cadillacs', 0.27569177746772766),
 ('perfect', 0.26952219009399414),
 ('else', 0.26385262608528137),
 ('grocery', 0.2627700865268707)]

In [68]:
# Embedding vector of the word "feeling" (length set by vector_size at training time).
vector = modelo_word2vec.wv['feeling']

vector


array([-0.00673199, -0.00689098, -0.00925254,  0.00349097, -0.00167444,
        0.00747178, -0.00784034,  0.00770493,  0.00914396,  0.00296698,
       -0.00655855,  0.00407521,  0.00661385, -0.007174  ,  0.00433556,
       -0.00256349,  0.00234737, -0.01017085,  0.00057437, -0.00867194,
       -0.00627812, -0.00465405, -0.00872754, -0.01012931, -0.00670051,
       -0.00346477,  0.00072443, -0.01140289, -0.00417207, -0.00471279,
        0.00622272,  0.00494807, -0.00350561,  0.00790052, -0.00857536,
        0.01076558, -0.00182819, -0.00069331,  0.0011879 ,  0.00431898,
       -0.00390576, -0.00238252,  0.00453693,  0.0005084 ,  0.0015834 ,
        0.00868869, -0.00955313, -0.00628194, -0.00525934, -0.00735084,
       -0.00642885, -0.00161124, -0.00074687,  0.00646724,  0.00832964,
        0.00058503,  0.00128625, -0.00871854,  0.0015272 , -0.00615092,
        0.00693059,  0.00099018,  0.00483957, -0.00736168,  0.00120669,
       -0.00879177, -0.00717424, -0.0009309 , -0.00727994, -0.00

In [69]:
# Compute the similarity between two words.
similaridade = modelo_word2vec.wv.similarity('love', 'heart')

# Display the similarity between "love" and "heart".
similaridade


0.65033346