### 03- Gerar próprias embeddings 

In [None]:
#NLTK
from nltk.stem import RSLPStemmer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
# Download the NLTK resources this notebook asks for by name
# (stop-word lists, WordNet, Punkt tokenizer data, RSLP stemmer data).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('rslp')
# Module-level stemmer / lemmatizer instances, created once so that later
# cells can reuse them instead of re-instantiating per call.
stemmer_pt = RSLPStemmer()
stemmer_en = PorterStemmer()
lemmatizer = WordNetLemmatizer()
from nltk.tokenize import word_tokenize

#Python
import string

# Gensim
import gensim
from gensim.models import KeyedVectors

#Other libraries
from tqdm import tqdm
import numpy as np
import scipy
import joblib
from abc import ABCMeta, abstractmethod
import re
import json
import pandas as pd
import sys

In [2]:
import time

In [3]:
# Load the NLTK stop-word lists as sets: membership is tested once per token
# by TextPreprocessor.text_cleaner and by tokenizer.
stopwords_pt = set(stopwords.words('portuguese'))
stopwords_en = set(stopwords.words('english'))

In [14]:
# Read the lyrics dataset (path is relative to the notebook's working directory).
# Only the 'letras' column is used by the cells below.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column — presumably
# an index written by an earlier export; confirm whether index_col=0 is wanted.
data = pd.read_csv('data/raw/rock_mar21.csv')

In [15]:
data

Unnamed: 0,lang,genero,exib,titulo,artista,letras
0,en,Heavy Metal,132631,Black Diamond,Stratovarius,Again I see you standing there watching me.You...
1,en,Heavy Metal,118508,Hunting High And Low,Stratovarius,I feel the wind in my hair.And it's whispering...
2,en,Heavy Metal,76327,Destiny,Stratovarius,The times are changing so fast.I wonder how lo...
3,en,Heavy Metal,25696,Unbreakable,Stratovarius,Worthless without time.We follow how it flies ...
4,en,Heavy Metal,46118,Eagleheart,Stratovarius,All through the night he is lying awake.Wond'r...
...,...,...,...,...,...,...
13018,en,Soft Rock,13882,Graceland,Paul Simon,The Mississippi delta.Was shining like a natio...
13019,en,Soft Rock,1456,Cool Papa Bell,Paul Simon,It turns out to be.A great thing for me.I don'...
13020,en,Soft Rock,14645,The Obvious Child,Paul Simon,Well I'm accustomed to a smooth ride.Or maybe ...
13021,en,Soft Rock,10755,The Boxer,Paul Simon,I am just a poor boy though my story's seldom ...


In [6]:
class TextPreprocessor(object):
    """Configurable cleaner that turns a raw text into a space-joined token string.

    The steps always run in the same order: lower-casing, tokenization,
    stop-word removal, punctuation removal, number removal, number masking and
    finally lemmatization/stemming. Each step is enabled by a constructor flag.

    The class relies on module-level objects created in earlier cells:
    ``nltk``, ``string``, ``stopwords_en``, ``stopwords_pt``, ``lemmatizer``,
    ``stemmer_en`` and ``stemmer_pt``.
    """

    def __init__(self, language='en', remove_stopwords=True, remove_punctuation=True,
                 convert_numbers=False, remove_numbers=True, simplification=True,
                 simplification_type='lemmatization', lower_case=True):
        """Store the cleaning options.

        Args:
            language: 'en' or 'pt'. Any value other than 'en' selects the
                Portuguese stop-word list; the simplification step accepts
                only 'en' and 'pt'.
            remove_stopwords: drop tokens found in the stop-word list.
            remove_punctuation: drop tokens that occur in ``string.punctuation``.
            convert_numbers: replace each numeric token by a run of '0' of the
                same length. Has no visible effect while ``remove_numbers`` is
                on, because numeric tokens are removed first.
            remove_numbers: drop numeric tokens.
            simplification: lemmatize / stem the remaining tokens.
            simplification_type: 'lemmatization' or 'stemming'. Only consulted
                for English; Portuguese is always stemmed.
            lower_case: lower-case the text before tokenizing.
        """
        self.language = language
        self.remove_stopwords = remove_stopwords
        self.remove_punctuation = remove_punctuation
        self.convert_numbers = convert_numbers
        self.remove_numbers = remove_numbers
        self.simplification = simplification
        self.simplification_type = simplification_type
        self.lower_case = lower_case

    def text_cleaner(self, text):
        """Standardize ``text`` according to the configured options.

        Args:
            text: the raw string to clean.

        Returns:
            The surviving tokens joined by single spaces (stripped).

        Raises:
            ValueError: if simplification is enabled and the language or the
                simplification type is not supported.
        """
        # Named stop_words (not stopwords) so the imported NLTK corpus module
        # is not shadowed. Any language other than 'en' uses the Portuguese list.
        stop_words = stopwords_en if self.language == 'en' else stopwords_pt

        if self.lower_case:
            text = text.lower()

        tokens = nltk.word_tokenize(text)

        if self.remove_stopwords:
            tokens = [token for token in tokens if token not in stop_words]

        if self.remove_punctuation:
            # Substring test against the punctuation string: single punctuation
            # characters (and runs that appear contiguously in it) are dropped.
            tokens = [token for token in tokens if token not in string.punctuation]

        if self.remove_numbers:
            tokens = [token for token in tokens if not token.isnumeric()]

        if self.convert_numbers:
            # Mask digits but keep the length, e.g. "1984" -> "0000".
            tokens = ["0" * len(token) if token.isnumeric() else token
                      for token in tokens]

        if self.simplification:
            tokens = self._simplify(tokens)

        return ' '.join(tokens).strip()

    def _simplify(self, tokens):
        """Lemmatize or stem ``tokens`` depending on language and simplification type."""
        if self.language == 'en':
            if self.simplification_type == 'lemmatization':
                return [lemmatizer.lemmatize(token) for token in tokens]
            if self.simplification_type == 'stemming':
                return [stemmer_en.stem(token) for token in tokens]
            # The message used to blame the language; the type is what is wrong here.
            raise ValueError('Unsupported simplification type. Please, use '
                             'simplification_type = {"lemmatization","stemming"}.')
        if self.language == 'pt':
            # Portuguese text must go through the RSLP (Portuguese) stemmer,
            # not the English Porter stemmer.
            return [stemmer_pt.stem(token) for token in tokens]
        raise ValueError('Unsuported language. Please, use language = {"pt","en"}.')

In [7]:
# Characters that get blanked out. A character class is used so that '^' is a
# literal caret: as a bare alternative in "a|b|^" it is a start-of-string
# anchor and never matches the character itself.
_PUNCTUATION_RE = re.compile(r"[.,;:\-’!?´`^']")


def remove_punctuation(text):
    """Replace punctuation characters in ``text`` by spaces and strip the ends.

    Each of ``. , ; : - ’ ! ? ´ ` ^ '`` becomes a single space, so consecutive
    punctuation yields consecutive spaces; only leading/trailing whitespace is
    removed.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    return _PUNCTUATION_RE.sub(" ", text).strip()

In [8]:
def tokenizer(text):
    """Lower-case ``text`` and split it into a list of tokens.

    Punctuation tokens and English stop words are skipped; numeric tokens are
    masked by a run of '0' characters of the same length.
    """
    kept = []
    for raw in nltk.word_tokenize(text.lower()):
        if raw in string.punctuation or raw in stopwords_en:
            continue
        kept.append("0" * len(raw) if raw.isnumeric() else raw)
    return kept

In [16]:
# Clean every lyric with the default preprocessor settings, then blank out the
# punctuation characters that are still embedded in the cleaned strings.
text_preprocessor = TextPreprocessor()

texts = (
    data['letras']
    .apply(text_preprocessor.text_cleaner)
    .apply(remove_punctuation)
)

In [17]:
def build_model(model, window_size, dim_size, max_epochs, texts,
                alpha=0.025, min_alpha=0.0001, min_count=5, workers=4):
    """Train one embedding model on ``texts`` and export it in word2vec text format.

    The vectors are written to
    ``custom_rock_<model>_<max_epochs>_<window_size>_<dim_size>d.txt`` in the
    current working directory.

    Args:
        model: 'cbow', 'sg' (skip-gram) or 'fasttext'.
        window_size: context window passed to gensim as ``window``.
        dim_size: vector dimensionality passed to gensim as ``size``.
        max_epochs: number of training epochs passed to gensim as ``iter``.
        texts: pandas Series of strings; each one is tokenized with ``tokenizer``.
        alpha: initial learning rate.
        min_alpha: final learning rate.
        min_count: words with fewer occurrences are ignored.
        workers: number of training threads.

    Raises:
        ValueError: if ``model`` is not one of the supported names.
    """
    # (model class, sg flag) per architecture. The flag has to be chosen BEFORE
    # the model is constructed: it used to be assigned after training, so the
    # "sg" and "fasttext" models were silently trained as CBOW (sg=0).
    architectures = {
        'cbow': (gensim.models.Word2Vec, 0),
        'sg': (gensim.models.Word2Vec, 1),
        'fasttext': (gensim.models.FastText, 1),
    }
    # Validate up front so a bad name fails before the expensive tokenization.
    if model not in architectures:
        raise ValueError('Unsuported language model. Please, use language model = {"cbow","sg","fasttext"}.')
    model_class, sg = architectures[model]

    list_tokens_texts = texts.apply(tokenizer)

    output = 'custom_rock_{}_{}_{}_{}d.txt'.format(model, max_epochs, window_size, dim_size)

    print('Bulding '+output+' model')

    # `size` and `iter` are the gensim 3.x keyword names, matching the rest of
    # this notebook (gensim 4 renamed them to `vector_size` and `epochs`).
    language_model = model_class(list_tokens_texts, sg=sg, min_count=min_count,
                                 window=window_size, size=dim_size, workers=workers,
                                 iter=max_epochs, alpha=alpha, min_alpha=min_alpha)
    print()

    language_model.wv.save_word2vec_format(fname=output)

In [19]:
# Hyper-parameter grid: every combination (3 * 3 * 4 * 4 = 144) is passed to
# build_model by the loop in the next cell.
#   models         -> architecture names accepted by build_model
#   window_sizes   -> context window sizes
#   num_dimensions -> embedding vector sizes
#   num_max_epochs -> number of training epochs
models = ['sg','cbow','fasttext'] 
window_sizes = [5, 8, 10]
num_dimensions = [25, 50, 100, 300]
num_max_epochs = [1, 3, 5, 50]

In [20]:
# Train and export one embedding file per grid combination; the epoch count
# varies fastest and the architecture slowest.
for model, window_size, dim_size, max_epochs in (
    (name, window, dim, epochs)
    for name in models
    for window in window_sizes
    for dim in num_dimensions
    for epochs in num_max_epochs
):
    build_model(model, window_size, dim_size, max_epochs, texts)

Bulding custom_rock_sg_1_5_25d.txt model

Bulding custom_rock_sg_3_5_25d.txt model

Bulding custom_rock_sg_5_5_25d.txt model

Bulding custom_rock_sg_50_5_25d.txt model

Bulding custom_rock_sg_1_5_50d.txt model

Bulding custom_rock_sg_3_5_50d.txt model

Bulding custom_rock_sg_5_5_50d.txt model

Bulding custom_rock_sg_50_5_50d.txt model

Bulding custom_rock_sg_1_5_100d.txt model

Bulding custom_rock_sg_3_5_100d.txt model

Bulding custom_rock_sg_5_5_100d.txt model

Bulding custom_rock_sg_50_5_100d.txt model

Bulding custom_rock_sg_1_5_300d.txt model

Bulding custom_rock_sg_3_5_300d.txt model

Bulding custom_rock_sg_5_5_300d.txt model

Bulding custom_rock_sg_50_5_300d.txt model

Bulding custom_rock_sg_1_8_25d.txt model

Bulding custom_rock_sg_3_8_25d.txt model

Bulding custom_rock_sg_5_8_25d.txt model

Bulding custom_rock_sg_50_8_25d.txt model

Bulding custom_rock_sg_1_8_50d.txt model

Bulding custom_rock_sg_3_8_50d.txt model

Bulding custom_rock_sg_5_8_50d.txt model

Bulding custom_rock_s

Miscellaneous

In [None]:
# Split every sentence on whitespace and collect the token lists.
# NOTE(review): `sentences` is not defined in any cell visible in this notebook,
# and `progress_apply` presumably needs `tqdm.pandas()` to be called first —
# confirm both before running this cell.
train_sentences = list(sentences.progress_apply(str.split).values)

In [None]:
# Size of the vocabulary of the custom word embedding.
# NOTE(review): after the grid-search loop above, `model` is bound to an
# architecture name (a string), so `model.wv` would fail; presumably this cell
# expects a trained gensim model — confirm where it should come from.

len(model.wv.vocab.keys())

In [None]:
# Find words with similar meaning to 'iphone'
# NOTE(review): same caveat as the vocabulary cell — `model` must be a trained
# gensim model here, not the loop variable; the query word also looks like a
# leftover from another corpus — confirm.
model.wv.most_similar('iphone')

In [None]:
# Export the word vectors in word2vec text format.
# NOTE(review): `language_model` is only assigned inside build_model in the
# visible cells, so it is not available at notebook level; the "glove" file
# name also does not match any model trained above — confirm intent.
language_model.wv.save_word2vec_format('custom_glove_100d.txt')