### Embeddings with XLNet and FastText

In [None]:
#importando bibliotecas
import pandas as pd
import numpy as np
import time #monitorar tempo
import psutil#monitorar cpu
from GPUtil import GPUtil #monitorar GPU
import transformers
import torch
from transformers import XLNetTokenizer, XLNetModel
import numpy as np
import csv
import fasttext
from huggingface_hub import hf_hub_download

### Converting to Embeddings

#### XLNET

In [None]:

# Select the compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Usando o dispositivo:", device)

# Load the pretrained base XLNet checkpoint and move the model to the device
xlnet_model = XLNetModel.from_pretrained('xlnet-base-cased', from_tf=False)
xlnet_model = xlnet_model.to(device)
# Tokenizer for the same checkpoint
xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')


# Alternative: load the fine-tuned model and tokenizer instead of the base checkpoint
#xlnet_model = XLNetModel.from_pretrained('xlnet_finetuned_user_stories').to(device)
#xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet_finetuned_user_stories')

def split_text(text, tokenizer, max_length=512):
    """Tokenize ``text`` and split its token ids into model-sized segments.

    The ids are produced WITHOUT special tokens: the caller re-encodes every
    segment with ``add_special_tokens=True``, so encoding them here as well
    would leave a duplicated set of special tokens in the model input. Each
    segment is capped at ``max_length`` minus the number of special tokens the
    tokenizer adds to a single sequence, so the final sequence stays within
    ``max_length``.

    Args:
        text: The text to tokenize.
        tokenizer: A tokenizer exposing ``encode`` and
            ``num_special_tokens_to_add``.
        max_length: Maximum length of a segment once special tokens are added.

    Returns:
        A list of token-id lists, in text order. It always contains at least
        one non-empty list.

    Raises:
        ValueError: If ``max_length`` leaves no room for content tokens.
    """
    reserved = tokenizer.num_special_tokens_to_add(pair=False)
    budget = max_length - reserved
    if budget < 1:
        raise ValueError(
            f"max_length={max_length} leaves no room for content tokens "
            f"({reserved} special tokens are added per segment)"
        )

    token_ids = tokenizer.encode(text, add_special_tokens=False)
    if not token_ids:
        # Blank text has no content ids. Keep the previous result for this
        # case (the tokenizer's default encoding) so the caller still gets a
        # non-empty id list.
        # NOTE(review): an empty list is presumably rejected by encode_plus
        # on slow tokenizers - confirm before simplifying this branch.
        return [tokenizer.encode(text)]

    # A text that fits in the budget yields exactly one segment.
    return [
        token_ids[start:start + budget]
        for start in range(0, len(token_ids), budget)
    ]

def get_text_embedding(text, model, tokenizer, max_length=512):
    """Return one embedding vector for ``text`` from the given model.

    The text is cut into token-id segments by ``split_text``. Each segment is
    run through ``model`` on the module-level ``device``, and the last hidden
    layer's vector at sequence position 0 is taken as the segment embedding.
    The segment embeddings are averaged element-wise.

    Args:
        text: The text to embed.
        model: Model called with the encoded inputs and
            ``output_hidden_states=True``.
        tokenizer: Tokenizer used both for splitting and for encoding segments.
        max_length: Segment size passed through to ``split_text``.

    Returns:
        A NumPy array: the mean of the per-segment embeddings.
    """
    def _embed_segment(token_ids):
        # Encode one segment and move the resulting tensors to the device.
        encoded = tokenizer.encode_plus(
            token_ids, add_special_tokens=True, return_tensors='pt'
        ).to(device)
        with torch.no_grad():
            result = model(**encoded, output_hidden_states=True)
            last_layer = result.hidden_states[-1]
            # Vector of the first token of the only sequence in the batch.
            # NOTE(review): XLNet's tokenizer is documented to place <cls> at
            # the END of the sequence, so position 0 is presumably the first
            # content token, not the summary token - confirm this is intended.
            return last_layer[0, 0, :].detach().cpu().numpy()

    vectors = [
        _embed_segment(ids) for ids in split_text(text, tokenizer, max_length)
    ]
    # Combine the segments with an element-wise mean.
    return np.mean(vectors, axis=0)


# Input and output locations
input_file = 'input.txt'
output_file = 'output_xlnet.csv'

# Embed the input line by line; every line that embeds successfully becomes one CSV row
with open(output_file, 'w', newline='', encoding='utf-8') as f_out:
    with open(input_file, 'r', encoding='utf-8') as f_in:
        writer = csv.writer(f_out)
        for line in f_in:
            # Trim the line, then keep only alphanumeric and whitespace characters
            text = ''.join(
                ch for ch in line.strip() if ch.isalnum() or ch.isspace()
            )
            try:
                embedding = get_text_embedding(text, xlnet_model, xlnet_tokenizer)
                writer.writerow(embedding)
            except Exception as e:
                # Best effort: report the failure and move on; no row is written for this line
                print(f"Erro ao processar a linha: {text}. Erro: {e}")

print("Conversão completa! Os embeddings foram salvos em", output_file)


#### FastText

In [None]:
# Generate embeddings only for the rows that are needed


# Download the FastText vectors file from the Hugging Face Hub and load it.
# The Portuguese vectors are kept below as a commented-out alternative.
#model_path = hf_hub_download(repo_id="facebook/fasttext-pt-vectors", filename="model.bin")
model_path = hf_hub_download(
    repo_id="facebook/fasttext-en-vectors",
    filename="model.bin",
)
ft_model = fasttext.load_model(model_path)

# Build a single vector for a text from the vectors of its words
def get_text_embedding(text, model, embedding_dim):
    """Return the element-wise sum of the word vectors of ``text``.

    The text is split on whitespace and ``model.get_word_vector`` is called
    for every resulting word. The vectors are summed (not averaged) into a
    zero-initialised array, so a text without words yields all zeros.

    Note: this definition reuses the name of the XLNet helper defined earlier
    in the file and replaces it once this code has run.

    Args:
        text: The text to embed.
        model: Object exposing ``get_word_vector(word)``.
        embedding_dim: Length of the vectors returned by the model.

    Returns:
        A NumPy array of shape ``(embedding_dim,)``.
    """
    total = np.zeros((embedding_dim,))
    for token in text.split():
        # In-place accumulation into the running sum.
        np.add(total, model.get_word_vector(token), out=total)
    return total
def gerarEmbeddings(input_file, output_file='output_fasttext.csv', max_rows=None):
    """Write one FastText embedding per input line to a CSV file.

    Lines are read in order, stripped, and embedded with the module-level
    ``get_text_embedding`` and ``ft_model``. Each embedding is written as one
    CSV row. Processing stops after ``max_rows`` lines or at end of file,
    whichever comes first.

    Args:
        input_file: Path of a UTF-8 text file with one text per line.
        output_file: Path of the CSV file to create or overwrite.
        max_rows: Maximum number of lines to embed. When ``None`` the limit is
            ``len(labels)``, where ``labels`` is a module-level name that is
            not defined in the visible part of this file; it must exist
            before the call. Negative values are treated as 0.
    """
    from itertools import islice

    embedding_dim = ft_model.get_dimension()
    if max_rows is None:
        # Original behaviour: embed only as many rows as there are labels.
        max_rows = len(labels)
    # A negative limit means "no rows"; islice itself rejects negative stops.
    max_rows = max(max_rows, 0)

    with open(output_file, 'w', newline='', encoding='utf-8') as f_out, \
            open(input_file, 'r', encoding='utf-8') as f_in:
        writer = csv.writer(f_out)
        # islice stops after max_rows lines without loading the whole file.
        for line in islice(f_in, max_rows):
            writer.writerow(
                get_text_embedding(line.strip(), ft_model, embedding_dim)
            )

    print("Conversão completa! Os embeddings foram salvos em", output_file)


# Run the FastText pass over the same input file used for XLNet
gerarEmbeddings(input_file)