In [1]:
import pandas as pd
from pathlib import Path
import os
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
import utils
from sentence_transformers import SentenceTransformer
import torch
warnings.filterwarnings(action='ignore', category=UserWarning)

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the cleaned train/test splits, keeping only the text fields, the label
# and the variables that show a difference between individuals who present
# stress and those who do not.
data_path = Path.cwd().parent / "data"
silver_path = data_path / "silver"
gold_path = data_path / "gold"

# Same column subset for both splits.
selected_columns = ["text", "clean_text", "avg_word_len", "lex_diversity", "label"]

df = pd.read_csv(silver_path / "dreadditCleanTrain.csv", usecols=selected_columns)
df_test = pd.read_csv(silver_path / "dreadditCleanTest.csv", usecols=selected_columns)

In [3]:
# Load the spaCy pipeline.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

In [None]:
# Store all the texts in a list (disabled).
#texts = df['clean_text'].tolist()

# Process the texts in batches with the spaCy pipeline (disabled).
#docs = nlp.pipe(texts)

In [4]:
# Manual (hand-crafted) features plus the label, for both splits.
manual_columns = ["avg_word_len", "lex_diversity", "label"]

df_manual = df.loc[:, manual_columns]
# Train export is currently disabled:
#df_manual.to_csv(os.path.join(gold_path,"manual_features_train.csv"),index=False)

df_test_manual = df_test.loc[:, manual_columns]
df_test_manual.to_csv(gold_path / "manual_features_test.csv", index=False)

In [16]:
# TF-IDF features.
# Uni- to tri-grams with sublinear term frequency and English stop words
# removed; min_df=5 drops terms found in fewer than 5 documents.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 3),
                        stop_words='english')

# Fit on the training texts and vectorise them in a single pass. This yields
# the same matrix as fit() followed by transform() without tokenising the
# corpus twice; `tfidf` is left fitted for the test split below.
features = tfidf.fit_transform(df["clean_text"]).toarray()
df_tfidf = pd.DataFrame(features)
# Attach labels by position so the result does not depend on df's index
# matching the fresh 0..n-1 index of the feature frame.
df_tfidf["label"] = df["label"].to_numpy()

# Train export is currently disabled:
#df_tfidf.to_csv(os.path.join(gold_path,"tfidf_features_train.csv"),index=False)

# Vectorise the test texts with the vocabulary fitted on the training split.
features_test = tfidf.transform(df_test["clean_text"]).toarray()
df_test_tfidf = pd.DataFrame(features_test)
df_test_tfidf["label"] = df_test["label"].to_numpy()

df_test_tfidf.to_csv(os.path.join(gold_path, "tfidf_features_test.csv"), index=False)

In [6]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)


In [7]:
# Sentence-embedding features from all-mpnet-base-v2.

# Load the model and move it to the selected device.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
model = model.to(device)

# Generate the embeddings for the training texts. encode() is documented to
# take a str or a list of str, so a plain list is passed instead of a pandas
# Series (whose integer indexing is label-based rather than positional).
embeddings = model.encode(df["text"].tolist())
df_mpnet = pd.DataFrame(embeddings)
# Attach labels by position so the result does not depend on df's index
# matching the fresh 0..n-1 index of the embedding frame.
df_mpnet["label"] = df["label"].to_numpy()

# Train export is currently disabled:
#df_mpnet.to_csv(os.path.join(gold_path,"mpnet_features_train.csv"),index=False)

# Same procedure for the test split.
embeddings_test = model.encode(df_test["text"].tolist())
df_test_mpnet = pd.DataFrame(embeddings_test)
df_test_mpnet["label"] = df_test["label"].to_numpy()

df_test_mpnet.to_csv(os.path.join(gold_path, "mpnet_features_test.csv"), index=False)



In [8]:
# Sentence-embedding features from all-distilroberta-v1.

# Load the model and move it to the selected device.
model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')
model = model.to(device)

# Generate the embeddings for the training texts. encode() is documented to
# take a str or a list of str, so a plain list is passed instead of a pandas
# Series (whose integer indexing is label-based rather than positional).
embeddings = model.encode(df["text"].tolist())
df_distilroberta = pd.DataFrame(embeddings)
# Attach labels by position so the result does not depend on df's index
# matching the fresh 0..n-1 index of the embedding frame.
df_distilroberta["label"] = df["label"].to_numpy()

# Train export is currently disabled:
#df_distilroberta.to_csv(os.path.join(gold_path,"distilroberta_features_train.csv"),index=False)

# Same procedure for the test split.
embeddings_test = model.encode(df_test["text"].tolist())
df_test_distilroberta = pd.DataFrame(embeddings_test)
df_test_distilroberta["label"] = df_test["label"].to_numpy()

df_test_distilroberta.to_csv(os.path.join(gold_path, "distilroberta_features_test.csv"), index=False)

