In [2]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Empezamos con 5000 datos
dataset = load_dataset("ag_news", split="train[:5000]")

# Ejemplo
print("Ejemplo de datos cargados:")
print(dataset[0])

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 120000/120000 [00:00<00:00, 656223.00 examples/s]
Generating test split: 100%|██████████| 7600/7600 [00:00<?, ? examples/s]

Ejemplo de datos cargados:
{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}





In [None]:
# Estadísticas de los datos
labels = dataset.features["label"].names
print("\nEtiquetas de los datos:", labels)


Etiquetas de los datos: ['World', 'Sports', 'Business', 'Sci/Tech']


In [6]:
# Conteo de cada etiqueta
from collections import Counter
counts = Counter([labels[x["label"]] for x in dataset])
print("\nConteo de cada etiqueta:", counts) 


Conteo de cada etiqueta: Counter({'Sci/Tech': 1497, 'Business': 1236, 'World': 1235, 'Sports': 1032})


## Preprocesamiento
---

In [7]:
import nltk
import re
from nltk.corpus import stopwords

In [8]:
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\guisa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [9]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words]
    return tokens

In [10]:
# Preprocesamos los datos
text = [preprocess(example["text"]) for example in dataset]

print("\nEjemplo de texto preprocesado:")
print(text[0])


Ejemplo de texto preprocesado:
['wall', 'st', 'bears', 'claw', 'back', 'black', 'reuters', 'reuters', 'shortsellers', 'wall', 'streets', 'dwindlingband', 'ultracynics', 'seeing', 'green']


## TF-IDF
---

[ ] Code provisional TODO

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# Convertimos los token preprocesados a texto limpio
clean_text = [" ".join(t) for t in text]

# Vectorizacion TF-IDF
vectorizer = TfidfVectorizer(max_features=3000)
X_tfidf = vectorizer.fit_transform(clean_text)

print("\nDimensiones de la matriz TF-IDF:", X_tfidf.shape)
print("Primeros 10 terminos del vocabulario:")
print(list(vectorizer.vocabulary_.keys())[:10])


Dimensiones de la matriz TF-IDF: (5000, 3000)
Primeros 10 terminos del vocabulario:
['wall', 'st', 'back', 'black', 'reuters', 'streets', 'seeing', 'green', 'looks', 'toward']


### Visualizción términos más frecuentes

In [18]:
import numpy as np
import pandas as pd

# Calcular TF-IDF promedio por término
mean_tfidf = np.asarray(X_tfidf.mean(axis=0)).flatten()
terms = vectorizer.get_feature_names_out()

# Crear tabla y ordenar
df_tfidf = pd.DataFrame({"term": terms, "avg_tfidf": mean_tfidf})
top_terms = df_tfidf.sort_values(by="avg_tfidf", ascending=False).head(20)
print(top_terms)

           term  avg_tfidf
2207    reuters   0.023289
128          ap   0.022766
2815         us   0.020208
1739        new   0.019059
2271       said   0.015827
1085     google   0.012393
177      athens   0.011755
2767    tuesday   0.011667
1786        oil   0.011651
969       first   0.010992
2907  wednesday   0.010756
1256        inc   0.009802
530     company   0.009507
2004     prices   0.009407
2782        two   0.009384
2698   thursday   0.009139
1790    olympic   0.009083
2992       york   0.008184
1078       gold   0.008090
2964      world   0.008046
