In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Empezamos con 5000 datos
dataset = load_dataset("ag_news", split="train[:5000]")

# Ejemplo
print("Ejemplo de datos cargados:")
print(dataset[0])

Ejemplo de datos cargados:
{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}


In [3]:
# Estadísticas de los datos
labels = dataset.features["label"].names
print("\nEtiquetas de los datos:", labels)


Etiquetas de los datos: ['World', 'Sports', 'Business', 'Sci/Tech']


In [4]:
# Conteo de cada etiqueta
from collections import Counter
counts = Counter([labels[x["label"]] for x in dataset])
print("\nConteo de cada etiqueta:", counts) 


Conteo de cada etiqueta: Counter({'Sci/Tech': 1497, 'Business': 1236, 'World': 1235, 'Sports': 1032})


## Preprocesamiento
---

In [5]:
import nltk
import re
from nltk.corpus import stopwords

In [6]:
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\guisa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words]
    return tokens

In [8]:
# Preprocesamos los datos
text = [preprocess(example["text"]) for example in dataset]

print("\nEjemplo de texto preprocesado:")
print(text[0])


Ejemplo de texto preprocesado:
['wall', 'st', 'bears', 'claw', 'back', 'black', 'reuters', 'reuters', 'shortsellers', 'wall', 'streets', 'dwindlingband', 'ultracynics', 'seeing', 'green']


## TF-IDF
---

[ ] Code provisional TODO

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# Convertimos los token preprocesados a texto limpio
clean_text = [" ".join(t) for t in text]

# Vectorizacion TF-IDF
vectorizer = TfidfVectorizer(max_features=3000)
X_tfidf = vectorizer.fit_transform(clean_text)

print("\nDimensiones de la matriz TF-IDF:", X_tfidf.shape)
print("Primeros 10 terminos del vocabulario:")
print(list(vectorizer.vocabulary_.keys())[:10])


Dimensiones de la matriz TF-IDF: (5000, 3000)
Primeros 10 terminos del vocabulario:
['wall', 'st', 'back', 'black', 'reuters', 'streets', 'seeing', 'green', 'looks', 'toward']


### Visualizción términos más frecuentes

In [11]:
import numpy as np
import pandas as pd

# Calcular TF-IDF promedio por término
mean_tfidf = np.asarray(X_tfidf.mean(axis=0)).flatten()
terms = vectorizer.get_feature_names_out()

# Crear tabla y ordenar
df_tfidf = pd.DataFrame({"term": terms, "avg_tfidf": mean_tfidf})
top_terms = df_tfidf.sort_values(by="avg_tfidf", ascending=False).head(20)
print(top_terms)

           term  avg_tfidf
2207    reuters   0.023289
128          ap   0.022766
2815         us   0.020208
1739        new   0.019059
2271       said   0.015827
1085     google   0.012393
177      athens   0.011755
2767    tuesday   0.011667
1786        oil   0.011651
969       first   0.010992
2907  wednesday   0.010756
1256        inc   0.009802
530     company   0.009507
2004     prices   0.009407
2782        two   0.009384
2698   thursday   0.009139
1790    olympic   0.009083
2992       york   0.008184
1078       gold   0.008090
2964      world   0.008046


### Matriz de coocurrencias y PMI


In [12]:
from collections import Counter, defaultdict
import itertools

- Creamos la matriz de coocurrencisa para hallar aquellas palabras que aparezcan juntas en un mismo contexto.

In [13]:
window_size = 4
cooccurrence_counts = defaultdict(Counter)

for tokens in text:
    for i, token in enumerate(tokens):
        window = tokens[max(i - window_size, 0): i] + tokens[i+1: i+1+window_size]
        for neighbor in window:
            cooccurrence_counts[token][neighbor] += 1

In [None]:
import math

# Conteos totales
word_counts = Counter(itertools.chain(*text))
total_words = sum(word_counts.values())

def compute_pmi(word, context):
    pw = word_counts[word] / total_words
    pc = word_counts[context] / total_words
    pwc = cooccurrence_counts[word][context] / total_words
    if pwc == 0:
        return 0
    return math.log2(pwc / (pw * pc))

# Tabla PMI ejemplo
sample_words = ["president", "game", "company", "computer"]
for word in sample_words:
    print(f"\nPalabras relacionadas con '{word}':")
    related = [(w, compute_pmi(word, w)) for w in cooccurrence_counts[word]]
    related = sorted(related, key=lambda x: -x[1])[:10]
    for neighbor, score in related:
        print(f"  {neighbor}: {score:.2f}")



Palabras relacionadas con 'president':
  gleam: 9.57
  unitedstatesi: 9.57
  unelectable: 9.57
  corporationand: 9.57
  nominees: 9.57
  anexecutive: 9.57
  ingrained: 9.57
  distrustof: 9.57
  concentrated: 9.57
  referendumthat: 9.57

Palabras relacionadas con 'game':
  ballpark: 11.20
  whotold: 10.20
  microgames: 10.20
  catwoman: 10.20
  mosquitos: 10.20
  gamecopying: 10.20
  fiftyfive: 10.20
  dislocating: 10.20
  forthcoming: 10.20
  convergence: 10.20

Palabras relacionadas con 'company':
  exemployees: 9.50
  marlboroughbased: 9.50
  ecm: 9.50
  anoil: 9.50
  picnic: 9.50
  sidestepped: 8.50
  shuts: 8.50
  clockwork: 8.50
  wrests: 8.50
  defunct: 8.50

Palabras relacionadas con 'computer':
  unfortunately: 11.19
  makerdell: 10.19
  locks: 10.19
  toggle: 10.19
  problemsolving: 10.19
  teambuilding: 10.19
  treats: 10.19
  contamination: 10.19
  resellers: 10.19
  interrupted: 10.19
