In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from data_preprocessing import preprocess_dataset, load_config
from model import Perceptron

In [None]:
# Load the project configuration, then read the preprocessed dataset from the
# CSV path stored under config["paths"]["preprocessed_data"].
config = load_config()
preprocessed_path = config["paths"]["preprocessed_data"]
df = pd.read_csv(preprocessed_path)

# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Count how many rows carry each distinct value of the "target" column.
target_counts = df["target"].value_counts()
target_counts

In [None]:
# Inspect five random rows. The draw is seeded with the configured seed (the
# same one the train/test split uses) so the rows shown are identical after a
# Restart & Run All instead of changing on every execution.
df.sample(5, random_state=config["model"]["random_seed"])

In [None]:
# Bag-of-words features: cap the vocabulary at 5000 terms and drop
# scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(max_features=5000, stop_words="english")

# pandas reads empty CSV fields back as NaN, and CountVectorizer rejects NaN
# documents. Replace them with empty strings, which simply produce all-zero
# feature rows.
texts = df["text"].fillna("").values

# NOTE: .toarray() materializes a dense (n_documents, n_features) matrix.
X = vectorizer.fit_transform(texts).toarray()
y = df["target"].values

In [None]:
# Hold out 20% of the samples for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the seed is read from the config.
split_seed = config["model"]["random_seed"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=split_seed
)

In [None]:
# Instantiate the project's Perceptron (imported from the local `model`
# module) with its default constructor arguments, then fit it on the
# training split produced by train_test_split.
model = Perceptron()
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and compute the evaluation metrics.
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# classification_report returns a pre-formatted multi-line string. Showing it
# as part of a tuple renders its repr (literal "\n" escapes) and makes the
# table unreadable, so each result is printed on its own instead.
print(f"Accuracy: {acc:.4f}")
print(cm)
print(report)

In [None]:
# Project the test features onto their first two principal components so the
# predictions can be drawn on a plane. For large inputs scikit-learn may pick
# a randomized SVD solver, so the configured seed is passed to keep the
# projection reproducible across runs.
pca = PCA(n_components=2, random_state=config["model"]["random_seed"])
X_reduced = pca.fit_transform(X_test)

plt.figure(figsize=(7,5))
# Points are colored by the predicted label (not the true label).
scatter = plt.scatter(X_reduced[:,0], X_reduced[:,1], c=y_pred, cmap="coolwarm", alpha=0.6)
plt.colorbar(scatter, label="Predicción")
plt.xlabel("Componente principal 1")
plt.ylabel("Componente principal 2")
plt.title("Predicciones del Perceptrón en 2D (PCA)")
plt.show()

In [None]:
def top_words(weights, vocab, k=15):
    """
    Return the words with the most positive and the most negative weights.

    Parameters
    ----------
    weights : array-like
        Weights to rank; assumed 1-D with ``weights[i]`` belonging to
        ``vocab[i]``.
    vocab : array-like of str
        Vocabulary terms, indexed in the same order as ``weights``.
    k : int, default 15
        Number of words to return on each side. If ``k`` exceeds the number
        of weights, every word is returned on both sides.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(pos_words, neg_words)``: ``pos_words`` holds the ``k`` words with
        the largest weights, most positive first; ``neg_words`` holds the
        ``k`` words with the smallest weights, most negative first.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative")

    # Accept plain lists as well as arrays; fancy indexing below needs ndarrays.
    weights = np.asarray(weights)
    vocab = np.asarray(vocab)

    sorted_idx = np.argsort(weights)
    neg_idx = sorted_idx[:k]
    # Reverse before slicing rather than using sorted_idx[-k:]: for k == 0 the
    # latter is sorted_idx[0:], i.e. the whole vocabulary, and it would also
    # list the top words weakest-first, unlike the negative side.
    pos_idx = sorted_idx[::-1][:k]
    return vocab[pos_idx], vocab[neg_idx]

# Fetch the fitted vectorizer's feature names and rank them by model.w
# (presumably the Perceptron's weight vector; confirm against model.py).
vocab = vectorizer.get_feature_names_out()
pos_words, neg_words = top_words(weights=model.w, vocab=vocab)

(pos_words, neg_words)

In [None]:
# Bar chart of the first 50 entries of model.w, one bar per entry.
first_weights = model.w[:50]
plt.figure(figsize=(10, 4))
plt.bar(range(len(first_weights)), first_weights)
plt.title("Primeros 50 pesos del modelo")
plt.show()