In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Lift pandas' display limits so column contents, rows and columns are shown in full.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# Read the three splits (train, evaluation, test — in that order) with identical
# parsing options: ';' as separator and latin1 encoding.
# NOTE(review): the '' dtype key presumably targets the unnamed index column;
# confirm it actually matches a column name in the parsed frame.
train, val, test = (
    pd.read_csv(
        ruta,
        sep=';',
        encoding='latin1',
        dtype={'': 'int', 'title': 'string', 'text': 'string', 'label': 'int'},
    )
    for ruta in ('train.csv', 'evaluation.csv', 'test.csv')
)


Función de preprocesado (sin mezclar splits)

In [9]:
def recortar_texto(texto, limite=1000):
    """Return `texto` cut down to at most `limite` whitespace-separated words.

    The input is converted with ``str()`` first, then split on any run of
    whitespace, so the result always has its words joined by single spaces
    (even when nothing is cut).

    Parameters
    ----------
    texto : object
        Value to shorten; anything accepted by ``str()``.
    limite : int, default 1000
        Maximum number of words to keep.

    Returns
    -------
    str
        The first `limite` words joined by single spaces, or all words when
        there are no more than `limite` of them.
    """
    tokens = str(texto).split()
    conservados = tokens[:limite] if len(tokens) > limite else tokens
    return " ".join(conservados)

def preprocesar(df, min_palabras=100, max_palabras=1000):
    """Clean one data split on its own, without looking at any other split.

    Steps, in order:
      1. Drop the 'Unnamed: 0' column when it is present.
      2. Drop rows whose 'text' repeats an earlier row (the first is kept).
      3. Keep only rows whose 'text' has at least `min_palabras`
         whitespace-separated words (values go through ``str()`` before
         being counted).
      4. Truncate each remaining 'text' to its first `max_palabras` words
         using `recortar_texto`.

    Parameters
    ----------
    df : pandas.DataFrame
        Split to clean; must contain a 'text' column.
    min_palabras : int, default 100
        Minimum word count a text needs in order to be kept.
    max_palabras : int, default 1000
        Maximum number of words kept per text.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; the frame passed in is not
        modified.

    Raises
    ------
    KeyError
        If `df` has no 'text' column.
    """
    # 1) Drop the leftover index column, if any
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns=['Unnamed: 0'])

    # 2) Remove duplicated texts within this split only
    df = df.drop_duplicates(subset='text', keep='first')

    # 3) Filter out texts that are too short
    num_palabras = df["text"].apply(lambda x: len(str(x).split()))
    df = df[num_palabras >= min_palabras].reset_index(drop=True)

    # 4) Truncate long texts; `assign` builds a new frame instead of writing
    #    into a frame derived from a filter.
    return df.assign(
        text=df["text"].apply(lambda t: recortar_texto(t, limite=max_palabras))
    )


In [10]:
# Drop the row labelled 0 from the training split, renumber it, then clean
# every split independently with `preprocesar`.
# NOTE(review): `train` is rebound here, so re-running this cell appears to
# remove one more training row each time — re-run from the loading cell instead.
train, val, test = map(
    preprocesar,
    (train.drop(index=0).reset_index(drop=True), val, test),
)

print(*(particion.shape for particion in (train, val, test)))


(21183, 3) (7030, 3) (7028, 3)


# Modelado de un algoritmo de clasificación

Separar características y etiquetas

In [11]:
# For each split: features are every column except 'label'; the target is 'label'.
X_train, y_train = train.drop(columns='label'), train['label']
X_val, y_val = val.drop(columns='label'), val['label']
X_test, y_test = test.drop(columns='label'), test['label']

# Sanity-check the resulting shapes
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_val:", X_val.shape, "y_val:", y_val.shape)
print("X_test:", X_test.shape, "y_test:", y_test.shape)


X_train: (21183, 2) y_train: (21183,)
X_val: (7030, 2) y_val: (7030,)
X_test: (7028, 2) y_test: (7028,)


In [12]:
# Only the 'text' column is used as model input
X_train_text, X_val_text, X_test_text = X_train["text"], X_val["text"], X_test["text"]

# TF-IDF vectorizer: fitted on the training texts only; validation and test
# texts are merely transformed with the fitted vocabulary.
vectorizer = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1, 2),
    max_features=20000,
)
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_val_tfidf = vectorizer.transform(X_val_text)
X_test_tfidf = vectorizer.transform(X_test_text)

# Model (fit returns the fitted estimator itself)
clf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
).fit(X_train_tfidf, y_train)

# Evaluation on the validation split
y_val_pred = clf.predict(X_val_tfidf)
print("=== Validación ===")
print("Accuracy val:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))

# Evaluation on the test split
y_test_pred = clf.predict(X_test_tfidf)
print("=== Test ===")
print("Accuracy test:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))


=== Validación ===
Accuracy val: 0.9806543385490754
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      3456
           1       0.99      0.97      0.98      3574

    accuracy                           0.98      7030
   macro avg       0.98      0.98      0.98      7030
weighted avg       0.98      0.98      0.98      7030

=== Test ===
Accuracy test: 0.984490608992601
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3434
           1       0.99      0.98      0.98      3594

    accuracy                           0.98      7028
   macro avg       0.98      0.98      0.98      7028
weighted avg       0.98      0.98      0.98      7028

