In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    roc_auc_score,
    accuracy_score
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

import matplotlib.pyplot as plt

# Seed NumPy's global RNG so NumPy-based randomness further down is repeatable.
np.random.seed(42)

# Read the three CSV inputs in one pass; targets are unpacked in the same order
# as the file names.
entrenamiento, testeo, muestra = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [47]:
import re
import string
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK resources this notebook relies on (tokenizer data and the
# stop-word corpus), in the same order as before.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

def preprocess_keyword(keyword):
    """Normalize a keyword value.

    For strings: every ``%20`` is replaced by ``_``, surrounding whitespace is
    stripped and the result is lower-cased. Any non-string value (for example
    a missing value) is returned unchanged.
    """
    if not isinstance(keyword, str):
        return keyword
    return keyword.replace('%20', '_').strip().lower()

def preprocess_location(location):
    """Replace every ``%20`` in a location string with ``_``.

    Case and surrounding whitespace are left as they are. Non-string values
    are returned unchanged.
    """
    return location.replace('%20', '_') if isinstance(location, str) else location

def preprocess_text(text):
    """Turn a raw text into a list of cleaned tokens.

    Steps, in order: lower-case the text, remove URLs, replace ``!`` and ``?``
    with the marker words ``EXCLAMATION`` / ``QUESTION``, strip the remaining
    punctuation, tokenize with NLTK and drop English stop words.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (``None``, the ``NaN`` pandas yields for
        a missing cell, ...) is treated as empty text instead of raising.

    Returns
    -------
    list of str
        The remaining tokens; an empty list for non-string input.
    """
    # Missing values arrive as float NaN / None, which have no `.lower()`.
    if not isinstance(text, str):
        return []

    text = text.lower()

    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # The markers are inserted after lower-casing, so they stay upper-case.
    text = text.replace('!', ' EXCLAMATION ')
    text = text.replace('?', ' QUESTION ')

    text = text.translate(str.maketrans('', '', string.punctuation))

    tokens = nltk.word_tokenize(text)

    # The stop-word set never changes between calls: build it once and keep it
    # on the function object instead of re-reading the corpus for every row.
    stop_words = getattr(preprocess_text, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        preprocess_text._stop_words = stop_words

    return [w for w in tokens if w not in stop_words]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Split entre entrenamiento y validación


In [48]:
# Separate the label column from the features.
y = entrenamiento['target']
X = entrenamiento.drop(columns=['target'])

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Work on independent copies so the column assignments below do not touch `X`.
X_train, X_validation = X_train.copy(), X_validation.copy()

# Apply the same column preparation to both partitions.
for split_frame in (X_train, X_validation):
    # Keep the untouched text (as str) before `text` is replaced by token lists.
    split_frame['text_orig'] = split_frame['text'].astype(str)
    split_frame['text'] = split_frame['text'].apply(preprocess_text)
    split_frame['location'] = split_frame['location'].apply(preprocess_location)
    split_frame['keyword'] = split_frame['keyword'].apply(preprocess_keyword)
    # Space-joined tokens, the string form consumed by the TF-IDF vectorizer.
    split_frame['text_raw'] = split_frame['text'].apply(lambda toks: ' '.join(toks))

print(f"Tamaño entrenamiento: {X_train.shape}")
print(f"Tamaño validación: {X_validation.shape}")

Tamaño entrenamiento: (6090, 6)
Tamaño validación: (1523, 6)


In [49]:
# Standardize the features, then classify with k-nearest neighbours.
knn_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

# Hyper-parameter search space. Keys use the `<step>__<param>` form so they
# reach the 'knn' step of the pipeline.
param_grid = dict(
    knn__n_neighbors=[5, 10, 15, 25, 35],
    knn__weights=['uniform', 'distance'],
    knn__metric=['minkowski'],
    knn__p=[1, 2],  # Minkowski exponent: 1 = Manhattan, 2 = Euclidean
)

## Búsqueda de hiperparámetros

In [50]:
# Five shuffled, stratified folds with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over `param_grid`, scored by F1 on the folds above;
# n_jobs=-1 runs the fits in parallel on all available cores.
grid = GridSearchCV(knn_pipeline, param_grid, scoring='f1', cv=skf, n_jobs=-1, verbose=1)

In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


## Embedding del texto


In [51]:
import os

# Defaults that stay in effect when the pre-trained vectors cannot be loaded.
word_vectors = None
embedding_dim = 50
model_name = 'glove-twitter-50'

try:
    # Imported inside the try so that a missing gensim is reported by the
    # handler below instead of aborting the cell.
    from gensim.downloader import load
    word_vectors = load(model_name)
    # Take the dimensionality from the loaded model rather than the default above.
    embedding_dim = word_vectors.vector_size
    print(f"Embeddings '{model_name}' cargados ({embedding_dim} dimensiones)")
except Exception as exc:
    # Best-effort fallback: with word_vectors set to None, text_to_embedding
    # returns an all-zero vector for every text.
    # NOTE(review): a model fitted on all-zero features carries no signal, so
    # check this message before trusting downstream scores.
    print(f"No se pudieron cargar embeddings pre-entrenados: {exc}")
    word_vectors = None

def text_to_embedding(tokens, word_vectors, embedding_dim):
    """Average the vectors of the tokens found in ``word_vectors``.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one text.
    word_vectors : mapping-like or None
        Object supporting ``token in word_vectors`` and ``word_vectors[token]``.
        ``None`` means no embeddings are available.
    embedding_dim : int
        Length of the zero vector used as fallback.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the known tokens' vectors, or
        ``np.zeros(embedding_dim)`` when ``word_vectors`` is ``None`` or no
        token is known.
    """
    if word_vectors is not None:
        known_vectors = [word_vectors[tok] for tok in tokens if tok in word_vectors]
        if known_vectors:
            return np.mean(known_vectors, axis=0)
    return np.zeros(embedding_dim)

# One averaged embedding per row, stacked into a (n_rows, embedding_dim) matrix
# for each partition.
X_train_embeddings, X_validation_embeddings = (
    np.vstack([
        text_to_embedding(token_list, word_vectors, embedding_dim)
        for token_list in split_frame['text']
    ])
    for split_frame in (X_train, X_validation)
)

print(f"Embeddings entrenamiento: {X_train_embeddings.shape}")

Embeddings 'glove-twitter-50' cargados (50 dimensiones)
Embeddings entrenamiento: (6090, 50)


In [52]:
# TF-IDF over unigrams and bigrams; terms appearing in fewer than 2 documents
# or in more than 90% of them are ignored, with no cap on vocabulary size.
tfidf = TfidfVectorizer(
    max_features=None,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.90
)
# Reduce the sparse TF-IDF matrix to 200 dense components (seeded).
svd = TruncatedSVD(n_components=200, random_state=42)

# Fit on the training split only; the validation split is only transformed, so
# no vocabulary or IDF statistics come from validation rows.
X_train_tfidf = tfidf.fit_transform(X_train['text_raw'])
X_validation_tfidf = tfidf.transform(X_validation['text_raw'])

# Same rule for the SVD projection: fit on train, apply to validation.
X_train_tfidf_svd = svd.fit_transform(X_train_tfidf)
X_validation_tfidf_svd = svd.transform(X_validation_tfidf)

# NOTE(review): in the cells visible here these SVD features are not stacked
# into the final feature matrices (only the embeddings are) — confirm whether
# that is intentional.
print(f"TF-IDF reducido entrenamiento: {X_train_tfidf_svd.shape}")

TF-IDF reducido entrenamiento: (6090, 200)


## Entrenamiento del modelo


In [53]:
# Final feature matrices. Only the averaged embeddings are stacked here;
# np.nan_to_num then replaces any NaN/inf entries with finite numbers.
X_train_final = np.nan_to_num(np.hstack([X_train_embeddings]))
X_validation_final = np.nan_to_num(np.hstack([X_validation_embeddings]))

print(f"Matriz final entrenamiento: {X_train_final.shape}")
print(f"Matriz final validación: {X_validation_final.shape}")

# Run the cross-validated hyper-parameter search on the training partition.
grid.fit(X_train_final, y_train)

# Report the winning configuration and its mean cross-validated F1.
print('Mejores hiperparámetros:')
print(grid.best_params_)
print(f"Mejor F1 (CV): {grid.best_score_:.4f}")

Matriz final entrenamiento: (6090, 50)
Matriz final validación: (1523, 50)
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Mejores hiperparámetros:
{'knn__metric': 'minkowski', 'knn__n_neighbors': 35, 'knn__p': 2, 'knn__weights': 'distance'}
Mejor F1 (CV): 0.7600


## Evaluación del modelo


In [54]:
from sklearn.metrics import f1_score

# Predict both partitions with the fitted search object.
y_pred_validation = grid.predict(X_validation_final)
y_pred_train = grid.predict(X_train_final)

# Compute each score once and reuse it for the gap.
f1_validation = f1_score(y_validation, y_pred_validation)
f1_train = f1_score(y_train, y_pred_train)

print(f"F1 validacion: {f1_validation:.4f}")
print(f"F1 entrenamiento: {f1_train:.4f}")
# Train-minus-validation difference; a large positive gap points to over-fitting.
print(f"Gap: {f1_train - f1_validation:.4f}")

F1 validacion: 0.7814
F1 entrenamiento: 0.9840
Gap: 0.2026


In [55]:
from sklearn.metrics import f1_score
import numpy as np

best_model = grid.best_estimator_

# Second column of predict_proba, used throughout as the positive-class score.
val_proba = best_model.predict_proba(X_validation_final)[:, 1]

# Sweep 200 evenly spaced cut-offs and record (threshold, F1) for each one.
thresholds = np.linspace(0.01, 0.99, 200)
f1_scores = list(zip(
    thresholds,
    (f1_score(y_validation, (val_proba >= thr).astype(int)) for thr in thresholds),
))

# max() keeps the first pair with the highest F1, i.e. the lowest such threshold.
best_thr, best_f1 = max(f1_scores, key=lambda pair: pair[1])

# NOTE(review): the threshold is chosen and scored on the same validation data,
# so this F1 is an optimistic estimate.
print(f"Best threshold on validation: {best_thr:.3f}")
print(f"F1 on validation with best threshold: {best_f1:.4f}")

def predict_with_threshold(model, X, thr=best_thr):
    """Return 0/1 labels by thresholding the second ``predict_proba`` column.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict_proba``.
    X : array-like
        Feature matrix passed to ``model.predict_proba``.
    thr : float
        Rows whose score is ``>= thr`` are labelled 1. The default is the value
        ``best_thr`` had when this function was defined.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 labels.
    """
    return (model.predict_proba(X)[:, 1] >= thr).astype(int)

Best threshold on validation: 0.517
F1 on validation with best threshold: 0.7808


## Predicciones en el conjunto de test

In [56]:
# Prepare the test set with the same text/keyword/location steps used for the
# training data. `assign` works on a copy and evaluates its keyword arguments
# in order, so `text_orig` is captured before `text` becomes token lists and
# `text_raw` is built from those tokens.
testeo_processed = testeo.assign(
    text_orig=lambda df: df['text'].astype(str),
    text=lambda df: df['text'].apply(preprocess_text),
    keyword=lambda df: df['keyword'].apply(preprocess_keyword),
    location=lambda df: df['location'].apply(preprocess_location),
    text_raw=lambda df: df['text'].apply(lambda toks: ' '.join(toks)),
)

# Missing keyword/location values get an explicit placeholder.
testeo_processed['keyword'] = testeo_processed['keyword'].fillna('unknown')
testeo_processed['location'] = testeo_processed['location'].fillna('unknown')

# Same feature construction as for train/validation: averaged embeddings only.
X_test_embeddings = np.vstack([
    text_to_embedding(token_list, word_vectors, embedding_dim)
    for token_list in testeo_processed['text']
])
X_test_final = np.nan_to_num(np.hstack([X_test_embeddings]))

print(f"Matriz final test: {X_test_final.shape}")

# Predict with the fitted search object (its default decision rule).
y_pred_test = grid.predict(X_test_final)

# Submission file: one row per test id with the predicted label.
submission = pd.DataFrame({'id': testeo['id'], 'target': y_pred_test})
submission.to_csv('submission_knn.csv', index=False)

print(f"Submission guardado: {len(y_pred_test)} predicciones")
print(f"Distribucion - 0: {(y_pred_test == 0).sum()}, 1: {(y_pred_test == 1).sum()}")

Matriz final test: (3263, 50)
Submission guardado: 3263 predicciones
Distribucion - 0: 1960, 1: 1303
