In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

np.random.seed(5)

entrenamiento = pd.read_csv('train.csv')
testeo = pd.read_csv('test.csv')
muestra = pd.read_csv('sample_submission.csv')

In [2]:
import re
import string
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.corpus import stopwords

def preprocess_keyword(keyword):
    if isinstance(keyword, str):
        processed_keyword = keyword.replace('%20', '_')
        return processed_keyword
    return keyword

def preprocess_location(location):
    if isinstance(location, str):
        processed_location = location.replace('%20', '_')
        return processed_location
    return location

def preprocess_text(text):
    text = text.lower()

    text = re.sub(r'http\S+|www\.\S+', ' URL ', text)

    text = text.replace('!', ' EXCLAMATION ').replace('?', ' QUESTION ')

    keep = set('#@')
    drop = ''.join(ch for ch in string.punctuation if ch not in keep)
    text = text.translate(str.maketrans('', '', drop))

    text = re.sub(r'\s+', ' ', text).strip()
    return text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Split entre entrenamiento y validación

In [3]:
entrenamiento['text'] = entrenamiento['text'].apply(preprocess_text)
entrenamiento['keyword'] = entrenamiento['keyword'].apply(preprocess_keyword)
entrenamiento['location'] = entrenamiento['location'].apply(preprocess_location)

X = entrenamiento.drop(['target'], axis=1)
y = entrenamiento['target']

X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.10, stratify=y, random_state=5)

## Búsqueda de hiperparámetros

In [4]:
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold

naive_model = ComplementNB()

param_grid = {
    'alpha': np.logspace(-3, 1, 12),
    'fit_prior': [True, False],
    'norm': [True, False]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=naive_model,
    param_grid=param_grid,
    scoring=make_scorer(f1_score),
    cv=cv,
    n_jobs=-1,
    verbose=0
)

## Creación de features numéricas

In [5]:
def create_numeric_features(df):
    df_copy = df.copy()
    t = df_copy['text'].astype(str)

    df_copy['text_length']      = t.str.len()
    df_copy['word_count']       = t.str.split().str.len()
    df_copy['hashtag_count']    = t.str.count('#')
    df_copy['mention_count']    = t.str.count('@')
    df_copy['url_count']        = t.str.count(r'\burl\b')
    df_copy['exclamation_count']= t.str.count(r'\bexclamation\b')
    df_copy['question_count']   = t.str.count(r'\bquestion\b')
    df_copy['caps_word_count']  = 0
    df_copy['unique_word_ratio']= t.apply(lambda x: len(set(x.split()))/max(1,len(x.split())))
    df_copy['avg_word_length']  = t.apply(lambda x: (sum(len(w) for w in x.split())/max(1,len(x.split()))))
    df_copy['has_numbers']      = t.str.contains(r'\d').astype(int)
    df_copy['has_time_reference']= t.str.contains(r'\b(now|today|tonight|yesterday|urgent|breaking)\b').astype(int)

    disaster_keywords = ['fire','burn','crash','kill','dead','destroy','emergency','evacuate','disaster','damage']
    df_copy['disaster_word_count'] = t.apply(lambda x: sum(kw in x for kw in disaster_keywords))

    df_copy['elongated_token'] = t.str.contains(r'(.)\1\1+').astype(int)
    return df_copy

X_train = create_numeric_features(X_train)
X_validation = create_numeric_features(X_validation)

print("Features numéricas creadas:")
print(X_train[['text_length', 'word_count', 'hashtag_count', 'exclamation_count']].head())

Features numéricas creadas:
      text_length  word_count  hashtag_count  exclamation_count
5286           77          16              0                  0
3193          121          18              3                  0
5272           72          12              1                  0
6816          111          17              0                  0
2996          112          20              0                  0


  df_copy['has_time_reference']= t.str.contains(r'\b(now|today|tonight|yesterday|urgent|breaking)\b').astype(int)
  df_copy['elongated_token'] = t.str.contains(r'(.)\1\1+').astype(int)
  df_copy['has_time_reference']= t.str.contains(r'\b(now|today|tonight|yesterday|urgent|breaking)\b').astype(int)
  df_copy['elongated_token'] = t.str.contains(r'(.)\1\1+').astype(int)


## Creación de features categóricas

In [6]:
def create_categorical_features(df):
    df_copy = df.copy()

    df_copy['keyword'] = df_copy['keyword'].fillna('no_keyword')
    df_copy['location'] = df_copy['location'].fillna('no_location')

    df_copy['has_keyword'] = (df_copy['keyword'] != 'no_keyword').astype(int)
    df_copy['has_location'] = (df_copy['location'] != 'no_location').astype(int)

    return df_copy

X_train = create_categorical_features(X_train)
X_validation = create_categorical_features(X_validation)

print("\nFeatures categóricas creadas:")
print(X_train[['keyword', 'location', 'has_keyword', 'has_location']].head())


Features categóricas creadas:
             keyword     location  has_keyword  has_location
5286        outbreak  no_location            1             0
3193  emergency_plan      Indiana            1             1
5272       oil_spill  no_location            1             0
6816         trapped      Orlando            1             1
2996      dust_storm  Lubbock, TX            1             1


## Embedding del texto

In [7]:
from sklearn.model_selection import ParameterGrid

X_train['text_str'] = X_train['text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))
X_validation['text_str'] = X_validation['text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

tfidf = TfidfVectorizer(
    max_features=None,
    min_df=2,
    max_df=0.90,
    ngram_range=(1, 3),
    sublinear_tf=True,
    lowercase=True,
    strip_accents='unicode',
    analyzer='word',
    norm='l2',
    use_idf=True,
    stop_words=None
)

char_tfidf = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 6),
    min_df=3,
    sublinear_tf=True,
    lowercase=True,
    strip_accents='unicode'
)

param_grid_nb = {
    'alpha': np.logspace(-3, 1, 12),
    'fit_prior': [True, False],
    'norm': [True, False]
}
param_grid_vec = [
    {'tfidf_stop': 'none'},
    {'tfidf_stop': 'english'}
]

selected_stop_words_option = 'none'

tfidf.stop_words = None if selected_stop_words_option == 'none' else 'english'
print(f"Using stop_words: {tfidf.stop_words}")

X_train_tfidf = tfidf.fit_transform(X_train['text_str'])
X_validation_tfidf = tfidf.transform(X_validation['text_str'])

X_train_char = char_tfidf.fit_transform(X_train['text_str'])
X_validation_char = char_tfidf.transform(X_validation['text_str'])

print(f"\nTF-IDF matrix shape: {X_train_tfidf.shape}")
print(f"Vocabulario de {len(tfidf.vocabulary_)} palabras")

Using stop_words: None

TF-IDF matrix shape: (6851, 22330)
Vocabulario de 22330 palabras


## One Hot Encoding

In [8]:
from scipy.sparse import csr_matrix

TOPK = 200
top_keywords = X_train['keyword'].value_counts().head(TOPK).index
X_train_kw = X_train['keyword'].where(X_train['keyword'].isin(top_keywords), 'OTHER')
X_val_kw   = X_validation['keyword'].where(X_validation['keyword'].isin(top_keywords), 'OTHER')

kw_train_dum = pd.get_dummies(X_train_kw, prefix='kw', drop_first=False)
kw_val_dum   = pd.get_dummies(X_val_kw,   prefix='kw', drop_first=False)

missing_cols = set(kw_train_dum.columns) - set(kw_val_dum.columns)
for c in missing_cols:
    kw_val_dum[c] = 0
kw_val_dum = kw_val_dum[kw_train_dum.columns]

KW_train = csr_matrix(kw_train_dum.values.astype(int))
KW_val   = csr_matrix(kw_val_dum.values.astype(int))

print("Keyword OHE (capped) shapes:", KW_train.shape, KW_val.shape)

Keyword OHE (capped) shapes: (6851, 201) (762, 201)


## Mean Encoding

In [9]:
location_means = X_train.join(y_train).groupby('location')['target'].mean()

keyword_means = X_train.join(y_train).groupby('keyword')['target'].mean()

global_mean = y_train.mean()
smoothing = 60

def mean_encode_with_smoothing(value, means_dict, global_mean, smoothing, df_ref, col_name):
    if value in means_dict:
        count = (df_ref[col_name] == value).sum()
        return (means_dict[value] * count + global_mean * smoothing) / (count + smoothing)
    return global_mean

X_train['location_mean_encoded'] = X_train['location'].apply(
    lambda x: mean_encode_with_smoothing(x, location_means, global_mean, smoothing, X_train, 'location')
)

X_validation['location_mean_encoded'] = X_validation['location'].apply(
    lambda x: mean_encode_with_smoothing(x, location_means, global_mean, smoothing, X_train, 'location')
)

X_train['keyword_mean_encoded'] = X_train['keyword'].apply(
    lambda x: mean_encode_with_smoothing(x, keyword_means, global_mean, smoothing, X_train, 'keyword')
)

X_validation['keyword_mean_encoded'] = X_validation['keyword'].apply(
    lambda x: mean_encode_with_smoothing(x, keyword_means, global_mean, smoothing, X_train, 'keyword')
)

print(f"\nMean Encoding aplicado a location y keyword")
print(X_train[['location', 'location_mean_encoded', 'keyword', 'keyword_mean_encoded']].head(10))


Mean Encoding aplicado a location y keyword
                        location  location_mean_encoded            keyword  \
5286                 no_location               0.423231           outbreak   
3193                     Indiana               0.402861     emergency_plan   
5272                 no_location               0.423231          oil_spill   
6816                     Orlando               0.422674            trapped   
2996                 Lubbock, TX               0.439067         dust_storm   
5438                 no_location               0.423231             police   
1203  KurveZ@GearHeadCentral.net               0.422674  buildings_burning   
3712                 no_location               0.423231               fear   
983                     New York               0.326265           body_bag   
3148            Indianapolis, IN               0.431985          emergency   

      keyword_mean_encoded  
5286              0.632076  
3193              0.497642  
5272     

## Entrenamiento del modelo

In [10]:
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.feature_selection import SelectKBest, chi2

numeric_features = ['text_length', 'word_count', 'hashtag_count', 'mention_count',
                   'exclamation_count', 'question_count', 'caps_word_count',
                   'unique_word_ratio', 'avg_word_length',
                   'has_keyword', 'has_location',
                   'location_mean_encoded', 'keyword_mean_encoded']

kbd = KBinsDiscretizer(n_bins=5, encode='onehot', strategy='quantile')

X_train_num_binned = kbd.fit_transform(X_train[numeric_features].values)
X_val_num_binned   = kbd.transform(X_validation[numeric_features].values)

X_train_combined = hstack([
    X_train_tfidf,
    X_train_char,
    KW_train,
    X_train_num_binned])

X_validation_combined = hstack([
    X_validation_tfidf,
    X_validation_char,
    KW_val,
    X_val_num_binned])

print(f"\nDimensiones finales del conjunto de entrenamiento: {X_train_combined.shape}")
print(f"Dimensiones finales del conjunto de validación: {X_validation_combined.shape}")

selector = SelectKBest(score_func=chi2, k=100000)
X_train_sel = selector.fit_transform(X_train_combined, y_train)
X_val_sel   = selector.transform(X_validation_combined)

print("After chi2 selection:", X_train_sel.shape, X_val_sel.shape)

X_train_combined = X_train_sel
X_validation_combined = X_val_sel

print("\nEntrenando modelo con GridSearchCV...")
grid.fit(X_train_combined, y_train)

print(f"\nMejores parámetros: {grid.best_params_}")
print(f"Mejor score F1 (CV): {grid.best_score_:.4f}")

best_model = grid.best_estimator_


Dimensiones finales del conjunto de entrenamiento: (6851, 69199)
Dimensiones finales del conjunto de validación: (762, 69199)
After chi2 selection: (6851, 69199) (762, 69199)

Entrenando modelo con GridSearchCV...





Mejores parámetros: {'alpha': np.float64(0.3511191734215131), 'fit_prior': True, 'norm': False}
Mejor score F1 (CV): 0.8167


## Evaluación del modelo

In [11]:
print(f"Mejores parámetros: {grid.best_params_}")

Mejores parámetros: {'alpha': np.float64(0.3511191734215131), 'fit_prior': True, 'norm': False}


In [12]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from scipy.sparse import hstack, csr_matrix

X_validation_combined_raw = hstack([
    X_validation_tfidf,
    X_validation_char,
    KW_val,
    X_val_num_binned])

X_validation_combined = selector.transform(X_validation_combined_raw)

print(f"\nReconstructed X_validation_combined shape: {X_validation_combined.shape}")

y_pred_validation = best_model.predict(X_validation_combined)
y_pred_training = best_model.predict(X_train_combined)

print("Reporte de clasificación:")
print(classification_report(y_validation, y_pred_validation))

print("\nMatriz de confusión:")
print(confusion_matrix(y_validation, y_pred_validation))

print(f"\nF1-Score en validación: {f1_score(y_validation, y_pred_validation):.4f}")

print("\nReporte de clasificación (conjunto de entrenamiento):")
print(classification_report(y_train, y_pred_training))

print("\nMatriz de confusión (conjunto de entrenamiento):")
print(confusion_matrix(y_train, y_pred_training))

print(f"\nF1-Score en entrenamiento: {f1_score(y_train, y_pred_training):.4f}")


Reconstructed X_validation_combined shape: (762, 69199)
Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       435
           1       0.84      0.75      0.79       327

    accuracy                           0.83       762
   macro avg       0.83      0.82      0.83       762
weighted avg       0.83      0.83      0.83       762


Matriz de confusión:
[[390  45]
 [ 83 244]]

F1-Score en validación: 0.7922

Reporte de clasificación (conjunto de entrenamiento):
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      3907
           1       0.93      0.82      0.87      2944

    accuracy                           0.90      6851
   macro avg       0.90      0.89      0.89      6851
weighted avg       0.90      0.90      0.89      6851


Matriz de confusión (conjunto de entrenamiento):
[[3715  192]
 [ 526 2418]]

F1-Score en entrenamiento: 0.8707


In [13]:
from sklearn.metrics import f1_score
import numpy as np

val_proba = best_model.predict_proba(X_validation_combined)[:, 1]

thresholds = np.linspace(0.01, 0.99, 200)
f1_scores = [(thr, f1_score(y_validation, (val_proba >= thr).astype(int)))
             for thr in thresholds]

best_thr, best_f1 = max(f1_scores, key=lambda x: x[1])

print(f"Best threshold on validation: {best_thr:.3f}")
print(f"F1 on validation with best threshold: {best_f1:.4f}")

def predict_with_threshold(model, X, thr=best_thr):
    proba = model.predict_proba(X)[:, 1]
    return (proba >= thr).astype(int)

Best threshold on validation: 0.394
F1 on validation with best threshold: 0.8006


## Predicciones en el conjunto de test

In [14]:
testeo['text'] = testeo['text'].apply(preprocess_text)
testeo['keyword'] = testeo['keyword'].apply(preprocess_keyword)
testeo['location'] = testeo['location'].apply(preprocess_location)

testeo = create_numeric_features(testeo)
testeo = create_categorical_features(testeo)

testeo['text_str'] = testeo['text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

X_test_tfidf = tfidf.transform(testeo['text_str'])
X_test_char = char_tfidf.transform(testeo['text_str'])

X_test_kw = testeo['keyword'].where(testeo['keyword'].isin(top_keywords), 'OTHER')
kw_test_dum = pd.get_dummies(X_test_kw, prefix='kw', drop_first=False)

for c in kw_train_dum.columns:
    if c not in kw_test_dum.columns:
        kw_test_dum[c] = 0
kw_test_dum = kw_test_dum[kw_train_dum.columns]
KW_test = csr_matrix(kw_test_dum.values.astype(int))

testeo['location_mean_encoded'] = testeo['location'].apply(
    lambda x: mean_encode_with_smoothing(x, location_means, global_mean, smoothing, X_train, 'location')
)

testeo['keyword_mean_encoded'] = testeo['keyword'].apply(
    lambda x: mean_encode_with_smoothing(x, keyword_means, global_mean, smoothing, X_train, 'keyword')
)

X_test_num_binned = kbd.transform(testeo[numeric_features].values)

X_test_combined = hstack([
    X_test_tfidf,
    X_test_char,
    KW_test,
    X_test_num_binned
])

print(f"Dimensiones del conjunto de test (antes de selección): {X_test_combined.shape}")

X_test_combined = selector.transform(X_test_combined)
print(f"Dimensiones del conjunto de test (después de selección): {X_test_combined.shape}")

predictions = predict_with_threshold(best_model, X_test_combined, best_thr)

submission = pd.DataFrame({
    'id': testeo['id'],
    'target': predictions
})

submission.to_csv('submission_naive_bayes.csv', index=False)
print("\nArchivo 'submission_naive_bayes.csv' creado exitosamente!")
print(f"Distribución de predicciones: {pd.Series(predictions).value_counts()}")

  df_copy['has_time_reference']= t.str.contains(r'\b(now|today|tonight|yesterday|urgent|breaking)\b').astype(int)
  df_copy['elongated_token'] = t.str.contains(r'(.)\1\1+').astype(int)


Dimensiones del conjunto de test (antes de selección): (3263, 69199)
Dimensiones del conjunto de test (después de selección): (3263, 69199)

Archivo 'submission_naive_bayes.csv' creado exitosamente!
Distribución de predicciones: 0    2001
1    1262
Name: count, dtype: int64
