In [44]:
import pandas as pd
pd.set_option('max_colwidth', -1)
import string
import nltk
nltk.download('stopwords')
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, confusion_matrix

[nltk_data] Downloading package stopwords to /home/ndv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
df_train = pd.read_csv('../dataset/train.csv')
df_test = pd.read_csv('../dataset/test.csv')
df_train.drop(columns=['location', 'keyword'], inplace=True)
df_test.drop(columns=['location', 'keyword'], inplace=True)

In [42]:
# Quitamos las urls
df_train['text'] = df_train['text'].str.replace(r'http:\/\/.*', '', regex=True).replace(r'https:\/\/.*', '', regex=True)
df_test['text'] = df_test['text'].str.replace(r'http:\/\/.*', '', regex=True).replace(r'https:\/\/.*', '', regex=True)

# Quitamos user mentions, signos de puntuación, hashtags y stopwords.
def clean_text(text):
    words = text.lower().split(' ')
    text = ' '.join([word for word in words if not word.startswith('@') and word not in stopwords.words('english')])
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

df_train['text'] = df_train['text'].apply(clean_text)
df_test['text'] = df_test['text'].apply(clean_text)

In [48]:
train_text = df_train['text']
train_target = df_train['target']

X_train, X_test, y_train, y_test = train_test_split(train_text, train_target, test_size=0.20, random_state=17)

# Probaremos con unigramas primero por lo que dejaremos el ngram_range en su valor default.
# Sólo agregaremos al vocabulario aquellas palabras que aparecen más de 5 veces para filtrar palabras
# poco frecuentes.
tfidf = TfidfVectorizer(min_df=5)
X_train_vecs = tfidf.fit_transform(X_train)
X_test_vecs = tfidf.transform(X_test)

In [51]:
X_train_vecs.shape, y_train.shape, X_test_vecs.shape, y_test.shape

((6090, 2169), (6090,), (1523, 2169), (1523,))

In [52]:
# Ahora usaremos XGBoost para intentar predecir el target de los tweets utilizando
# este nuevo formato como features.

param_grid = { 
    'n_estimators': [x for x in range(10, 100, 10)],
    'max_depth': [3, 5, 10],
    'colsample_bytree': [0.5, 0.7, 0.9]
}

xgb_classifier = xgb.XGBClassifier(random_state=17, learning_rate=0.1)
CV_xgb = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=5, scoring='f1', n_jobs=4)
CV_xgb.fit(X_train_vecs, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constrain...
                                     random_state=17, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, verbosity=None),
             iid='warn'

In [53]:
CV_xgb.best_params_

{'colsample_bytree': 0.7, 'max_depth': 10, 'n_estimators': 90}

In [54]:
def try_classifier(classifier):
    predictions = classifier.predict(X_test_vecs)
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    print(f'Verdaderos Negativos: {tn}')
    print(f'Falsos Negativos: {fn}')
    print(f'Verdaderos Positivos: {tp}')
    print(f'Falsos Positivos: {fp}')
    print()
    print(f'f1 score: {f1_score(y_test,  predictions)}')

In [55]:
try_classifier(CV_xgb)

Verdaderos Negativos: 798
Falsos Negativos: 320
Verdaderos Positivos: 347
Falsos Positivos: 58

f1 score: 0.6473880597014926


In [59]:
# Ahora usaremos todos los datos del set de entrenamiento para entrenar el modelo que dio los
# mejores resultados y luego haremos la predicción para el set de test.
best_xgb = CV_xgb.best_estimator_

tfidf_pred = TfidfVectorizer(min_df=5)
train_text_vecs = tfidf_pred.fit_transform(train_text)
test_text_vecs = tfidf_pred.transform(df_test['text'])

best_xgb.fit(train_text_vecs, train_target)
kaggle_pred = best_xgb.predict(test_text_vecs)

In [62]:
df_test['target'] = kaggle_pred
df_test.head()

Unnamed: 0,id,text,target
0,0,happened terrible car crash,1
1,2,heard earthquake different cities stay safe everyone,1
2,3,forest fire spot pond geese fleeing across street cannot save,1
3,9,apocalypse lighting spokane wildfires,0
4,11,typhoon soudelor kills 28 china taiwan,1


In [65]:
df_test[['id', 'target']].to_csv('../submits/xgboost_tfidf.csv', index=False)

In [66]:
best_xgb

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=90, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=17, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)