In [6]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go


from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from xgboost import XGBClassifier




In [3]:
def mostrar_metricas(y_test, y_pred):
    """Print a classification report and plot the row-normalised confusion matrix.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        None. The report is printed and the heatmap is displayed with plotly.

    Note:
        The heatmap axes are hard-coded to ['Fake', 'Real'], so a two-class
        problem is assumed. NOTE(review): which label maps to 'Fake' vs 'Real'
        depends on the label ordering used by ``confusion_matrix`` -- confirm
        against the dataset.
    """
    print('Reporte de clasificación: \n', metrics.classification_report(y_test, y_pred))

    # Raw counts: rows are true classes, columns are predicted classes.
    counts = metrics.confusion_matrix(y_test, y_pred).astype('float')

    # Convert each row to fractions of that true class. A label that only
    # appears in y_pred produces an all-zero row; leave it at 0 instead of
    # dividing by zero and filling the heatmap with NaN.
    row_totals = counts.sum(axis=1, keepdims=True)
    matrix = np.divide(counts, row_totals, out=np.zeros_like(counts), where=row_totals != 0)

    fig = px.imshow(matrix,
                labels=dict(x="Predicción", y="Valor real", color="Porcentaje"),
                x=['Fake', 'Real'],
                y=['Fake', 'Real'])
    fig.update_layout(title_text='Matriz de confusión')
    fig.show()
def resultados(pred, test_df):
    """Write the submission CSV pairing each test id with its prediction.

    Args:
        pred: Predicted values, one per row of ``test_df``.
        test_df: DataFrame that contains an 'id' column.

    Returns:
        None. The file 'data/submission.csv' is written with the columns
        'id' and 'target' and without the index.
    """
    # Keep only the id column and attach the predictions next to it.
    submission = test_df[['id']].assign(target=pred)
    submission.to_csv('data/submission.csv', index=False)

ENTRENAMIENTO

In [17]:
# Load the train and test sets; the text is read from 'text_clean' and the
# label from 'target'.
train_df = pd.read_csv('data/train_fttd.csv', encoding='utf-8')
test_df = pd.read_csv('data/test_fttd.csv', encoding='utf-8')

X_train = train_df['text_clean']
y = train_df['target']
X_test = test_df['text_clean']

# Document-term matrix: the vocabulary is learned on the training text only
# and then applied unchanged to the test text.
count_vect = CountVectorizer()
X_train = count_vect.fit_transform(X_train)
X_test = count_vect.transform(X_test)

# TF-IDF weighting, again fitted on the training matrix only.
tfidf_transformer = TfidfTransformer()
X_train = tfidf_transformer.fit_transform(X_train)
X_test = tfidf_transformer.transform(X_test)

# Base estimator; the remaining hyperparameters are sampled by the search below.
xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic')

# Hyperparameter grid to sample from.
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5, 7, 15]
        }

folds = 3
param_comb = 5
skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 7506)
# Pass the splitter itself as `cv`. The previous `skf.split(X, y)` referenced
# `X`, which is never defined in this notebook, so the cell raised NameError
# on a fresh kernel. The search calls the splitter on the data given to fit().
random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf, verbose=3, random_state=7506 )
random_search.fit(X_train, y)

print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
# The search is scored with ROC AUC; 2 * AUC - 1 rescales it to the normalized Gini.
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)


Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:   46.8s finished



 All results:
{'mean_fit_time': array([ 7.85948896,  8.7770737 , 13.1001416 , 13.81533925, 11.53697244]), 'std_fit_time': array([0.53891724, 1.78368019, 0.17600835, 0.1108395 , 1.22256217]), 'mean_score_time': array([0.13046304, 0.09901396, 0.12388309, 0.11425098, 0.10026781]), 'std_score_time': array([0.02128783, 0.02101525, 0.0115644 , 0.00713818, 0.02702409]), 'param_subsample': masked_array(data=[0.6, 0.8, 1.0, 0.8, 0.6],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_min_child_weight': masked_array(data=[1, 10, 10, 1, 10],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_max_depth': masked_array(data=[5, 3, 4, 4, 5],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_gamma': masked_array(data=[1.5, 1, 2, 0.5, 1.5],
             mask=[False, False, False, False, False],
       fill_value='?',
  

In [18]:
# Predict class probabilities for the TF-IDF test matrix with the fitted search object.
y_pred = random_search.predict_proba(X_test)

In [19]:
# For each row, take the index of the highest-probability column as the prediction.
pred = y_pred.argmax(axis=1)

# Write the predictions next to the test ids in the submission file.
resultados(pred, test_df)