In [8]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.linear_model import LogisticRegression



In [9]:
def mostrar_metricas(y_test, y_pred):
    """Print a classification report and plot the row-normalised confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_test``.

    Notes
    -----
    The heat-map axes are hard-coded to ``['Fake', 'Real']``, so a 2x2
    confusion matrix is expected (presumably label 0 = Fake and 1 = Real;
    confirm against the dataset).
    """
    print('Reporte de clasificación: \n', metrics.classification_report(y_test, y_pred))

    # Confusion matrix: rows are the true classes, columns the predicted ones.
    matrix = metrics.confusion_matrix(y_test, y_pred)

    # Normalise every row by its total so each cell is the fraction of that
    # true class (values in [0, 1], shown under the "Porcentaje" colour label).
    # Rows whose total is 0 (a class with no true samples) are left at 0.0
    # instead of turning into NaN through a 0/0 division.
    row_totals = matrix.sum(axis=1, keepdims=True)
    matrix = np.divide(
        matrix,
        row_totals,
        out=np.zeros(matrix.shape, dtype=float),
        where=row_totals != 0,
    )

    fig = px.imshow(matrix,
                labels=dict(x="Predicción", y="Valor real", color="Porcentaje"),
                x=['Fake', 'Real'],
                y=['Fake', 'Real'])
    fig.update_layout(title_text='Matriz de confusión')
    fig.show()
def resultados(pred, test_df, path='data/submission.csv'):
    """Write a submission CSV pairing each test ``id`` with its prediction.

    Parameters
    ----------
    pred : array-like
        Predicted targets, one per row of ``test_df``.
    test_df : pandas.DataFrame
        Test frame; only its ``'id'`` column is read and the frame itself is
        not modified.
    path : str, optional
        Destination CSV file. Defaults to ``'data/submission.csv'``, the
        location that was previously hard-coded.

    The file is written with the columns ``id`` and ``target`` and without
    the index column.
    """
    # Start from a one-column frame holding the ids, then attach predictions.
    res_df = pd.DataFrame(test_df['id'])
    res_df['target'] = pred
    res_df.to_csv(path, index=False)

ENTRENAMIENTO

In [29]:
# Load the pre-processed train and test sets.
train_df = pd.read_csv('data/train_fttd.csv', encoding='utf-8')
test_df = pd.read_csv('data/test_fttd.csv', encoding='utf-8')

# read_csv loads empty fields as NaN and CountVectorizer rejects NaN documents,
# so missing cleaned texts are mapped to '' (train_df itself is left untouched).
X = train_df['text_clean'].fillna('')
y = train_df['target']

# Split: hold out 25% of the labelled data, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=7506)

# Document-term counts: the vocabulary is fitted on the training split only
# and then applied unchanged to the held-out split.
count_vect = CountVectorizer()
X_train = count_vect.fit_transform(X_train)
X_test = count_vect.transform(X_test)

# TF-IDF re-weighting, again fitted on the training split only.
tfidf_transformer = TfidfTransformer()
X_train = tfidf_transformer.fit_transform(X_train)
X_test = tfidf_transformer.transform(X_test)

# Training / parameter search: sweep C over 1.0, 1.1, ..., 2.9.
# linspace + round is used instead of arange with a float step, which
# accumulates rounding noise (e.g. 1.1000000000000001) in the grid values.
c_grid = np.round(np.linspace(1.0, 2.9, 20), 1).tolist()

scores = []
cs = []
for c_value in c_grid:
    LR = LogisticRegression(penalty='l2', C=c_value, max_iter=100)
    LR.fit(X_train, y_train)
    # Mean accuracy on the held-out split for this C.
    scores.append(LR.score(X_test, y_test))
    cs.append(c_value)

GRÁFICO

In [30]:
# Plot: accuracy (scores) against the inverse regularisation strength C (cs).
accuracy_trace = go.Scatter(x=cs, y=scores, mode='lines+markers')

layout_options = dict(
    title_text='Logistic Regression: Exactitud en función de C',
    xaxis_title='C',
    yaxis_title='Accuracy',
    showlegend=False,
    template="plotly",
)

fig = go.Figure(data=[accuracy_trace])
fig.update_layout(**layout_options)
fig.show()

TEST

In [27]:
# Refit on the whole labelled set and vectorise the real test set.
# read_csv loads empty fields as NaN and CountVectorizer rejects NaN documents,
# so missing cleaned texts are mapped to '' (the source frames are not modified).
X_train = train_df['text_clean'].fillna('')
y_train = train_df['target']
X_test = test_df['text_clean'].fillna('')

# Document-term counts: vocabulary fitted on the training texts only.
count_vect = CountVectorizer()
X_train = count_vect.fit_transform(X_train)
X_test = count_vect.transform(X_test)

# TF-IDF re-weighting, fitted on the training counts only.
tfidf_transformer = TfidfTransformer()
X_train = tfidf_transformer.fit_transform(X_train)
X_test = tfidf_transformer.transform(X_test)


# NOTE(review): C=1.8 is presumably the best value read off the accuracy-vs-C
# plot; confirm and keep it in sync if the sweep is re-run.
LR = LogisticRegression(penalty='l2',  C=1.8, max_iter=100)
LR.fit(X_train, y_train)

LogisticRegression(C=1.8, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

SALIDA

In [28]:
# X_test is rebound in more than one cell of this notebook (held-out validation
# features and the real test features). Fail early with a clear message if the
# matrix currently in memory does not match test_df, which happens when cells
# were run out of order.
if X_test.shape[0] != len(test_df):
    raise ValueError(
        'X_test has %d rows but test_df has %d; re-run the TEST cell before '
        'writing the submission.' % (X_test.shape[0], len(test_df))
    )

y_pred = LR.predict(X_test)

# Output: write the submission file.
resultados(y_pred, test_df)