In [1]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics



In [7]:
def mostrar_metricas(y_test, y_pred):
    """Print a classification report and show a row-normalised confusion matrix.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, in the same order as ``y_test``.

    The heatmap values are fractions in [0, 1] (each row is divided by its
    total), even though the colour bar is labelled "Porcentaje".
    """
    print('Reporte de clasificación: \n', metrics.classification_report(y_test, y_pred))

    # Build the confusion matrix of raw counts and normalise each row by its total.
    counts = metrics.confusion_matrix(y_test, y_pred).astype('float')
    row_totals = counts.sum(axis=1, keepdims=True)
    # A row sums to zero when a class has no true samples; leave such rows at 0
    # instead of dividing by zero and filling the plot with NaN.
    matrix = np.divide(counts, row_totals, out=np.zeros_like(counts), where=row_totals != 0)

    # NOTE(review): the axis labels are hard-coded for two classes; presumably
    # label 0 is 'Fake' and label 1 is 'Real' — confirm against the dataset.
    fig = px.imshow(matrix,
                labels=dict(x="Predicción", y="Valor real", color="Porcentaje"),
                x=['Fake', 'Real'],
                y=['Fake', 'Real'])
    fig.update_layout(title_text='Matriz de confusión')
    fig.show()
def resultados(pred, test_df, path='data/submission.csv'):
    """Write predictions to a two-column CSV (``id``, ``target``).

    Args:
        pred: Predicted targets, one per row of ``test_df`` and in the same
            row order. May be a list, NumPy array or pandas Series.
        test_df: DataFrame that contains an ``id`` column.
        path: Destination CSV file. Defaults to ``'data/submission.csv'``.

    Raises:
        ValueError: If ``pred`` does not have one value per row of ``test_df``.
    """
    res_df = pd.DataFrame({'id': test_df['id']})
    # Assign by position: a Series would otherwise be aligned on its index and
    # silently produce NaN targets when that index differs from test_df's.
    res_df['target'] = np.asarray(pred)
    res_df.to_csv(path, index=False)

In [6]:
# Load the train and test CSVs.
train_df = pd.read_csv('data/train_fttd.csv', encoding='utf-8')
test_df = pd.read_csv('data/test_fttd.csv', encoding='utf-8')

X, y = train_df['text_clean'], train_df['target']

# Hold out 25% of the training rows for evaluation; the seed is fixed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=7506
)

# Document-term counts followed by TF-IDF weighting. Both transformers are
# fitted on the training split only and then applied to the held-out split.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_train = tfidf_transformer.fit_transform(count_vect.fit_transform(X_train))
X_test = tfidf_transformer.transform(count_vect.transform(X_test))

# Train a K-nearest-neighbours classifier with K = 79.
KNN = KNeighborsClassifier(n_neighbors=79, metric='minkowski')
KNN.fit(X_train, y_train)

# Metrics on the held-out split.
y_pred = KNN.predict(X_test)
mostrar_metricas(y_test, y_pred)


# Look for a 'good' K: try K = 1, 4, 7, ..., 97 and record the accuracy that
# each model reaches on (X_test, y_test).
k_range = range(1, 100, 3)
scores = []

for k in k_range:
    # fit() returns the estimator itself, so KNN is still bound to the most
    # recently trained model after every iteration.
    KNN = KNeighborsClassifier(n_neighbors=k, metric='minkowski').fit(X_train, y_train)
    scores.append(KNN.score(X_test, y_test))

# Plot accuracy as a function of K.
fig = go.Figure(
    data=[go.Scatter(x=list(k_range), y=scores, mode='lines+markers')]
)
fig.update_layout(
    title_text='KNN: Exactitud en función de K',
    xaxis_title='K',
    yaxis_title='Accuracy',
    showlegend=False,
    template="plotly",
)

fig.show()

Reporte de clasificación: 
               precision    recall  f1-score   support

           0       0.74      0.88      0.81      1061
           1       0.81      0.61      0.70       843

    accuracy                           0.76      1904
   macro avg       0.77      0.75      0.75      1904
weighted avg       0.77      0.76      0.76      1904



TEST

In [10]:
# Final model: train on every row of train_df and vectorise the test set text.
X_train, y_train = train_df['text_clean'], train_df['target']
X_test = test_df['text_clean']

# Document-term counts followed by TF-IDF weighting; both transformers are
# fitted on the training text only and then applied to the test text.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_train = tfidf_transformer.fit_transform(count_vect.fit_transform(X_train))
X_test = tfidf_transformer.transform(count_vect.transform(X_test))

# Train with K = 25.
KNN = KNeighborsClassifier(n_neighbors=25, metric='minkowski')
# Kept as the cell's last expression so the fitted estimator's repr is shown.
KNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=25, p=2,
                     weights='uniform')

In [11]:
# Predict the test set targets and write them out through resultados().
y_pred = KNN.predict(X_test)
resultados(pred=y_pred, test_df=test_df)