# Neural Network

## Lectura de datos

In [7]:
import pandas as pd
import nltk
import gensim
import sqlite3

sqlite_db = "./dataset/fake_news.sqlite"

conn = sqlite3.connect(sqlite_db)
df = pd.read_sql_query("SELECT * FROM NEWS", conn)
conn.close()

#### Total de palabras en el Dataset

In [8]:
list_of_words = []
unique_words = set()
for document in df_news.clean_joined:
    for word in gensim.utils.simple_preprocess(document):
        list_of_words.append(word)
        unique_words.add(word)
        
total_words = len(list_of_words)  # total words
unique_words = len(unique_words)   # total unique words
print("Total words:" + str(total_words) + " unique_words:" + str(unique_words))

Total words:8844981 unique_words:96148


## Partición del dataset

In [9]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df.clean_joined, df.isfake, test_size = 0.2)

In [10]:
from nltk import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words = total_words)
tokenizer.fit_on_texts(x_train)
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

In [5]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

x_train_padded = pad_sequences(x_train_seq, maxlen = 40, padding = 'post', truncating = 'post')
x_test_padded = pad_sequences(x_test_seq, maxlen = 40, truncating = 'post') 

In [6]:
print(x_train_seq[0])

[10357, 1915, 2356, 165, 15633, 789, 337, 22764, 41012, 9333, 293, 1894, 982, 26, 2356, 10357, 1915, 1450, 5, 489, 131, 44, 22764, 1772, 1915, 131, 306, 2167, 54, 3841, 1392, 1772, 514, 697, 165, 6699, 16024, 683, 834, 1180, 834, 31, 1772, 55208, 76, 31, 3011, 1772, 208, 1666, 288, 390, 1772, 1772, 834, 3163, 3223, 778, 67, 3494, 55209, 242, 208, 2356, 10357, 1915, 489, 131, 204, 489, 1641, 78, 70, 694, 2876, 66, 41012, 22764, 650, 11806, 1153, 2356, 1915, 22764, 865, 8500, 884, 30, 1212, 277, 2122, 3011, 3301, 49, 22764, 247, 3, 7145, 1751, 2025, 10126, 86, 208, 65, 7, 3011, 2031, 1772, 4575, 11806, 2985, 1888, 1460, 2098, 292, 179, 163, 204, 784, 468, 3176, 3841, 98, 288, 149, 3494, 22764, 98, 149, 3494, 36, 1188, 4, 697, 834, 250, 107, 250, 15633, 149, 149, 3494, 2356, 349, 212, 4, 848, 232, 10357, 1915, 164, 1221, 1309, 955, 22764, 860, 2398, 171, 1188, 22764, 1116, 184, 26, 2356, 1915, 1666, 7, 191, 4, 319, 1612, 22, 4, 402, 209, 589, 191, 161, 1666, 26, 275, 119, 53, 536, 3112, 4

In [7]:
print(x_train_padded[0])

[10357  1915  2356   165 15633   789   337 22764 41012  9333   293  1894
   982    26  2356 10357  1915  1450     5   489   131    44 22764  1772
  1915   131   306  2167    54  3841  1392  1772   514   697   165  6699
 16024   683   834  1180]


In [8]:
y_train = np.asarray(y_train).astype('float32').reshape((-1,1))

## Construcción del modelo

In [9]:
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout, Embedding

In [10]:
modelo=Sequential()

In [11]:
modelo.add(Embedding(total_words,input_length=40, output_dim = 128))
modelo.add(Flatten())
modelo.add(Dense(units=64, activation='relu'))
# Adding dropout to prevent overfitting (regularización)
modelo.add(Dropout(0.2)) # 10% out in each epoc
modelo.add(Dense(units=32, activation='relu'))
# Adding dropout to prevent overfitting (regularización)
modelo.add(Dropout(0.2))
modelo.add(Dense(units=16, activation='relu'))
modelo.add(Dropout(0.2))
modelo.add(Dense(units=1, activation='sigmoid'))

In [12]:
modelo.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [13]:
modelo.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 40, 128)           1132157568
_________________________________________________________________
flatten (Flatten)            (None, 5120)              0         
_________________________________________________________________
dense (Dense)                (None, 64)                327744    
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                5

In [None]:
history = modelo.fit(x_train_padded, y_train, batch_size=2, epochs=2,validation_split = 0.2)

## Evaluación

In [None]:
prediction = model.predict(x_test_padded)

In [None]:
prediction_list = []
for i in range(len(prediction)):
    if prediction[i].item() > 0.5:
        prediction_list.append(1)
    else:
        prediction_list.append(0)

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(list(y_test), prediction_list)

print("Model Accuracy : ", accuracy)

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

cm = confusion_matrix(list(y_test), prediction_list, labels=[0,1])

# group_names = ['True Neg','False Pos','False Neg','True Pos']
group_names = ['Verdaderos Positivos','Falsos Positivo','Falsos Negativos','Verdaderos Negativos']
group_counts = ["{0:0.0f}".format(value) for value in
                cm.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in
                     cm.flatten()/np.sum(cm)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
          zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)

precision = cm[1,1] / sum(cm[:,1])
recall    = cm[1,1] / sum(cm[1,:])
f1_score  = 2*precision*recall / (precision + recall)
stats_text = "\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(accuracy,precision,recall,f1_score)

ax= plt.subplot()
sns.heatmap(cm, annot=labels, fmt='', cmap='Blues', ax = ax); #annot=True to annotate cells

ax.set_xlabel('Predicted label' + stats_text)

# labels, title and ticks
ax.set_xlabel("Valores predicción \n\n Métricas" + stats_text)
ax.set_ylabel('Valores Reales')
ax.set_title('Matriz de confusión')
ax.xaxis.set_ticklabels(['Reales(0)', 'Falsas(1)'])
ax.yaxis.set_ticklabels(['Reales(0)', 'Falsas(1)'])