# Punto 5
Fake News identification in COVID public discussion. Type: Different approaches.

In [5]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
#install stopwords
nltk.download()

In [54]:
import pandas as pd
import json
import gensim
import nltk
from nltk.corpus import stopwords
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
# Lib Metrics
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [55]:
prefix = './drive/MyDrive/datasets/spanish/'
fake_ruta = prefix + 'fake_clean.json'
true_ruta = prefix + 'real_clean.json'
# fake2_ruta = prefix + 'gpt2.json'

fake_news = pd.read_json(fake_ruta, lines=True)
fake_news['tag'] = 0
true_news = pd.read_json(true_ruta, lines=True)
true_news['tag'] = 1
# fake_news_2 = pd.read_json(fake2_ruta, lines=True)
# fake_news_2['tag'] = 0
frames = [fake_news, true_news]
df = pd.concat(frames)
df.head(10)

Unnamed: 0,text,tag,language
0,Una publicación afirma que la vacunación oblig...,0,es
1,En una foto se afirma que esta persona es un m...,0,es
2,Publicación sobre un vídeo afirma que se trata...,0,es
3,Todas las muertes por insuficiencia respirator...,0,es
4,El decano del Colegio de Biólogos de Euskadi a...,0,es
5,Los hogares con pacientes con COVID-19 en Port...,0,es
6,Una cadena enumera recomendaciones para preven...,0,es
7,60.000 empresas argentinas han cerrado por COV...,0,es
8,Publicaciones en redes sociales critican una f...,0,es
9,Los CDC han publicado una actualización sobre ...,0,es


## Processing

In [56]:
import re
import string
stop = stopwords.words('spanish')

def review_cleaning(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text
df['text']=df['text'].apply(lambda x:review_cleaning(x))
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df

Unnamed: 0,text,tag,language
0,publicación afirma vacunación obligatoria viol...,0,es
1,foto afirma persona médico falleció tras atend...,0,es
2,publicación vídeo afirma trata protesta encier...,0,es
3,todas muertes insuficiencia respiratoria neumo...,0,es
4,decano colegio biólogos euskadi afirma gran ca...,0,es
...,...,...,...
7257,industria pesquera windsoressex cerró temporad...,1,es
7258,trabajador health canada da positivo,1,es
7259,taxis adaptan pantallas plástico límite pasajeros,1,es
7260,hombre increíble padre joven identificado prim...,1,es


In [57]:
from tensorflow.keras.preprocessing.text import one_hot
#One hot encoding 
voc_size = 14535
onehot_repr=[one_hot(words,voc_size)for words in df['text']]

## Splitting Datasets

In [58]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

#Padding the sentences
embedded_docs=pad_sequences(onehot_repr,padding='pre')
print(embedded_docs.shape)

(16989, 62)


In [59]:
# Converting the X and y as array
X_final=np.array(embedded_docs)
y_final=np.array(df['tag'])

#Check shape of X and y final
X_final.shape,y_final.shape

((16989, 62), (16989,))

In [60]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.33, random_state=42)

## Model

In [61]:
#Creating the lstm model
sent_length = 5000
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100)) #Adding 100 lstm neurons in the layer
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))

#Compiling the model
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 5000, 40)          581400    
_________________________________________________________________
dropout_4 (Dropout)          (None, 5000, 40)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               56400     
_________________________________________________________________
dropout_5 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 637,901
Trainable params: 637,901
Non-trainable params: 0
_________________________________________________________________
None


In [62]:
model.fit(X_train,y_train,validation_data=(X_val,y_val),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7efb8dd655d0>

## Evaluation of the model

In [63]:
# Predicting from test data
y_pred=model.predict_classes(X_test)


#Creating confusion matrix
#confusion_matrix(y_test,y_pred)
cm = metrics.confusion_matrix(y_test, y_pred)
# plot_confusion_matrix(cm,classes=['Fake','True'])
print(cm)





[[3045  202]
 [ 286 2074]]


In [64]:
accuracy_score(y_test,y_pred)

0.9129659354378455

In [65]:
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.91      0.94      0.93      3247
           1       0.91      0.88      0.89      2360

    accuracy                           0.91      5607
   macro avg       0.91      0.91      0.91      5607
weighted avg       0.91      0.91      0.91      5607



In [66]:
voc_size = 14535
arr = [
       'Google lancia la nuova mappa globale Covid-19 per i giornalisti journalist',
]
onehot_repr=[one_hot(words,voc_size)for words in arr]
embedded_docs=pad_sequences(onehot_repr,padding='pre')
model.predict(embedded_docs, batch_size=64)



array([[0.9960425]], dtype=float32)