# Punto 5
Fake News identification in COVID public discussion. Type: Different approaches.

In [5]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
#install stopwords
nltk.download()

In [6]:
import pandas as pd
import json
import gensim
import nltk
from nltk.corpus import stopwords
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
# Lib Metrics
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [12]:
prefix = './drive/MyDrive/datasets/italian/'
fake_ruta = prefix + 'fake_clean.json'
true_ruta = prefix + 'real_clean.json'
# fake2_ruta = prefix + 'gpt2.json'

fake_news = pd.read_json(fake_ruta, lines=True)
fake_news['tag'] = 0
true_news = pd.read_json(true_ruta, lines=True)
true_news['tag'] = 1
# fake_news_2 = pd.read_json(fake2_ruta, lines=True)
# fake_news_2['tag'] = 0
frames = [fake_news, true_news]
df = pd.concat(frames)
df.head(10)

Unnamed: 0,text,tag,language
0,Un post afferma che la vaccinazione obbligator...,0,it
1,Una foto afferma che questa persona è un medic...,0,it
2,Post su un video sostiene che si tratta di una...,0,it
3,Tutti i decessi per insufficienza respiratoria...,0,it
4,Il preside del College of Biologists of Euskad...,0,it
5,Le famiglie con pazienti COVID-19 a Porto Aleg...,0,it
6,Una catena elenca le raccomandazioni per preve...,0,it
7,60.000 aziende argentine hanno chiuso a causa ...,0,it
8,I post sui social media criticano una foto del...,0,it
9,CDC ha rilasciato un aggiornamento su come può...,0,it


## Processing

In [28]:
import re
import string
stop = stopwords.words('italian')

def review_cleaning(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text
df['text']=df['text'].apply(lambda x:review_cleaning(x))
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df

Unnamed: 0,text,tag,language
0,post afferma vaccinazione obbligatoria viola p...,0,it
1,foto afferma persona medico morto dopo aver as...,0,it
2,post video sostiene tratta protesta confinazio...,0,it
3,decessi insufficienza respiratoria polmonite v...,0,it
4,preside college biologists euskadi afferma mol...,0,it
...,...,...,...
7257,lindustria pesca windsoressex stata chiusa dur...,1,it
7258,lavoratore health canada risulta positivo,1,it
7259,taxi adattano scudi plastica limite passeggeri,1,it
7260,uomo straordinario giovane padre stato identif...,1,it


In [29]:
from tensorflow.keras.preprocessing.text import one_hot
#One hot encoding 
voc_size = 14535
onehot_repr=[one_hot(words,voc_size)for words in df['text']]

## Splitting Datasets

In [30]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

#Padding the sentences
embedded_docs=pad_sequences(onehot_repr,padding='pre')
print(embedded_docs.shape)

(16989, 69)


In [31]:
# Converting the X and y as array
X_final=np.array(embedded_docs)
y_final=np.array(df['tag'])

#Check shape of X and y final
X_final.shape,y_final.shape

((16989, 69), (16989,))

In [32]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.33, random_state=42)

## Model

In [33]:
#Creating the lstm model
sent_length = 5000
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100)) #Adding 100 lstm neurons in the layer
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))

#Compiling the model
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 5000, 40)          581400    
_________________________________________________________________
dropout_2 (Dropout)          (None, 5000, 40)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               56400     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 637,901
Trainable params: 637,901
Non-trainable params: 0
_________________________________________________________________
None


In [34]:
model.fit(X_train,y_train,validation_data=(X_val,y_val),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7efb9d3ad210>

## Evaluation of the model

In [35]:
# Predicting from test data
y_pred=model.predict_classes(X_test)


#Creating confusion matrix
#confusion_matrix(y_test,y_pred)
cm = metrics.confusion_matrix(y_test, y_pred)
# plot_confusion_matrix(cm,classes=['Fake','True'])
print(cm)





[[2993  254]
 [ 281 2079]]


In [36]:
accuracy_score(y_test,y_pred)

0.9045835562689495

In [37]:
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.91      0.92      0.92      3247
           1       0.89      0.88      0.89      2360

    accuracy                           0.90      5607
   macro avg       0.90      0.90      0.90      5607
weighted avg       0.90      0.90      0.90      5607



In [52]:
voc_size = 14535
arr = [
       'Google lancia la nuova mappa globale Covid-19 per i giornalisti journalist',
]
onehot_repr=[one_hot(words,voc_size)for words in arr]
embedded_docs=pad_sequences(onehot_repr,padding='pre')
model.predict(embedded_docs, batch_size=64)

array([[0.49541575]], dtype=float32)