## Khai báo thư viện 

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tqdm
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Bidirectional, LSTM, GlobalMaxPooling1D

In [33]:
# bert_preprocessor = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
# bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4", trainable=True)

## ISOT Fake News Dataset
Link: https://onlineacademiccommunity.uvic.ca/isot/2022/11/27/fake-news-detection-datasets/

### Đọc dữ liệu

In [34]:
True_data = pd.read_csv("datasets/ISOT Fake News Dataset/True.csv")
Fake_data = pd.read_csv("datasets/ISOT Fake News Dataset/Fake.csv")
True_data['label'] = 0
Fake_data['label'] = 1
print(True_data.shape)
print(Fake_data.shape)

(21417, 5)
(23481, 5)


In [35]:
df_concat = pd.concat([True_data, Fake_data], axis=0)
df_concat.shape

(44898, 5)

### Text processing

In [36]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
sw = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\intern.thccong\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
# Defining contractions
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are",
"thx"   : "thanks"
}

In [38]:
# Remove contraction
def remove_contractions(text):
    return contractions[text.lower()] if text.lower() in contractions.keys() else text

In [39]:
# Function remove_tags to handles HTML tags
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    '''Removes HTML tags: replaces anything between opening and closing <> with empty space'''
    return TAG_RE.sub('', text) 

In [40]:
def cleaned_text(df):
    # convert lower
    df['cleaned_text'] = df['text'].str.lower()
    # Remove html tags
    df['cleaned_text'] = df['cleaned_text'].apply(remove_tags)
    # remove punctuations
    df['cleaned_text'] = df['cleaned_text'].str.replace('[^\w\s]', '', regex=True)
    # remove numeric characters
    df['cleaned_text'] = df['cleaned_text'].str.replace('\d', '', regex=True)
    # remove contractions
    df['cleaned_text'] = df['cleaned_text'].apply(remove_contractions)
    # remove stopwords
    df['cleaned_text'] = df['cleaned_text'].apply(lambda row: " ".join(x for x in row.split() if x not in sw))
    # remove new line
    df['cleaned_text'] = df['cleaned_text'].apply(lambda row: row.replace('\n', ''))

    return df

df_concat = cleaned_text(df_concat)
df_concat[['text', 'cleaned_text']].iloc[0]

text            WASHINGTON (Reuters) - The head of a conservat...
cleaned_text    washington reuters head conservative republica...
Name: 0, dtype: object

In [41]:
df_concat = df_concat.sample(frac = 1)
df_concat.head(10)

Unnamed: 0,title,text,subject,date,label,cleaned_text
11363,MEALS ON WHEELS Shuts the Lyin’ Lefties Up Wit...,Below is the comment in red that MoveOn.org pu...,politics,"Mar 19, 2017",1,comment red moveonorg put video notice say not...
14539,HYSTERICAL! HERE’S HILLARY Yelling It Out On T...,It s been said that Hillary Clinton isn t lika...,politics,"Feb 4, 2016",1,said hillary clinton likable well exhibit peop...
15821,HILLARY CLINTON: We All Know She’s Deceitful A...,"She was an unethical, dishonest lawyer. She c...",politics,"Apr 15, 2015",1,unethical dishonest lawyer conspired violate c...
16225,Brazil's president recovering after prostate s...,RIO DE JANEIRO (Reuters) - Brazilian President...,worldnews,"October 28, 2017",0,rio de janeiro reuters brazilian president mic...
10765,Texas official to retire after criticizing Pla...,"AUSTIN, Texas (Reuters) - A senior Texas healt...",politicsNews,"February 19, 2016",0,austin texas reuters senior texas health offic...
702,U.S. Senator Flake concerned about tax plan im...,WASHINGTON (Reuters) - U.S. Republican Senator...,politicsNews,"November 9, 2017",0,washington reuters us republican senator jeff ...
13220,U.S. firms invited to bid for Saudi nuclear pl...,RIYADH (Reuters) - Saudi Arabia has invited U....,worldnews,"December 4, 2017",0,riyadh reuters saudi arabia invited us firms t...
6079,Top Gun Lobbyist Threatens To Use The ‘Bullet...,A top gun lobbyist has just warned Americans t...,News,"June 1, 2016",1,top gun lobbyist warned americans conservative...
12924,GRAB THE POPCORN! Queen Of Corruption DENIED S...,The Drudge Report has gained access to the rul...,politics,"Sep 24, 2016",1,drudge report gained access rules upcoming meg...
19927,BUSTED! LIBERAL BRAINIACS Steal Trump Sign…In ...,This is just idiotic! A couple of liberal brai...,left-news,"Sep 24, 2016",1,idiotic couple liberal brainiacs decided steal...


### Build vocabulary

In [89]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df_concat['cleaned_text'])

In [90]:
vocab_size = len(tokenizer.word_index) + 1
sequence_length = 1000
embedding_dim = 300

### Chia tập train và test

In [45]:
train_sentences, test_sentences, train_label, test_label = train_test_split(
    df_concat['cleaned_text'], df_concat['label'], test_size=0.2, stratify=df_concat['label']
    )

In [46]:
train_labels = np.array(train_label)
test_label = np.array(test_label)

In [91]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

In [92]:
padded_train_sequences = pad_sequences(train_sequences, maxlen=sequence_length, truncating='post', padding='post')
padded_test_sequences = pad_sequences(test_sequences, maxlen=sequence_length, truncating='post', padding='post')

In [93]:
padded_train_sequences

array([[    36,     12,      4, ...,      0,      0,      0],
       [    36,     12,      4, ...,      0,      0,      0],
       [  2868,   5464,   3735, ...,      0,      0,      0],
       ...,
       [  3670,     12,    324, ...,      0,      0,      0],
       [116558,      3,   5046, ...,      0,      0,      0],
       [  2683,     12,     21, ...,      0,      0,      0]])

In [94]:
padded_train_sequences.shape

(35918, 1000)

### Build model

#### TF-IDF + Linear SVM

In [52]:
# # Create tokenization and modelling pipeline
# model_0 = Pipeline([
#     ("tfidf", TfidfVectorizer()), # Convert words to numbers using tfidf
#     ("SVC", SVC()) # model the text
# ])

# # Fit the pipeline to the training data
# model_0.fit(train_sentences, train_label)

In [53]:
# # Evaluate baseline model
# base_line_score = model_0.score(test_sentences, test_label)
# print(f"accuracy of baseline model: {base_line_score*100:.2f}")

In [54]:
test_sentences

11673    washingtontegucigalpa reuters united states we...
4085     simple tweet stephen king latest post donald t...
10308    remember media used think controlled narrative...
7108     reuters us republican presidentelect donald tr...
18400    tamara holder guest joy reid msnbc show mornin...
                               ...                        
4330     palm beach flawashington reuters top white hou...
5367     donald trump touting prolgbt candidate going k...
4803     washington reuters us state department said tu...
1639     new york reuters us president donald trump sai...
3235     washington reuters us senators reached agreeme...
Name: cleaned_text, Length: 8980, dtype: object

#### TF-IDF + Decision tree

In [55]:
# # Create tokenization and modelling pipeline
# model_1 = Pipeline([
#     ("tfidf", TfidfVectorizer()), 
#     ("decision tree", DecisionTreeClassifier())
# ])

# # Fit the pipeline to the training data
# model_1.fit(train_sentences, train_label)

In [56]:
# # Evaluate baseline model
# base_line_score = model_1.score(test_sentences, test_label)
# print(f"accuracy of baseline model: {base_line_score*100:.2f}")

#### fastText + CNN

#### GloVe + ResNet

In [69]:
embeddings_dictionary = dict()
glove_file = open('pretrained model/glove.840B.300d.txt', encoding="utf8")
for line in glove_file:
    records = line.split(' ')
    word = records[0]
    vector_dimensions = np.asarray(records[1:], dtype='float32')
    embeddings_dictionary[str(word)] = vector_dimensions
glove_file.close()

In [70]:
embeddings_dictionary.__len__()

2196016

In [95]:
# Tạo Embedding Matrix có 300 columns 
# Bao gồm 300-dimensional GloVe word embeddings cho tất cả các từ trong vocab.
embedding_matrix = np.zeros((vocab_size, 300))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

#### GloVe + Bidirectional LSTM

In [96]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=sequence_length))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 300)         63464100  
                                                                 
 bidirectional (Bidirection  (None, 1000, 512)         1140736   
 al)                                                             
                                                                 
 global_max_pooling1d (Glob  (None, 512)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 128)               65664     
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 64670629 (246.70 MB)
Trainable params: 646

In [98]:
model.fit(padded_train_sequences, train_labels, batch_size=64, epochs=6)

Epoch 1/6
  4/562 [..............................] - ETA: 3:51:37 - loss: 0.3790 - accuracy: 0.8438

## Fake news dataset
**Link:** https://www.kaggle.com/competitions/fake-news/overview <br>
**Dataset Description** <br>

**train.csv:** A full training dataset with the following attributes:

- **id:** unique id for a news article
- **title:** the title of a news article
- **author:** author of the news article
- **text:** the text of the article; could be incomplete
- **label:** a label that marks the article as potentially unreliable
    - 1: unreliable
    - 0: reliable
    
**test.csv:** A testing training dataset with all the same attributes at train.csv without the label.

In [None]:
Fake_news_data = pd.read_csv("datasets/Fake news dataset/train.csv")
print(Fake_news_data.shape)
Fake_news_data.head(10)

(20800, 5)


Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
6,6,Life: Life Of Luxury: Elton John’s 6 Favorite ...,,Ever wonder how Britain’s most iconic pop pian...,1
7,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0
8,8,Excerpts From a Draft Script for Donald Trump’...,,Donald J. Trump is scheduled to make a highly ...,0
9,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0


## Fake or real news dataset
This is the repo hosting the data for my fake vs real news project. The data is in a zipped csv file and contains almost 11000 articles tagged as either real or fake.

In [None]:
Fake_or_real_news_data = pd.read_csv("datasets/Fake or real news dataset/fake_or_real_news.csv")
print(Fake_or_real_news_data.shape)
Fake_or_real_news_data.head(10)

(6335, 4)


Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
5,6903,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",FAKE
6,7341,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE
7,95,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL
8,4869,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL
9,2909,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL


## Fake news detection dataset

In [None]:
Fake_news_detection_data = pd.read_csv("datasets/Fake news detection dataset/data.csv")
print(Fake_news_detection_data.shape)
Fake_news_detection_data.head(10)

(4009, 4)


Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1
5,http://beforeitsnews.com/sports/2017/09/jetnat...,JetNation FanDuel League; Week 4,JetNation FanDuel League; Week 4\n% of readers...,0
6,https://www.nytimes.com/2017/10/10/us/politics...,Kansas Tried a Tax Plan Similar to Trump’s. It...,"In 2012, Kansas lawmakers, led by Gov. Sam Bro...",1
7,https://www.reuters.com/article/us-india-cenba...,"India RBI chief: growth important, but not at ...",The Reserve Bank of India (RBI) Governor Urjit...,1
8,https://www.reuters.com/article/us-climatechan...,EPA chief to sign rule on Clean Power Plan exi...,"Scott Pruitt, Administrator of the U.S. Enviro...",1
9,https://www.reuters.com/article/us-air-berlin-...,Talks on sale of Air Berlin planes to easyJet ...,FILE PHOTO - An Air Berlin sign is seen at an ...,1
