In [67]:
#https://data-flair.training/blogs/advanced-python-project-detecting-fake-news/ code from here
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import string
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import nltk

In [68]:
# nltk.download()

In [69]:
#Read the data
hoax_data =pd.read_csv('clean_hoax.csv')
real_data = pd.read_csv('clean_real.csv')
hoax_data['label']=['False']*len(hoax_data)
real_data['label']=['True']*len(real_data)

df=hoax_data.append(real_data).sample(frac=1).reset_index().drop(columns=['index'])
df

Unnamed: 0,title_name,label
0,Jokowi Soroti Kesenjangan Akses Vaksin Negara ...,True
1,Bersantap di Hotel Berbintang yang Menerapkan ...,True
2,Dalih Arteta soal Jebloknya Arsenal: Badai Ced...,True
3,"Tarik Dana Haji, BPKH Ancam Masyarakat Tak Da...",False
4,Surat Tim Publikasi Dirgahayu HUT Ke 28 Tahun...,False
...,...,...
8270,Varian Omicron Mendominasi Kasus Covid-19 di I...,True
8271,"Aturan Keluar, PNS cs Dilarang Cuti Natal-Tahu...",True
8272,Jokowi Keluarkan Kartu Corona Indonesia Sehat...,False
8273,Kemenkes: Daerah Tak Capai Target Vaksin Beris...,True


In [70]:
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree
#storing the puntuation free text
df['clean_msg']= df['title_name'].apply(lambda x:remove_punctuation(x))
df.head()

Unnamed: 0,title_name,label,clean_msg
0,Jokowi Soroti Kesenjangan Akses Vaksin Negara ...,True,Jokowi Soroti Kesenjangan Akses Vaksin Negara ...
1,Bersantap di Hotel Berbintang yang Menerapkan ...,True,Bersantap di Hotel Berbintang yang Menerapkan ...
2,Dalih Arteta soal Jebloknya Arsenal: Badai Ced...,True,Dalih Arteta soal Jebloknya Arsenal Badai Cede...
3,"Tarik Dana Haji, BPKH Ancam Masyarakat Tak Da...",False,Tarik Dana Haji BPKH Ancam Masyarakat Tak Dap...
4,Surat Tim Publikasi Dirgahayu HUT Ke 28 Tahun...,False,Surat Tim Publikasi Dirgahayu HUT Ke 28 Tahun...


In [71]:
df['msg_lower']= df['clean_msg'].apply(lambda x: x.lower())
df

Unnamed: 0,title_name,label,clean_msg,msg_lower
0,Jokowi Soroti Kesenjangan Akses Vaksin Negara ...,True,Jokowi Soroti Kesenjangan Akses Vaksin Negara ...,jokowi soroti kesenjangan akses vaksin negara ...
1,Bersantap di Hotel Berbintang yang Menerapkan ...,True,Bersantap di Hotel Berbintang yang Menerapkan ...,bersantap di hotel berbintang yang menerapkan ...
2,Dalih Arteta soal Jebloknya Arsenal: Badai Ced...,True,Dalih Arteta soal Jebloknya Arsenal Badai Cede...,dalih arteta soal jebloknya arsenal badai cede...
3,"Tarik Dana Haji, BPKH Ancam Masyarakat Tak Da...",False,Tarik Dana Haji BPKH Ancam Masyarakat Tak Dap...,tarik dana haji bpkh ancam masyarakat tak dap...
4,Surat Tim Publikasi Dirgahayu HUT Ke 28 Tahun...,False,Surat Tim Publikasi Dirgahayu HUT Ke 28 Tahun...,surat tim publikasi dirgahayu hut ke 28 tahun...
...,...,...,...,...
8270,Varian Omicron Mendominasi Kasus Covid-19 di I...,True,Varian Omicron Mendominasi Kasus Covid19 di In...,varian omicron mendominasi kasus covid19 di in...
8271,"Aturan Keluar, PNS cs Dilarang Cuti Natal-Tahu...",True,Aturan Keluar PNS cs Dilarang Cuti NatalTahun ...,aturan keluar pns cs dilarang cuti nataltahun ...
8272,Jokowi Keluarkan Kartu Corona Indonesia Sehat...,False,Jokowi Keluarkan Kartu Corona Indonesia Sehat...,jokowi keluarkan kartu corona indonesia sehat...
8273,Kemenkes: Daerah Tak Capai Target Vaksin Beris...,True,Kemenkes Daerah Tak Capai Target Vaksin Berisi...,kemenkes daerah tak capai target vaksin berisi...


In [72]:
#defining function for tokenization
import re
def tokenization(text):
    tokens = re.split('\W+',text)
    while("" in tokens) :
        tokens.remove("")
    return tokens

def freqword(token):
    freq_tokens = nltk.FreqDist(token)
    return freq_tokens.most_common()
#applying function to the column
df['msg_tokenized']= df['msg_lower'].apply(lambda x: tokenization(x))
df['msg_tokenize_freq']= df['msg_tokenized'].apply(lambda x: freqword(x))
df.head()


Unnamed: 0,title_name,label,clean_msg,msg_lower,msg_tokenized,msg_tokenize_freq
0,Jokowi Soroti Kesenjangan Akses Vaksin Negara ...,True,Jokowi Soroti Kesenjangan Akses Vaksin Negara ...,jokowi soroti kesenjangan akses vaksin negara ...,"[jokowi, soroti, kesenjangan, akses, vaksin, n...","[(jokowi, 1), (soroti, 1), (kesenjangan, 1), (..."
1,Bersantap di Hotel Berbintang yang Menerapkan ...,True,Bersantap di Hotel Berbintang yang Menerapkan ...,bersantap di hotel berbintang yang menerapkan ...,"[bersantap, di, hotel, berbintang, yang, mener...","[(bersantap, 1), (di, 1), (hotel, 1), (berbint..."
2,Dalih Arteta soal Jebloknya Arsenal: Badai Ced...,True,Dalih Arteta soal Jebloknya Arsenal Badai Cede...,dalih arteta soal jebloknya arsenal badai cede...,"[dalih, arteta, soal, jebloknya, arsenal, bada...","[(dalih, 1), (arteta, 1), (soal, 1), (jeblokny..."
3,"Tarik Dana Haji, BPKH Ancam Masyarakat Tak Da...",False,Tarik Dana Haji BPKH Ancam Masyarakat Tak Dap...,tarik dana haji bpkh ancam masyarakat tak dap...,"[tarik, dana, haji, bpkh, ancam, masyarakat, t...","[(tarik, 1), (dana, 1), (haji, 1), (bpkh, 1), ..."
4,Surat Tim Publikasi Dirgahayu HUT Ke 28 Tahun...,False,Surat Tim Publikasi Dirgahayu HUT Ke 28 Tahun...,surat tim publikasi dirgahayu hut ke 28 tahun...,"[surat, tim, publikasi, dirgahayu, hut, ke, 28...","[(surat, 1), (tim, 1), (publikasi, 1), (dirgah..."


In [73]:
from nltk.corpus import stopwords
list_stopwords = set(stopwords.words('indonesian'))
def remove_stopwords(text):
    output= [i for i in text if i not in list_stopwords]
    return output
df['no_stopwords']= df['msg_tokenized'].apply(lambda x:remove_stopwords(x))
df


Unnamed: 0,title_name,label,clean_msg,msg_lower,msg_tokenized,msg_tokenize_freq,no_stopwords
0,Jokowi Soroti Kesenjangan Akses Vaksin Negara ...,True,Jokowi Soroti Kesenjangan Akses Vaksin Negara ...,jokowi soroti kesenjangan akses vaksin negara ...,"[jokowi, soroti, kesenjangan, akses, vaksin, n...","[(jokowi, 1), (soroti, 1), (kesenjangan, 1), (...","[jokowi, soroti, kesenjangan, akses, vaksin, n..."
1,Bersantap di Hotel Berbintang yang Menerapkan ...,True,Bersantap di Hotel Berbintang yang Menerapkan ...,bersantap di hotel berbintang yang menerapkan ...,"[bersantap, di, hotel, berbintang, yang, mener...","[(bersantap, 1), (di, 1), (hotel, 1), (berbint...","[bersantap, hotel, berbintang, menerapkan, pro..."
2,Dalih Arteta soal Jebloknya Arsenal: Badai Ced...,True,Dalih Arteta soal Jebloknya Arsenal Badai Cede...,dalih arteta soal jebloknya arsenal badai cede...,"[dalih, arteta, soal, jebloknya, arsenal, bada...","[(dalih, 1), (arteta, 1), (soal, 1), (jeblokny...","[dalih, arteta, jebloknya, arsenal, badai, ced..."
3,"Tarik Dana Haji, BPKH Ancam Masyarakat Tak Da...",False,Tarik Dana Haji BPKH Ancam Masyarakat Tak Dap...,tarik dana haji bpkh ancam masyarakat tak dap...,"[tarik, dana, haji, bpkh, ancam, masyarakat, t...","[(tarik, 1), (dana, 1), (haji, 1), (bpkh, 1), ...","[tarik, dana, haji, bpkh, ancam, masyarakat, b..."
4,Surat Tim Publikasi Dirgahayu HUT Ke 28 Tahun...,False,Surat Tim Publikasi Dirgahayu HUT Ke 28 Tahun...,surat tim publikasi dirgahayu hut ke 28 tahun...,"[surat, tim, publikasi, dirgahayu, hut, ke, 28...","[(surat, 1), (tim, 1), (publikasi, 1), (dirgah...","[surat, tim, publikasi, dirgahayu, hut, 28, ko..."
...,...,...,...,...,...,...,...
8270,Varian Omicron Mendominasi Kasus Covid-19 di I...,True,Varian Omicron Mendominasi Kasus Covid19 di In...,varian omicron mendominasi kasus covid19 di in...,"[varian, omicron, mendominasi, kasus, covid19,...","[(varian, 1), (omicron, 1), (mendominasi, 1), ...","[varian, omicron, mendominasi, covid19, inggris]"
8271,"Aturan Keluar, PNS cs Dilarang Cuti Natal-Tahu...",True,Aturan Keluar PNS cs Dilarang Cuti NatalTahun ...,aturan keluar pns cs dilarang cuti nataltahun ...,"[aturan, keluar, pns, cs, dilarang, cuti, nata...","[(aturan, 1), (keluar, 1), (pns, 1), (cs, 1), ...","[aturan, pns, cs, dilarang, cuti, nataltahun]"
8272,Jokowi Keluarkan Kartu Corona Indonesia Sehat...,False,Jokowi Keluarkan Kartu Corona Indonesia Sehat...,jokowi keluarkan kartu corona indonesia sehat...,"[jokowi, keluarkan, kartu, corona, indonesia, ...","[(jokowi, 1), (keluarkan, 1), (kartu, 1), (cor...","[jokowi, keluarkan, kartu, corona, indonesia, ..."
8273,Kemenkes: Daerah Tak Capai Target Vaksin Beris...,True,Kemenkes Daerah Tak Capai Target Vaksin Berisi...,kemenkes daerah tak capai target vaksin berisi...,"[kemenkes, daerah, tak, capai, target, vaksin,...","[(kemenkes, 1), (daerah, 1), (tak, 1), (capai,...","[kemenkes, daerah, capai, target, vaksin, beri..."


In [74]:
# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()


def stemword(tokens):
    output= [stemmer.stem(token) for token in tokens]
    return output

df['stemword']= df['no_stopwords'].apply(lambda x:stemword(x))
df

Unnamed: 0,title_name,label,clean_msg,msg_lower,msg_tokenized,msg_tokenize_freq,no_stopwords,stemword
0,Jokowi Soroti Kesenjangan Akses Vaksin Negara ...,True,Jokowi Soroti Kesenjangan Akses Vaksin Negara ...,jokowi soroti kesenjangan akses vaksin negara ...,"[jokowi, soroti, kesenjangan, akses, vaksin, n...","[(jokowi, 1), (soroti, 1), (kesenjangan, 1), (...","[jokowi, soroti, kesenjangan, akses, vaksin, n...","[jokowi, sorot, senjang, akses, vaksin, negara..."
1,Bersantap di Hotel Berbintang yang Menerapkan ...,True,Bersantap di Hotel Berbintang yang Menerapkan ...,bersantap di hotel berbintang yang menerapkan ...,"[bersantap, di, hotel, berbintang, yang, mener...","[(bersantap, 1), (di, 1), (hotel, 1), (berbint...","[bersantap, hotel, berbintang, menerapkan, pro...","[santap, hotel, bintang, terap, prokes, lengkap]"
2,Dalih Arteta soal Jebloknya Arsenal: Badai Ced...,True,Dalih Arteta soal Jebloknya Arsenal Badai Cede...,dalih arteta soal jebloknya arsenal badai cede...,"[dalih, arteta, soal, jebloknya, arsenal, bada...","[(dalih, 1), (arteta, 1), (soal, 1), (jeblokny...","[dalih, arteta, jebloknya, arsenal, badai, ced...","[dalih, arteta, jeblok, arsenal, badai, cedera..."
3,"Tarik Dana Haji, BPKH Ancam Masyarakat Tak Da...",False,Tarik Dana Haji BPKH Ancam Masyarakat Tak Dap...,tarik dana haji bpkh ancam masyarakat tak dap...,"[tarik, dana, haji, bpkh, ancam, masyarakat, t...","[(tarik, 1), (dana, 1), (haji, 1), (bpkh, 1), ...","[tarik, dana, haji, bpkh, ancam, masyarakat, b...","[tarik, dana, haji, bpkh, ancam, masyarakat, h..."
4,Surat Tim Publikasi Dirgahayu HUT Ke 28 Tahun...,False,Surat Tim Publikasi Dirgahayu HUT Ke 28 Tahun...,surat tim publikasi dirgahayu hut ke 28 tahun...,"[surat, tim, publikasi, dirgahayu, hut, ke, 28...","[(surat, 1), (tim, 1), (publikasi, 1), (dirgah...","[surat, tim, publikasi, dirgahayu, hut, 28, ko...","[surat, tim, publikasi, dirgahayu, hut, 28, ko..."
...,...,...,...,...,...,...,...,...
8270,Varian Omicron Mendominasi Kasus Covid-19 di I...,True,Varian Omicron Mendominasi Kasus Covid19 di In...,varian omicron mendominasi kasus covid19 di in...,"[varian, omicron, mendominasi, kasus, covid19,...","[(varian, 1), (omicron, 1), (mendominasi, 1), ...","[varian, omicron, mendominasi, covid19, inggris]","[varian, omicron, dominasi, covid19, inggris]"
8271,"Aturan Keluar, PNS cs Dilarang Cuti Natal-Tahu...",True,Aturan Keluar PNS cs Dilarang Cuti NatalTahun ...,aturan keluar pns cs dilarang cuti nataltahun ...,"[aturan, keluar, pns, cs, dilarang, cuti, nata...","[(aturan, 1), (keluar, 1), (pns, 1), (cs, 1), ...","[aturan, pns, cs, dilarang, cuti, nataltahun]","[atur, pns, cs, larang, cuti, nataltahun]"
8272,Jokowi Keluarkan Kartu Corona Indonesia Sehat...,False,Jokowi Keluarkan Kartu Corona Indonesia Sehat...,jokowi keluarkan kartu corona indonesia sehat...,"[jokowi, keluarkan, kartu, corona, indonesia, ...","[(jokowi, 1), (keluarkan, 1), (kartu, 1), (cor...","[jokowi, keluarkan, kartu, corona, indonesia, ...","[jokowi, keluar, kartu, corona, indonesia, seh..."
8273,Kemenkes: Daerah Tak Capai Target Vaksin Beris...,True,Kemenkes Daerah Tak Capai Target Vaksin Berisi...,kemenkes daerah tak capai target vaksin berisi...,"[kemenkes, daerah, tak, capai, target, vaksin,...","[(kemenkes, 1), (daerah, 1), (tak, 1), (capai,...","[kemenkes, daerah, capai, target, vaksin, beri...","[kemenkes, daerah, capai, target, vaksin, risi..."


In [117]:
# join list of token as single document string
import ast

def join_text_list(texts):
    texts = ast.literal_eval(str(texts))
    return ' '.join([text for text in texts])
df["finaltext"] = df["stemword"].apply(join_text_list)


In [118]:
#DataFlair - Get the labels
labels=df.label
labels.head()

0     True
1     True
2     True
3    False
4    False
Name: label, dtype: object

In [119]:
#DataFlair - Split the dataset
x_train,x_test,y_train,y_test=train_test_split(df['finaltext'], labels, test_size=0.2, random_state=7)

In [120]:
#DataFlair - Initialize a TfidfVectorizer
tfidf_vectorizer=TfidfVectorizer(max_df=0.7)

#DataFlair - Fit and transform train set, transform test set
tfidf_train=tfidf_vectorizer.fit_transform(x_train).toarray()
tfidf_test=tfidf_vectorizer.transform(x_test).toarray()

In [121]:
terms = tfidf_vectorizer.get_feature_names()
# sum tfidf frequency of each term through documents
sums = tfidf_train.sum(axis=0)
# connecting term to its sums frequency
data = []
for col, term in enumerate(terms):
    data.append((term, sums[col] ))

ranking = pd.DataFrame(data, columns=['term','rank'])
ranking.sort_values('rank', ascending=False)



Unnamed: 0,term,rank
1542,covid19,242.469410
1529,corona,198.074252
6383,vaksin,184.793411
5116,ri,166.132776
4330,omicron,139.248518
...,...,...
3135,kening,0.261364
3557,leser,0.261364
2895,kabel,0.261364
1547,covit,0.261364


In [155]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize

# max_features = 1500

# calc TF vector
cvect = CountVectorizer()
TF_vector = cvect.fit_transform(x_train)

# normalize TF vector
normalized_TF_vector = normalize(TF_vector, norm='l1', axis=1)

# calc IDF
tfidf = TfidfVectorizer(smooth_idf=False)
tfs = tfidf.fit_transform(x_train)
IDF_vector = tfidf.idf_

# hitung TF x IDF sehingga dihasilkan TFIDF matrix / vector
tfidf_mat_train = normalized_TF_vector.multiply(IDF_vector).toarray()

tfidf_mat_train[1]
x_train[1]

'santap hotel bintang terap prokes lengkap'

In [156]:
# max_features = 1500

# calc TF vector
cvect = CountVectorizer()
TF_vector = cvect.fit_transform(x_test)

# normalize TF vector
normalized_TF_vector = normalize(TF_vector, norm='l1', axis=1)

# calc IDF
tfidf = TfidfVectorizer(smooth_idf=False)
tfs = tfidf.fit_transform(x_test)
IDF_vector = tfidf.idf_

# hitung TF x IDF sehingga dihasilkan TFIDF matrix / vector
tfidf_mat_test = normalized_TF_vector.multiply(IDF_vector).toarray()

In [179]:
#DataFlair - Initialize a PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train,y_train)

#DataFlair - Predict on the test set and calculate accuracy
y_pred=pac.predict(tfidf_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')
#DataFlair - Build confusion matrix
confusion_matrix(y_test,y_pred, labels=['True','False'])


Accuracy: 92.21%


array([[1391,   51],
       [  78,  135]], dtype=int64)

In [154]:
max_features = 1500
predict_x = pd.DataFrame({"title_name": ["Gagal Jantung hingga Strok adalah Gejala Omicron", "Terdapat Parasit yang Mematikan di Dalam Vaksin covid19"]})
predict_x['msg_tokenized']= predict_x['title_name'].apply(lambda x: tokenization(x))
predict_x["finaltext"] = predict_x["msg_tokenized"].apply(join_text_list)
final_predict = predict_x["finaltext"]

# calc TF vector
cvect = CountVectorizer(max_features=max_features)
TF_vector = cvect.fit_transform(final_predict)

# normalize TF vector
normalized_TF_vector = normalize(TF_vector, norm='l1', axis=1)

# calc IDF
tfidf = TfidfVectorizer(max_features=max_features, smooth_idf=False)
tfs = tfidf.fit_transform(final_predict)
IDF_vector = tfidf.idf_

# hitung TF x IDF sehingga dihasilkan TFIDF matrix / vector
tfidf_mat_predict_x = normalized_TF_vector.multiply(IDF_vector).toarray()
tfidf_mat_predict_x

array([[0.24187817, 0.        , 0.        , 0.        , 0.24187817,
        0.24187817, 0.24187817, 0.24187817, 0.        , 0.24187817,
        0.        , 0.24187817, 0.        , 0.        , 0.        ],
       [0.        , 0.2116434 , 0.2116434 , 0.2116434 , 0.        ,
        0.        , 0.        , 0.        , 0.2116434 , 0.        ,
        0.2116434 , 0.        , 0.2116434 , 0.2116434 , 0.2116434 ]])

In [182]:
predict_x = pd.DataFrame({"title_name": ["Gagal Jantung hingga Strok adalah Gejala Omicron", "Terdapat Parasit yang Mematikan di Dalam Vaksin covid19"]})
predict_x['msg_tokenized']= predict_x['title_name'].apply(lambda x: tokenization(x))
predict_x["finaltext"] = predict_x["msg_tokenized"].apply(join_text_list)
final_predict = predict_x["finaltext"]
tfidf_predict=tfidf_vectorizer.transform(final_predict).toarray()
print(pac.predict(tfidf_predict))

['True' 'True']
