In [2]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt

In [3]:
# Load the hoax and real headline sets and tag each row with its class.
# Labels are kept as the strings 'False' / 'True' because the evaluation
# cell builds its confusion matrix with labels=['True', 'False'].
hoax_data = pd.read_csv('clean_hoax.csv').assign(label='False')
# nrows=2000 reads only the first 2000 real headlines (same rows as
# .head(2000)); .assign returns a new frame, avoiding a column write on a
# slice of another frame.
real_data = pd.read_csv('clean_real.csv', nrows=2000).assign(label='True')

# DataFrame.append was removed in pandas 2.0 -> use pd.concat.
# The shuffle is seeded so that the downstream (seeded) train/test split and
# the reported metrics are reproducible under Restart & Run All.
df = (
    pd.concat([hoax_data, real_data])
    .sample(frac=1, random_state=7)
    .reset_index(drop=True)
)
df

Unnamed: 0,title_name,label
0,Air Mata Keluarga Iringi Peresmian Monumen Pah...,True
1,“SEHARI SETELAH DILAKUKAN RAPID TEST COVID-19...,False
2,Pakar Sebut Dibukanya Kembali KCW Mampu Dongkr...,True
3,Ilmuwan Sebut Pandemi Setelah COVID-19 Bisa Le...,True
4,Kasus Pertama Varian Omicron RI Tertular dari ...,True
...,...,...
3070,Kondisi Seorang Remaja Perempuan Setelah Vaks...,False
3071,BPOM Ungkap Perkembangan Terkini Vaksin Merah ...,True
3072,"Omicron Menghantui, Thailand Tangguhkan Wisata...",True
3073,"Belanda Catat 23 Ribu Kasus Corona, Rusia Lunc...",True


In [4]:
# Pull the target column out of the combined frame and preview the first rows
labels = df['label']
labels.head()

0     True
1    False
2     True
3     True
4     True
Name: label, dtype: object

In [5]:
# Hold out 20% of the headlines for evaluation; the split itself is seeded
x_train, x_test, y_train, y_test = train_test_split(
    df['title_name'],
    labels,
    test_size=0.2,
    random_state=7,
)

In [6]:
# Build the TF-IDF vectorizer; terms appearing in more than 70% of the
# training documents are dropped (max_df=0.7).
# NOTE(review): stop_words='english' applies scikit-learn's English list, but
# the titles shown in this notebook look Indonesian -- confirm whether an
# Indonesian stop-word list should be supplied instead.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)

# Learn the vocabulary/IDF weights from the training titles only, then reuse
# the fitted vectorizer to project the held-out titles.
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [7]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix
# (fit() returns the estimator itself, so construction and fitting chain).
mnb = MultinomialNB().fit(tfidf_train, y_train)

# Score the held-out split
pred = mnb.predict(tfidf_test)
score = accuracy_score(y_test, pred)
print(f'Accuracy: {round(score*100,2)}%')

# Confusion matrix with rows/columns ordered 'True' first, then 'False'
confusion_matrix(
    y_test,
    pred,
    labels=['True', 'False'],
)

Accuracy: 88.94%


array([[393,   5],
       [ 63, 154]], dtype=int64)

In [24]:
# Classify one unseen headline with the already-fitted vectorizer and model.
predict_x = pd.DataFrame({"title_name": ["Gagal Jantung hingga Strok adalah Gejala Omicron"]})

# Pass the text column, not the DataFrame itself: iterating a DataFrame yields
# its column labels, so transform(predict_x) would vectorize the literal
# string "title_name" instead of the headline.
# A separate name is used so the held-out matrix `tfidf_test` from the
# evaluation step is not overwritten by this one-row matrix.
tfidf_new = tfidf_vectorizer.transform(predict_x["title_name"])
print(mnb.predict(tfidf_new))

['True']
