In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt

In [2]:
# Load the hoax and real headline sets and tag each row with its class.
# Labels are kept as the strings 'False' / 'True' (not booleans).
hoax_data = pd.read_csv('clean_hoax.csv')
real_data = pd.read_csv('clean_real.csv').head(2000)  # cap the real set at 2000 rows
hoax_data['label'] = 'False'
real_data['label'] = 'True'

# Stack both sets and shuffle the rows.
# - pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# - random_state makes the shuffle repeatable, so a seeded split of this
#   frame selects the same rows on every run.
# - reset_index(drop=True) discards the old index instead of materialising
#   it as an 'index' column and dropping it again.
df = (
    pd.concat([hoax_data, real_data], ignore_index=True)
    .sample(frac=1, random_state=7)
    .reset_index(drop=True)
)
df

Unnamed: 0,title_name,label
0,Pesan Berantai “Wisma Atlet penuh”,False
1,Trump Dicemooh Gegara Sudah Disuntik Booster V...,True
2,"Sean Gelael Ikut Danau Toba Rally, Ayahnya Jug...",True
3,"Kasus Corona RI 2 Desember Tambah 311, Sembuh ...",True
4,"Menko PMK, Menhub-Ganjar Rapat Persiapan Natar...",True
...,...,...
3070,Organisasi Dokter Setuju RI Ikut Larang Warga ...,True
3071,Literasi COVID-19 dari drh. Moh. Indro Cahyono,False
3072,Badai Cedera Chelsea (Pelan-pelan) Berlalu,True
3073,Update Lengkap Corona RI 10 Desember: Jabar Te...,True


In [3]:
# Target column for the classifier: one 'True' / 'False' string per headline.
labels = df['label']
labels.head()

0    False
1     True
2     True
3     True
4     True
Name: label, dtype: object

In [4]:
# Hold out 20% of the headlines for evaluation.
# random_state pins which rows are selected for a given row order of df.
x_train, x_test, y_train, y_test = train_test_split(
    df['title_name'],
    labels,
    test_size=0.2,
    random_state=7,
)

In [5]:
# TF-IDF features over the headline text.
# max_df=0.7 ignores terms that occur in more than 70% of the training titles.
# NOTE(review): stop_words='english' only filters English words, while the
# titles shown in this notebook are Indonesian -- confirm this is intended.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)

# Learn the vocabulary and IDF weights from the training titles only, then
# reuse that fitted vocabulary to encode the held-out titles.
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [6]:
# Train a multinomial Naive Bayes model on the TF-IDF training matrix
# (fit returns the estimator itself, so construction and fitting can be chained).
mnb = MultinomialNB().fit(tfidf_train, y_train)

# Score the model on the held-out titles.
pred = mnb.predict(tfidf_test)
score = accuracy_score(y_test, pred)
print(f'Accuracy: {round(score*100,2)}%')

# Confusion matrix with rows = actual class and columns = predicted class,
# both ordered as ['True', 'False'].
confusion_matrix(y_test, pred, labels=['True', 'False'])

Accuracy: 87.64%


array([[395,   3],
       [ 73, 144]], dtype=int64)

In [11]:
# Classify a single unseen headline with the fitted vectorizer and model.
predict_x = pd.DataFrame({"title_name": ["Gagal Jantung hingga Strok adalah Gejala Omicron"]})

# Pass the text column, not the whole DataFrame: iterating a DataFrame yields
# its column labels, so transform(predict_x) would vectorize the literal string
# "title_name" instead of the headline and the prediction would be meaningless.
# The result is stored under its own name so the held-out `tfidf_test` matrix
# is not overwritten and the evaluation cell can still be re-run.
tfidf_predict = tfidf_vectorizer.transform(predict_x["title_name"])

print(tfidf_predict.toarray())  # dense view only for display
print(mnb.predict(tfidf_predict))

[[0. 0. 0. ... 0. 0. 0.]]
['True']
