In [None]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# Read the raw dataset from the CSV file referenced by a relative path.
df = pd.read_csv(filepath_or_buffer='fake_dataset.csv')
# Show the first five rows as the cell output.
df.head(n=5)

In [None]:
# Report, per column, how many rows have a missing value.
# The printed name is the actual column being inspected, so the message
# can never drift from the column it describes.
for column in ('date', 'name', 'tweet', 'Label'):
    print(f"No of missing {column}\t:", df[column].isna().sum())

In [None]:
# Replace every missing value in the frame with an empty string.
df = df.fillna(value='')

In [None]:
# Join the date, name and tweet columns into one space-separated text column.
date_col, name_col, tweet_col = df['date'], df['name'], df['tweet']
df['date_name_tweet'] = date_col + ' ' + name_col + ' ' + tweet_col
# Preview the frame with the new column.
df.head(n=5)

In [None]:
# Keep only the rows whose Label is not the empty string.
# .copy() makes the result an independent frame, so the in-place column
# writes performed in later cells do not raise SettingWithCopyWarning.
df = df[df['Label'] != ''].copy()
# List the distinct label values that remain.
print(df['Label'].unique())

In [None]:
# Map the 'fake' / 'Fake' spellings onto the canonical 'FAKE' label.
# The column is named 'Label' (capital L) in every other cell; the previous
# lowercase 'label' key does not match that column.
df.loc[df['Label'].isin(['fake', 'Fake']), 'Label'] = 'FAKE'

In [None]:
# Count rows per class by summing a boolean mask over the 'Label' column.
# This avoids the positional `.count()[0]` lookup on a label-indexed Series
# and uses the same column name as the rest of the notebook.
no_of_fakes = (df['Label'] == 'FAKE').sum()
no_of_trues = (df['Label'] == 'TRUE').sum()
print(no_of_fakes)
print(no_of_trues)

In [None]:
def clean(text):
    """Normalise one document before it is vectorised.

    NOTE(review): no `clean` helper is defined in the visible cells, so this
    is a deliberately conservative stand-in -- confirm against the intended
    preprocessing.

    Steps: lower-case the text, replace every character that is neither a
    word character nor whitespace with a space, then collapse runs of
    whitespace and strip the ends.

    Parameters
    ----------
    text : object
        The raw document; it is converted with ``str()`` first.

    Returns
    -------
    str
        The normalised text.
    """
    lowered = str(text).lower()
    without_punctuation = re.sub(r'[^\w\s]', ' ', lowered)
    return re.sub(r'\s+', ' ', without_punctuation).strip()


# Clean the combined text column built earlier ('date_name_tweet'); the
# previously referenced 'title_text_source' column is never created.
df['date_name_tweet'] = df['date_name_tweet'].apply(clean)

In [None]:
# Bag-of-words features: fit the vocabulary on the combined text column and
# convert the resulting sparse count matrix to a dense array.
vectorizer = CountVectorizer()
corpus = df['date_name_tweet'].values
X = vectorizer.fit_transform(corpus).toarray()

In [None]:
# Target vector: the label of every row, as an array.
label_column = df['Label']
y = label_column.values

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the shuffled
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
    shuffle=True,
)

In [None]:
# Train a multinomial Naive Bayes classifier on the training split.
clf = MultinomialNB()
clf.fit(X=X_train, y=y_train)

In [None]:
# Print the classifier's score on the training split, then on the test split.
for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(clf.score(features, targets))

In [None]:
# Predicted labels for the held-out test split.
predictions = clf.predict(X=X_test)

In [None]:
# Confusion matrix of true vs. predicted labels on the test split.
# Pinning `labels` to the classifier's classes fixes the row/column order and
# guarantees every trained class has a row and column, even if one of them
# happens to be absent from this particular test split.
cm = confusion_matrix(y_test, predictions, labels=clf.classes_)

In [None]:
# Draw the confusion matrix as an annotated heatmap.
# Tick labels come from the fitted classifier rather than a hard-coded list,
# so the axes are labelled with the classes the model was actually trained on.
class_names = list(clf.classes_)
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=class_names,
    yticklabels=class_names,
    cmap=plt.cm.Blues,
    cbar=False,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()