## Fake News Detection

In [52]:
import numpy as np
import pandas as pd
import pickle

In [2]:
path = "E:\Projects\/fake-news-detection-project"
true_df = pd.read_csv(path + '\True.csv')
fake_df = pd.read_csv(path + '\Fake.csv')
loaded_model = pickle.load(open(path + '\model.pkl', 'rb'))

In [53]:
true_df['label'] = 0
true_df['label'] = true_df['label'].astype(np.uint8)

In [54]:
fake_df['label'] = 1
fake_df['label'] = fake_df['label'].astype(np.uint8)

In [55]:
true_df.head()

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,0
1,WASHINGTON (Reuters) - Transgender people will...,0
2,WASHINGTON (Reuters) - The special counsel inv...,0
3,WASHINGTON (Reuters) - Trump campaign adviser ...,0
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,0


In [56]:
fake_df.head()

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,1
1,House Intelligence Committee Chairman Devin Nu...,1
2,"On Friday, it was revealed that former Milwauk...",1
3,"On Christmas day, Donald Trump announced that ...",1
4,Pope Francis used his annual Christmas Day mes...,1


In [57]:
true_df = true_df[['text','label']]
fake_df = fake_df[['text','label']]

In [58]:
dataset = pd.concat([true_df , fake_df])
dataset.head()

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,0
1,WASHINGTON (Reuters) - Transgender people will...,0
2,WASHINGTON (Reuters) - The special counsel inv...,0
3,WASHINGTON (Reuters) - Trump campaign adviser ...,0
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,0


In [59]:
dataset.info

<bound method DataFrame.info of                                                     text  label
0      WASHINGTON (Reuters) - The head of a conservat...      0
1      WASHINGTON (Reuters) - Transgender people will...      0
2      WASHINGTON (Reuters) - The special counsel inv...      0
3      WASHINGTON (Reuters) - Trump campaign adviser ...      0
4      SEATTLE/WASHINGTON (Reuters) - President Donal...      0
...                                                  ...    ...
23476  21st Century Wire says As 21WIRE reported earl...      1
23477  21st Century Wire says It s a familiar theme. ...      1
23478  Patrick Henningsen  21st Century WireRemember ...      1
23479  21st Century Wire says Al Jazeera America will...      1
23480  21st Century Wire says As 21WIRE predicted in ...      1

[44898 rows x 2 columns]>

In [65]:
dataset.label = dataset.label.astype(np.uint8)
dataset.info

<bound method DataFrame.info of                                                     text  label
0      WASHINGTON (Reuters) - The head of a conservat...      0
1      WASHINGTON (Reuters) - Transgender people will...      0
2      WASHINGTON (Reuters) - The special counsel inv...      0
3      WASHINGTON (Reuters) - Trump campaign adviser ...      0
4      SEATTLE/WASHINGTON (Reuters) - President Donal...      0
...                                                  ...    ...
23476  21st Century Wire says As 21WIRE reported earl...      1
23477  21st Century Wire says It s a familiar theme. ...      1
23478  Patrick Henningsen  21st Century WireRemember ...      1
23479  21st Century Wire says Al Jazeera America will...      1
23480  21st Century Wire says As 21WIRE predicted in ...      1

[44898 rows x 2 columns]>

In [67]:
dataset.shape

(44898, 2)

### Null values

In [66]:
dataset.isnull().sum() # no null values

text     0
label    0
dtype: int64

### Balanced or Unbalanced dataset

In [68]:
dataset['label'].value_counts()

1    23481
0    21417
Name: label, dtype: int64

In [28]:
true_df.shape # true news

(21417, 2)

In [70]:
fake_df.shape # fake news

(23481, 2)

### Shuffle or Resample

In [71]:
dataset = dataset.sample(frac = 1)

In [72]:
dataset.head(20)

Unnamed: 0,text,label
10300,WASHINGTON (Reuters) - The U.S. Republican pre...,0
11818,CAIRO (Reuters) - Islamic State has claimed an...,0
5231,Most Americans understand that we should all g...,1
6002,Bystander footage shows a group of Bakersfield...,1
19736,WASHINGTON (Reuters) - U.S. Defense Secretary ...,0
8386,WASHINGTON (Reuters) - Democratic presidential...,0
14234,WASHINGTON (Reuters) - The U.S. military said ...,0
21398,SHANGHAI (Reuters) - An old review of an acade...,0
10504,President Donald Trump and South Korea s Presi...,1
10872,WASHINGTON (Reuters) - The U.S. power sector’s...,0


In [73]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gnane\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [35]:
lemmatizer = WordNetLemmatizer()

In [36]:
stopwords = stopwords.words('english')

In [37]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gnane\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [38]:
def clean_data(text):
    text = text.lower() 
    row = re.sub('[^a-zA-Z]' , ' ' , text)
    token = row.split() 
    news = [lemmatizer.lemmatize(word) for word in token if not word in stopwords]
    clean_news = ' '.join(news) 
    
    return clean_news 

In [74]:
dataset['text'] = dataset['text'].apply(lambda x : clean_data(x))

In [75]:
dataset.isnull().sum()

text     0
label    0
dtype: int64

In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [108]:
vectorizer = TfidfVectorizer(max_features = 1000 , lowercase=False , ngram_range=(1,2))

In [109]:
X = dataset.iloc[:35000,0]
y = dataset.iloc[:35000,1]

In [110]:
X.head()

10300    washington reuters u republican presidential d...
11818    cairo reuters islamic state claimed attack egy...
5231     american understand give utmost honor respect ...
6002     bystander footage show group bakersfield calif...
19736    washington reuters u defense secretary jim mat...
Name: text, dtype: object

In [111]:
y.head()

10300    0
11818    0
5231     1
6002     1
19736    0
Name: label, dtype: uint8

In [112]:
from sklearn.model_selection import train_test_split
train_X , test_X , train_y , test_y = train_test_split(X , y , test_size = 0.2 ,random_state = 0)

In [113]:
vec_train = vectorizer.fit_transform(train_X)

In [114]:
vec_test = vectorizer.transform(test_X)

In [115]:
train_data = pd.DataFrame(data = vec_train.toarray() , columns=vectorizer.get_feature_names())

In [116]:
test_data = pd.DataFrame(data = vec_test.toarray() , columns= vectorizer.get_feature_names())

## Multinomial NB

In [117]:
from sklearn.naive_bayes import MultinomialNB

In [118]:
from sklearn.metrics import accuracy_score,classification_report

In [119]:
clf = MultinomialNB()

In [120]:
clf.fit(train_data, train_y)
predictions  = clf.predict(test_data)

In [121]:
print(classification_report(test_y , predictions))

              precision    recall  f1-score   support

           0       0.93      0.94      0.93      3335
           1       0.95      0.93      0.94      3665

    accuracy                           0.94      7000
   macro avg       0.94      0.94      0.94      7000
weighted avg       0.94      0.94      0.94      7000



Now predict on both train set

In [122]:
predictions_train = clf.predict(train_data)
pickle.dump(clf, open('model.pkl','wb'))
print(classification_report(train_y , predictions_train))

              precision    recall  f1-score   support

           0       0.93      0.94      0.94     13321
           1       0.95      0.94      0.94     14679

    accuracy                           0.94     28000
   macro avg       0.94      0.94      0.94     28000
weighted avg       0.94      0.94      0.94     28000



In [123]:
accuracy_score(train_y , predictions_train)

0.9381071428571428

In [124]:
accuracy_score(test_y , predictions)

0.9374285714285714