# 1. Lead the data

In [31]:
import pandas as pd

news = pd.read_csv('./fake-news/train.csv')
news.shape

(20800, 5)

In [32]:
news.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [33]:
columns = ['title', 'label']
news = news[columns]
news.shape

(20800, 2)

In [34]:
news.isnull().sum()

title    558
label      0
dtype: int64

In [35]:
news = news.dropna()
news.shape

(20242, 2)

# 2. Data cleaning and preprocessing

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
# converting headlines to lower case
news.replace("[^a-zA-Z]", " ", regex=True, inplace=True)
news['title'] = news['title'].str.lower()
news.head(1)

Unnamed: 0,title,label
0,house dem aide we didn t even see comey s let...,1


In [38]:
X = news.drop('label', axis=1)
y = news['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [39]:
headlines=[]
for index, row in X_train.iterrows():
    headlines.append(row['title'])
len(headlines)

16193

In [40]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
ps = PorterStemmer()
corpus = []
for headline in headlines:
    review = re.sub('[^a-zA-Z]', ' ', headline)
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [41]:
corpus[0]

'trump ahead zero nation poll media continu close race narr'

# 3. Creating the Bag Of Words using CountVectorizer

In [42]:
from sklearn.feature_extraction.text import CountVectorizer

In [43]:
countVectorizer = CountVectorizer(ngram_range=(1,3))
train_data=countVectorizer.fit_transform(corpus)

In [44]:
train_data.shape

(16193, 188488)

# 4. Implement and Evaluate Model

## MultinomialNB Algorithm

In [45]:
from sklearn.naive_bayes import MultinomialNB
clf=MultinomialNB()

In [46]:
clf.fit(train_data, y_train)

MultinomialNB()

In [56]:
headlines_test = []
for index, row in X_test.iterrows():
    headlines_test.append(row['title'])

In [57]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
ps = PorterStemmer()
test_transform = []
for headline_test in headlines_test:
    review = re.sub('[^a-zA-Z]', ' ', headline_test)
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    test_transform.append(review)

In [58]:
test_data = countVectorizer.transform(test_transform)
predictions = clf.predict(test_data)

In [59]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [60]:
score=accuracy_score(y_test, predictions)
print(score)
confusion_mx=confusion_matrix(y_test, predictions)
print(confusion_mx)
report=classification_report(y_test, predictions)
print(report)

0.8938009385033342
[[2019   94]
 [ 336 1600]]
              precision    recall  f1-score   support

           0       0.86      0.96      0.90      2113
           1       0.94      0.83      0.88      1936

    accuracy                           0.89      4049
   macro avg       0.90      0.89      0.89      4049
weighted avg       0.90      0.89      0.89      4049



## Passive Aggressive Classifier Algorithm

In [61]:
from sklearn.linear_model import PassiveAggressiveClassifier
linear_clf = PassiveAggressiveClassifier(max_iter=50)

In [62]:
linear_clf.fit(train_data, y_train)

PassiveAggressiveClassifier(max_iter=50)

In [63]:
predictions_linear_clf = linear_clf.predict(test_data)

In [64]:
score=accuracy_score(y_test, predictions_linear_clf)
print(score)
confusion_mx=confusion_matrix(y_test, predictions_linear_clf)
print(confusion_mx)
report=classification_report(y_test, predictions_linear_clf)
print(report)

0.9394912324030625
[[1907  206]
 [  39 1897]]
              precision    recall  f1-score   support

           0       0.98      0.90      0.94      2113
           1       0.90      0.98      0.94      1936

    accuracy                           0.94      4049
   macro avg       0.94      0.94      0.94      4049
weighted avg       0.94      0.94      0.94      4049



## Multinomial Classifier with Hyperparameter

In [65]:
classifier=MultinomialNB(alpha=0.1)

In [66]:
import numpy as np

previous_score=0
for alpha in np.arange(0,1,0.1):
    sub_classifier=MultinomialNB(alpha=alpha)
    sub_classifier.fit(train_data,y_train)
    y_pred=sub_classifier.predict(test_data)
    score = accuracy_score(y_test, y_pred)
    if score>previous_score:
        classifier=sub_classifier
    print("Alpha: {}, Score : {}".format(alpha,score))



Alpha: 0.0, Score : 0.8639170165472956
Alpha: 0.1, Score : 0.8930600148184737
Alpha: 0.2, Score : 0.8970116078043961
Alpha: 0.30000000000000004, Score : 0.8970116078043961
Alpha: 0.4, Score : 0.8962706841195357
Alpha: 0.5, Score : 0.8965176586811558
Alpha: 0.6000000000000001, Score : 0.896764633242776
Alpha: 0.7000000000000001, Score : 0.8962706841195357
Alpha: 0.8, Score : 0.895035811311435
Alpha: 0.9, Score : 0.8942948876265745
