# 1. Lead the data

In [1]:
import pandas as pd

news = pd.read_csv('./fake-news/train.csv')
news.shape

(20800, 5)

In [2]:
news.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
columns = ['title', 'label']
news = news[columns]
news.shape

(20800, 2)

In [4]:
news.isnull().sum()

title    558
label      0
dtype: int64

In [5]:
news = news.dropna()
news.shape

(20242, 2)

# 2. Data cleaning and preprocessing

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# converting headlines to lower case
news.replace("[^a-zA-Z]", " ", regex=True, inplace=True)
news['title'] = news['title'].str.lower()
news.head(1)

Unnamed: 0,title,label
0,house dem aide we didn t even see comey s let...,1


In [8]:
X = news.drop('label', axis=1)
y = news['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [9]:
headlines=[]
for index, row in X_train.iterrows():
    headlines.append(row['title'])
len(headlines)

14169

In [10]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
ps = PorterStemmer()
corpus = []
for headline in headlines:
    review = re.sub('[^a-zA-Z]', ' ', headline)
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [11]:
corpus[0]

'ufo investig max spier found dead vomit black liquid'

# 3. Creating the TF-IDF model

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
train_data = cv.fit_transform(corpus).toarray()

In [13]:
train_data.shape

(14169, 12558)

# 4. Implement and Evaluate Model

## MultinomialNB Algorithm

In [14]:
from sklearn.naive_bayes import MultinomialNB
clf=MultinomialNB()

In [15]:
clf.fit(train_data, y_train)

MultinomialNB()

In [16]:
headlines_test = []
for index, row in X_test.iterrows():
    headlines_test.append(row['title'])

In [17]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
ps = PorterStemmer()
test_transform = []
for headline_test in headlines_test:
    review = re.sub('[^a-zA-Z]', ' ', headline_test)
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    test_transform.append(review)

In [18]:
test_data = cv.transform(test_transform)
predictions = clf.predict(test_data)

In [19]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [20]:
score=accuracy_score(y_test, predictions)
print(score)
confusion_mx=confusion_matrix(y_test, predictions)
print(confusion_mx)
report=classification_report(y_test, predictions)
print(report)

0.875349909435205
[[2960  148]
 [ 609 2356]]
              precision    recall  f1-score   support

           0       0.83      0.95      0.89      3108
           1       0.94      0.79      0.86      2965

    accuracy                           0.88      6073
   macro avg       0.89      0.87      0.87      6073
weighted avg       0.88      0.88      0.87      6073



## Passive Aggressive Classifier Algorithm

In [21]:
from sklearn.linear_model import PassiveAggressiveClassifier
linear_clf = PassiveAggressiveClassifier(max_iter=50)

In [22]:
linear_clf.fit(train_data, y_train)

PassiveAggressiveClassifier(max_iter=50)

In [23]:
predictions_linear_clf = linear_clf.predict(test_data)

In [24]:
score=accuracy_score(y_test, predictions_linear_clf)
print(score)
confusion_mx=confusion_matrix(y_test, predictions_linear_clf)
print(confusion_mx)
report=classification_report(y_test, predictions_linear_clf)
print(report)

0.929853449695373
[[2854  254]
 [ 172 2793]]
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      3108
           1       0.92      0.94      0.93      2965

    accuracy                           0.93      6073
   macro avg       0.93      0.93      0.93      6073
weighted avg       0.93      0.93      0.93      6073



## Multinomial Classifier with Hyperparameter

In [25]:
classifier=MultinomialNB(alpha=0.1)

In [26]:
import numpy as np

previous_score=0
for alpha in np.arange(0,1,0.1):
    sub_classifier=MultinomialNB(alpha=alpha)
    sub_classifier.fit(train_data,y_train)
    y_pred=sub_classifier.predict(test_data)
    score = accuracy_score(y_test, y_pred)
    if score>previous_score:
        classifier=sub_classifier
    print("Alpha: {}, Score : {}".format(alpha,score))



Alpha: 0.0, Score : 0.8191997365387782
Alpha: 0.1, Score : 0.8605302157088753
Alpha: 0.2, Score : 0.8662934299357813
Alpha: 0.30000000000000004, Score : 0.8692573686810473
Alpha: 0.4, Score : 0.8723859706899391
Alpha: 0.5, Score : 0.8732092870080685
Alpha: 0.6000000000000001, Score : 0.8740326033261979
Alpha: 0.7000000000000001, Score : 0.8750205829079533
Alpha: 0.8, Score : 0.8746912563807014
Alpha: 0.9, Score : 0.8750205829079533
