# 1. Lead the data

In [82]:
import pandas as pd

news = pd.read_csv('./fake-news/train.csv')
news.shape

(20800, 5)

In [83]:
news.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [84]:
columns = ['title', 'label']
news = news[columns]
news.shape

(20800, 2)

In [85]:
news.isnull().sum()

title    558
label      0
dtype: int64

In [86]:
news = news.dropna()
news.shape

(20242, 2)

# 2. Data cleaning and preprocessing

In [87]:
from sklearn.model_selection import train_test_split

In [88]:
# converting headlines to lower case
news.replace("[^a-zA-Z]", " ", regex=True, inplace=True)
news['title'] = news['title'].str.lower()
news.head(1)

Unnamed: 0,title,label
0,house dem aide we didn t even see comey s let...,1


In [169]:
X = news.drop('label', axis=1)
y = news['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [170]:
headlines=[]
for index, row in X_train.iterrows():
    headlines.append(row['title'])
len(headlines)

14169

In [171]:
headlines[0]

're  vladimir putin'

# 3. Creating the Bag Of Words using TF-IDF

In [172]:
from sklearn.feature_extraction.text import CountVectorizer

In [173]:
countVectorizer = CountVectorizer(ngram_range=(1,3))
train_data=countVectorizer.fit_transform(headlines)

In [174]:
train_data.shape

(14169, 222116)

# 4. Implement and Evaluate Model

## MultinomialNB Algorithm

In [175]:
from sklearn.naive_bayes import MultinomialNB
clf=MultinomialNB()

In [176]:
clf.fit(train_data, y_train)

MultinomialNB()

In [177]:
test_transform = []
for index, row in X_test.iterrows():
    test_transform.append(row['title'])
test_data = countVectorizer.transform(test_transform)
predictions = clf.predict(test_data)

In [178]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [179]:
score=accuracy_score(y_test, predictions)
print(score)
confusion_mx=confusion_matrix(y_test, predictions)
print(confusion_mx)
report=classification_report(y_test, predictions)
print(report)

0.9035073275152313
[[2996  149]
 [ 437 2491]]
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      3145
           1       0.94      0.85      0.89      2928

    accuracy                           0.90      6073
   macro avg       0.91      0.90      0.90      6073
weighted avg       0.91      0.90      0.90      6073



## Passive Aggressive Classifier Algorithm

In [199]:
from sklearn.linear_model import PassiveAggressiveClassifier
linear_clf = PassiveAggressiveClassifier(max_iter=50)

In [200]:
linear_clf.fit(train_data, y_train)

PassiveAggressiveClassifier(max_iter=50)

In [201]:
predictions_linear_clf = linear_clf.predict(test_data)

In [202]:
score=accuracy_score(y_test, predictions_linear_clf)
print(score)
confusion_mx=confusion_matrix(y_test, predictions_linear_clf)
print(confusion_mx)
report=classification_report(y_test, predictions_linear_clf)
print(report)

0.9445084801580768
[[2869  276]
 [  61 2867]]
              precision    recall  f1-score   support

           0       0.98      0.91      0.94      3145
           1       0.91      0.98      0.94      2928

    accuracy                           0.94      6073
   macro avg       0.95      0.95      0.94      6073
weighted avg       0.95      0.94      0.94      6073

