# Importing the data


In [2]:
import pandas as pd
df=pd.read_csv('fake_or_real_news.csv')

Inspection of data

In [3]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
Unnamed: 0    6335 non-null int64
title         6335 non-null object
text          6335 non-null object
label         6335 non-null object
dtypes: int64(1), object(3)
memory usage: 198.0+ KB
None


In [4]:
print(df.head())

   Unnamed: 0                                              title  \
0        8476                       You Can Smell Hillary’s Fear   
1       10294  Watch The Exact Moment Paul Ryan Committed Pol...   
2        3608        Kerry to go to Paris in gesture of sympathy   
3       10142  Bernie supporters on Twitter erupt in anger ag...   
4         875   The Battle of New York: Why This Primary Matters   

                                                text label  
0  Daniel Greenfield, a Shillman Journalism Fello...  FAKE  
1  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE  
2  U.S. Secretary of State John F. Kerry said Mon...  REAL  
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE  
4  It's primary day in New York and front-runners...  REAL  


# Training CountVectorizer 

In [5]:
# Import the necessary modules
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [6]:
y=df['label']
X_train,X_test,y_train,y_test=train_test_split(df['text'],y,random_state=53,test_size=0.3)

In [7]:
count_vectorizer=CountVectorizer(stop_words='english')

In [9]:
count_train=count_vectorizer.fit_transform(X_train)

In [10]:
count_test=count_vectorizer.transform(X_test)

In [11]:
# Print the first 10 features of the count_vectorizer
print(count_vectorizer.get_feature_names()[:10])

['00', '000', '0000', '00000031', '000035', '00006', '0001', '0001pt', '000billion', '000ft']


# Training TfidfVectorizer

In [12]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train=tfidf_vectorizer.fit_transform(X_train)
tfidf_test=tfidf_vectorizer.transform(X_test)

In [14]:
# Print the first 10 features
print(tfidf_vectorizer.get_feature_names()[:10])
# Print the first 5 vectors of the tfidf training data
print(tfidf_train.A[:5])

['00', '000', '0000', '00000031', '000035', '00006', '0001', '0001pt', '000billion', '000ft']
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.04169599 0.         ... 0.         0.         0.        ]
 [0.         0.03144782 0.         ... 0.         0.         0.        ]
 [0.         0.01437699 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


# Inspect the vectors

In [15]:
# Create the CountVectorizer DataFrame: count_df
count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())

# Create the TfidfVectorizer DataFrame: tfidf_df
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())

# Print the head of count_df
print(count_df.head())

# Print the head of tfidf_df
print(tfidf_df.head())

# Calculating the difference in columns: difference
difference = set(count_df.columns) - set(tfidf_df.columns)
print(difference)

# Checking whether the DataFrames are equal
print(count_df.equals(tfidf_df))

   00  000  0000  00000031  000035  00006  0001  0001pt  000billion  000ft  \
0   0    0     0         0       0      0     0       0           0      0   
1   0    3     0         0       0      0     0       0           0      0   
2   0    1     0         0       0      0     0       0           0      0   
3   0    1     0         0       0      0     0       0           0      0   
4   0    0     0         0       0      0     0       0           0      0   

   ...  حلب  عربي  عن  لم  ما  محاولات  من  هذا  والمرضى  ยงade  
0  ...    0     0   0   0   0        0   0    0        0      0  
1  ...    0     0   0   0   0        0   0    0        0      0  
2  ...    0     0   0   0   0        0   0    0        0      0  
3  ...    0     0   0   0   0        0   0    0        0      0  
4  ...    0     0   0   0   0        0   0    0        0      0  

[5 rows x 57870 columns]
    00       000  0000  00000031  000035  00006  0001  0001pt  000billion  \
0  0.0  0.000000   0.0       0.0

# Testing with CountVectorizer

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics


nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)

# Calculate the accuracy score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
print(cm)

0.9021567596002105
[[794 119]
 [ 67 921]]


# Testing with TfidfVectorizer

In [17]:
# Create a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(tfidf_train, y_train)

# Create the predicted tags
pred = nb_classifier.predict(tfidf_test)

# Calculate the accuracy score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
print(cm)

0.8632298790110469
[[677 236]
 [ 24 964]]


# Fine-Tuning Tfidf Method

In [19]:
import numpy as np
# Create the list of alphas: alphas
alphas = np.arange(0, 1, .1)

# Define train_and_predict()
def train_and_predict(alpha):
    nb_classifier = MultinomialNB(alpha=alpha)
    nb_classifier.fit(tfidf_train, y_train)
    pred = nb_classifier.predict(tfidf_test)
    score = metrics.accuracy_score(y_test, pred)
    return score

# Iterate over the alphas and print the corresponding score
for alpha in alphas:
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha))
    print()

Alpha:  0.0
Score:  0.8858495528669121

Alpha:  0.1
Score:  0.9042609153077328

Alpha:  0.2
Score:  0.9011046817464492

Alpha:  0.30000000000000004
Score:  0.8953182535507628

Alpha:  0.4
Score:  0.8921620199894792

Alpha:  0.5
Score:  0.8884797475013151

Alpha:  0.6000000000000001


  'setting alpha = %.1e' % _ALPHA_MIN)


Score:  0.8826933193056287

Alpha:  0.7000000000000001
Score:  0.875854813256181

Alpha:  0.8
Score:  0.8695423461336139

Alpha:  0.9
Score:  0.8679642293529721

