 **Step 1: Importing NLTK And Reading Fake And True News (.csv) Files Using Pandas**

In [None]:
import nltk
import pandas as pd
# Load the two source files; each CSV holds the articles of one class.
fake = pd.read_csv('Fake-210604-161841.csv')
true = pd.read_csv('True-210604-161650.csv')

#Checking the imported data
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
fake.info()
print('\n')
true.info()
print('\n')
print(fake.head())
print('\n')
print(true.head())
print('\n')
# DataFrame.value_counts() tallies identical rows, so any count above 1 points
# at articles that are duplicated in full.
print(fake.value_counts())
print('\n')
print(true.value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB
None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB
None


                                               title  ...               date
0   Donald Trump Sends Out Embarrassing New Year’...  ...  December 31, 2017
1   Drunk Bragging Trump Staffer Started Russian ...  ...  December 31, 2017
2   Sheriff

**Step 2: Adding Target Columns With Values 0 And 1 To The Fake And True News Respectively**

In [None]:
# Label every row with its class: 0 marks fake news, 1 marks true news.
fake['target'], true['target'] = 0, 1

#Checking the added columns
# A three-newline separator reproduces the blank lines between the two previews.
print(fake.head(), true.head(), sep='\n\n\n')

                                               title  ... target
0   Donald Trump Sends Out Embarrassing New Year’...  ...      0
1   Drunk Bragging Trump Staffer Started Russian ...  ...      0
2   Sheriff David Clarke Becomes An Internet Joke...  ...      0
3   Trump Is So Obsessed He Even Has Obama’s Name...  ...      0
4   Pope Francis Just Called Out Donald Trump Dur...  ...      0

[5 rows x 5 columns]


                                               title  ... target
0  As U.S. budget fight looms, Republicans flip t...  ...      1
1  U.S. military to accept transgender recruits o...  ...      1
2  Senior U.S. Republican senator: 'Let Mr. Muell...  ...      1
3  FBI Russia probe helped by Australian diplomat...  ...      1
4  Trump wants Postal Service to charge 'much mor...  ...      1

[5 rows x 5 columns]


**Step 3: Concatenating The Two Datasets, Fixing Indices And Dropping Unnecessary Columns**

In [None]:
# Stack the true rows above the fake rows, renumber the index from 0, and keep
# only the article body and its label.
news = (
    pd.concat([true, fake], axis=0)
    .reset_index(drop=True)
    .drop(columns=['subject', 'date', 'title'])
)

#Checking the columns of the new dataset
print(news.columns)

Index(['text', 'target'], dtype='object')


**Step 4: Tokenization**

In [None]:
# Fetch NLTK's 'punkt' package before tokenizing.
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# Replace every article string in news['text'] with the token list returned by
# word_tokenize. The raw text is overwritten, so this cell is meant to run once
# per fresh load of the data.
news['text']= news['text'].apply(word_tokenize)

# Preview the first rows to confirm that 'text' now holds token lists.
print(news.head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
                                                text  target
0  [WASHINGTON, (, Reuters, ), -, The, head, of, ...       1
1  [WASHINGTON, (, Reuters, ), -, Transgender, pe...       1
2  [WASHINGTON, (, Reuters, ), -, The, special, c...       1
3  [WASHINGTON, (, Reuters, ), -, Trump, campaign...       1
4  [SEATTLE/WASHINGTON, (, Reuters, ), -, Preside...       1


**Step 5: Stemming**

In [None]:
from nltk.stem.snowball import SnowballStemmer

# NOTE: despite its name, `porter` holds NLTK's English Snowball stemmer.
porter = SnowballStemmer('english', ignore_stopwords=False)


def stemmer(text):
  """Return a new list holding the stem of every token in `text`, in order."""
  return list(map(porter.stem, text))


# Stem each article's token list, overwriting the column in place.
news['text'] = news['text'].apply(stemmer)

#Checking the stemmed data
print(news.head())

                                                text  target
0  [washington, (, reuter, ), -, the, head, of, a...       1
1  [washington, (, reuter, ), -, transgend, peopl...       1
2  [washington, (, reuter, ), -, the, special, co...       1
3  [washington, (, reuter, ), -, trump, campaign,...       1
4  [seattle/washington, (, reuter, ), -, presid, ...       1


**Step 6: Stopwords Removal (Approximated By Dropping Tokens Shorter Than 3 Characters)**

In [None]:
def stopper(text, min_length=3):
  """Drop very short tokens from a tokenized document.

  This is a length filter, not a lookup against a stopword list: punctuation
  and one- or two-character words are removed, while longer common words
  (for example "the") are kept.

  Args:
    text: iterable of string tokens.
    min_length: smallest token length to keep. The default of 3 keeps exactly
      the tokens for which len(word) > 2.

  Returns:
    A new list with the surviving tokens in their original order.
  """
  return [word for word in text if len(word) >= min_length]
# Run stopper over every article's token list, overwriting the column in place.
news['text'] = news['text'].apply(stopper)

# Preview the first rows after the short-token filter has been applied.
print(news.head())

                                                text  target
0  [washington, reuter, the, head, conserv, repub...       1
1  [washington, reuter, transgend, peopl, will, a...       1
2  [washington, reuter, the, special, counsel, in...       1
3  [washington, reuter, trump, campaign, advis, g...       1
4  [seattle/washington, reuter, presid, donald, t...       1


**Step 7: Splitting The Data Into Training And Testing Data**

In [None]:
# Re-join each token list into one space-separated string.
# NOTE: this overwrites the token lists, so the cell must run exactly once per
# fresh load; a second run would join the individual characters instead.
news['text'] = news['text'].apply(' '.join)

from sklearn.model_selection import train_test_split
# Hold out 25% of the articles for testing. random_state pins the shuffle so
# the split, and every score computed from it, is the same on each run.
x_train, x_test, y_train, y_test = train_test_split(
    news['text'], news['target'], test_size=0.25, random_state=42)

#Checking the training and testing datasets
print(x_train.head())
print('\n')
print(y_train.head())
print('\n')
print(x_test.head())
print('\n')
print(y_test.head())

40724    skip sharpton march washington saturday the pr...
15193    danang vietnam reuter u.s. presid donald trump...
10320    washington reuter presid barack obama nomin ve...
27884    muslim high school student rancho cucamonga ca...
35111    socialist indoctrin start kindergarten and rei...
Name: text, dtype: object


40724    0
15193    1
10320    1
27884    0
35111    0
Name: target, dtype: int64


3736     washington reuter milwauke counti sheriff davi...
21017    xiamen china reuter bric countri should deepen...
7574     washington reuter democrat presidenti candid h...
18688    erbil iraq reuter ban intern flight iraqi kurd...
40561    singer/songwrit joy villa decid follow the pol...
Name: text, dtype: object


3736     1
21017    1
7574     1
18688    1
40561    0
Name: target, dtype: int64


**Step 8: Vectorization**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# max_df=0.7 is passed to the vectorizer; per the scikit-learn documentation a
# float max_df ignores terms found in more than that share of the documents.
my_tfidf = TfidfVectorizer(max_df=0.7)
# Fit on the training texts only, then reuse the fitted vectorizer on the test
# texts so that both matrices come from the same fitted object.
tfidf_train = my_tfidf.fit_transform(x_train)
tfidf_test = my_tfidf.transform(x_test)

# Print the training matrix; the recorded output lists (row, column) pairs
# followed by their weights.
print(tfidf_train)

  (0, 78302)	0.02639949354700941
  (0, 29452)	0.029007530934430346
  (0, 11656)	0.08662189883644751
  (0, 7820)	0.0288676447490267
  (0, 982)	0.04327082444480126
  (0, 687)	0.039535893810467945
  (0, 11024)	0.023656715988710345
  (0, 67752)	0.050527426453758115
  (0, 81577)	0.011678492645503526
  (0, 1504)	0.03652324513688796
  (0, 17306)	0.024517429770502946
  (0, 88100)	0.03191363882299133
  (0, 72616)	0.014668565709089317
  (0, 82443)	0.045505826084730125
  (0, 49584)	0.02945929094361015
  (0, 50594)	0.031978251022845856
  (0, 7400)	0.013415757446486017
  (0, 60358)	0.022564007624694207
  (0, 87972)	0.05357289520367029
  (0, 83140)	0.0208126312382209
  (0, 77031)	0.025208114084811377
  (0, 50393)	0.04356627414628189
  (0, 77003)	0.08662189883644751
  (0, 74993)	0.01976373829343635
  (0, 32729)	0.0428502243569742
  :	:
  (33672, 69480)	0.05471277442949666
  (33672, 44772)	0.09246584296610334
  (33672, 25911)	0.04138668987133746
  (33672, 33483)	0.03489224397446229
  (33672, 61466)	0.

**Step 9: Logistic Regression (Algorithm 1)**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train on the TF-IDF training matrix with the iteration cap set to 900.
model_1 = LogisticRegression(max_iter=900)
model_1.fit(tfidf_train, y_train)

# Score the held-out articles and report the accuracy as a percentage.
pred_1 = model_1.predict(tfidf_test)
ac1 = accuracy_score(y_test, pred_1)
print("The accuracy of the predictions by Logistic Regression is: ", ac1*100)

The accuracy of the predictions by Logistic Regression is:  98.913140311804


**Step 10: Passive Aggressive Classifier (Algorithm 2)**

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier

# The classifier shuffles the training data between epochs; random_state pins
# that shuffle so the reported accuracy is the same on every run.
model_2 = PassiveAggressiveClassifier(max_iter=50, random_state=42)
model_2.fit(tfidf_train, y_train)

# Score the held-out articles and report the accuracy as a percentage.
pred_2 = model_2.predict(tfidf_test)
ac2 = accuracy_score(y_test, pred_2)
print("The accuracy of the predictions by the Passive Aggressive Classifier is: ", ac2*100)

The accuracy of the predictions by the Passive Aggressive Classifier is:  99.70601336302896
