In [1]:
#Source codes: https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

In [2]:
from google.colab import drive
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection
import re

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
drive.mount('/content/drive')
trueNews = pd.read_csv('/content/drive/MyDrive/cs4210_ml_project/fakeRealNews_data/True.csv')
fakeNews = pd.read_csv('/content/drive/MyDrive/cs4210_ml_project/fakeRealNews_data/Fake.csv')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Mounted at /content/drive


In [4]:
trueNews.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [5]:
fakeNews.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [6]:
# Create Label Column (As a List)
lst = []

# Add '1' for All TRUE
for x in range(trueNews.shape[0]):
    lst.append(1)
# Add '0' for All FAKE
for x in range(fakeNews.shape[0]):
    lst.append(0)

# Creating Concatenated Dataframe with TRUE, FAKE Data
data = pd.DataFrame()
data = pd.concat([trueNews , fakeNews])

# Add Label Column in Concatenated DataFrame
data['Label'] = lst
data.head()

Unnamed: 0,title,text,subject,date,Label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [7]:
# Dropping all columns except for Title, Text, & Label
data = data.drop(['subject', 'date'], 1)
data.head()

Unnamed: 0,title,text,Label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,1


In [8]:
# Combining the Title & Text Column
data['News'] = data[['title', 'text']].apply(lambda x: ', '.join(x[x.notnull()]), axis = 1)
data = data.drop(['title', 'text'], 1)
data.head()

Unnamed: 0,Label,News
0,1,"As U.S. budget fight looms, Republicans flip t..."
1,1,U.S. military to accept transgender recruits o...
2,1,Senior U.S. Republican senator: 'Let Mr. Muell...
3,1,FBI Russia probe helped by Australian diplomat...
4,1,Trump wants Postal Service to charge 'much mor...


In [9]:
# Rearranging columns
columns = data.columns.tolist()
columns = columns[-1:] + columns[:-1]
data = data[columns]
data['News'] = data['News'].str.lower()
data.head()

Unnamed: 0,News,Label
0,"as u.s. budget fight looms, republicans flip t...",1
1,u.s. military to accept transgender recruits o...,1
2,senior u.s. republican senator: 'let mr. muell...,1
3,fbi russia probe helped by australian diplomat...,1
4,trump wants postal service to charge 'much mor...,1


In [10]:
# Get English stop words
stop_words = set(stopwords.words('english'))

# Get punctuations
punctuation = list(string.punctuation)

# Stemmer
ps = PorterStemmer()

# Add extra ones that are not included in the provided one
punctuation.append('“')
punctuation.append('”')
punctuation.append('’')
punctuation.append('‘')
punctuation.append('...')

print(punctuation)

# Get the "News" from the data
initial_text_news = data["News"]
cleaned_text_news = []

# Remove punctuations, special chars, and stop words
# Stemming
for sample in initial_text_news:
  sample = re.sub('\[[^]]*\]', '', sample)
  sample = re.sub("[^a-zA-Z]"," ",sample)
  tokens = nltk.word_tokenize(sample)
  # text_no_stop_words_punct = [ps.stem(t) for t in tokens if t not in stop_words and t not in punctuation and t.isalpha()]
  text_no_stop_words_punct = [ps.stem(t) for t in tokens if t not in stop_words and t not in punctuation]
  # turn a list of token into a list of string for tf-idf vectorizer
  text_no_stop_words_punct = " ".join(text_no_stop_words_punct)
  cleaned_text_news.append(text_no_stop_words_punct)

data['News'] = cleaned_text_news

# Export the cleaned data as a csv file so that we do not have to do cleaning everytime we run the code
data.to_csv('/content/drive/MyDrive/cs4210_ml_project/fakeRealNews_data/cleaned_data.csv', index=False)

data.head()


['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '“', '”', '’', '‘', '...']


Unnamed: 0,News,Label
0,u budget fight loom republican flip fiscal scr...,1
1,u militari accept transgend recruit monday pen...,1
2,senior u republican senat let mr mueller job w...,1
3,fbi russia probe help australian diplomat tip ...,1
4,trump want postal servic charg much amazon shi...,1


In [11]:
for index, row in data.iterrows():
  if index < 5:
    print(row['News'])


u budget fight loom republican flip fiscal script washington reuter head conserv republican faction u congress vote month huge expans nation debt pay tax cut call fiscal conserv sunday urg budget restraint keep sharp pivot way among republican u repres mark meadow speak cb face nation drew hard line feder spend lawmak brace battl januari return holiday wednesday lawmak begin tri pass feder budget fight like link issu immigr polici even novemb congression elect campaign approach republican seek keep control congress presid donald trump republican want big budget increas militari spend democrat also want proport increas non defens discretionari spend program support educ scientif research infrastructur public health environment protect trump administr alreadi will say go increas non defens discretionari spend percent meadow chairman small influenti hous freedom caucu said program democrat say enough need give govern pay rais percent fiscal conserv see rational eventu run peopl money said

In [12]:
# Split the dataset into training and testing sets
train_x, test_x, train_y, test_y = model_selection.train_test_split(data["News"], data["Label"], test_size=0.3)
print(train_x)
print(test_x)

23346    snowden laugh cia excus mistakenli destroy sec...
8854     hilari meme bundi oregon standoff imag standof...
10646    republican sandov withdraw possibl suprem cour...
8966     factbox major u suprem court abort right case ...
22973    nader talebzadeh plan plan photo illustr patri...
                               ...                        
11225    mayor sanctuari citi trump tri make us fugit s...
9693     chicago mayor vow fiscal fix muni audienc chic...
15213    never forget colleg student give disturb answe...
4531     watch trump accident admit still believ obama ...
23275    fals alarm psi op lax activ shooter spectacl j...
Name: News, Length: 31428, dtype: object
20914    south korea moon say north korean provoc compl...
4543     trump say new trade order set stage manufactur...
19711    celebr hillari pay big buck perform fundrais h...
17001    dh spend million keep employe paid leav three ...
20726    take villag thug nashvil cop see black man ass...
               

In [13]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(data["News"])

Train_X_Tfidf = Tfidf_vect.transform(train_x)
Test_X_Tfidf = Tfidf_vect.transform(test_x)

print(Tfidf_vect.vocabulary_)
print(Train_X_Tfidf)

{'budget': 588, 'fight': 1684, 'loom': 2622, 'republican': 3705, 'flip': 1729, 'fiscal': 1706, 'script': 3922, 'washington': 4837, 'reuter': 3742, 'head': 2017, 'conserv': 940, 'faction': 1613, 'congress': 930, 'vote': 4804, 'month': 2874, 'huge': 2118, 'expans': 1574, 'nation': 2939, 'debt': 1124, 'pay': 3234, 'tax': 4423, 'cut': 1079, 'call': 623, 'sunday': 4340, 'urg': 4718, 'keep': 2415, 'sharp': 4007, 'pivot': 3317, 'way': 4847, 'among': 165, 'repres': 3701, 'mark': 2711, 'speak': 4175, 'cb': 685, 'face': 1607, 'drew': 1348, 'hard': 1997, 'line': 2589, 'feder': 1661, 'spend': 4188, 'lawmak': 2517, 'battl': 384, 'januari': 2335, 'return': 3740, 'holiday': 2079, 'wednesday': 4859, 'begin': 406, 'tri': 4587, 'pass': 3215, 'like': 2584, 'link': 2590, 'issu': 2319, 'immigr': 2164, 'polici': 3350, 'even': 1535, 'novemb': 3033, 'congression': 931, 'elect': 1413, 'campaign': 631, 'approach': 225, 'seek': 3945, 'control': 973, 'presid': 3414, 'donald': 1320, 'trump': 4606, 'want': 4823, 'b