In [None]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer, word_tokenize, TweetTokenizer
from nltk import FreqDist
import re
import string
from nltk.corpus import stopwords

In [None]:
# Load the dataset to pre-process (presumably the output of an earlier cleaning step).
# NOTE(review): the frame comes back with two leftover index columns
# ('Unnamed: 0.1' and 'Unnamed: 0'), presumably from earlier to_csv calls that
# wrote the index — confirm whether they are needed downstream.
df = pd.read_csv('df_clean.csv')
df

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label,caps_in_title
0,0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,2017-12-31,fake,11
1,1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,2017-12-31,fake,8
2,2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,2017-12-30,fake,15
3,3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,2017-12-29,fake,19
4,4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,2017-12-25,fake,11
...,...,...,...,...,...,...,...
38633,38639,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,2017-08-22,true,8
38634,38640,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,2017-08-22,true,3
38635,38641,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,2017-08-22,true,1
38636,38642,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,2017-08-22,true,4


#### Clean Title and Text Data

In [None]:
def concat_lists_of_strings(df, column):
    """
    Flatten a dataframe column whose cells are lists of strings.

    Parameters:
        df: dataframe holding the column.
        column: label of the column to flatten.

    Returns:
        One flat list containing every element of every cell, in row order.
    """
    flattened = []
    for row_items in df[column].values:
        flattened.extend(row_items)
    return flattened

In [None]:
def find_strings(string_, regex):
    """
    Return every non-overlapping match of ``regex`` found in ``string_``.

    Parameters:
        string_: the text to search.
        regex: a regular expression (pattern string or compiled pattern).

    Returns:
        A list of strings, one per match:
        - pattern without groups: the whole match;
        - pattern with exactly one group: that group's text;
        - pattern with several groups: the text of the first group
          (e.g. the full URL for URL_REGEX, whose first group wraps it).
    """
    matches = re.findall(regex, string_)
    # re.findall only yields tuples when the pattern has two or more groups;
    # otherwise it yields plain strings, and indexing those with [0] would
    # return just their first character (or fail on an empty match).
    return [m[0] if isinstance(m, tuple) else m for m in matches]

In [None]:
def freq_dist_of_col(df, col):
    """
    Build a token frequency distribution for a column of token lists.

    The column is flattened into a single token list, counted with FreqDist,
    and the number of distinct tokens is printed as a side effect.

    Parameters:
        df: dataframe holding the column.
        col: label of a column whose cells are lists of tokens.

    Returns:
        The FreqDist built from all tokens in the column.
    """
    corpus_freq_dist = FreqDist(concat_lists_of_strings(df, col))
    print(f'The number of unique tokens in the corpus is {len(corpus_freq_dist)}')
    return corpus_freq_dist

In [None]:
def review_freq_dis(df, col, n):
    """
    Show the n most frequent tokens of a column of token lists.

    Builds the frequency distribution with freq_dist_of_col (which also prints
    the number of unique tokens) and displays the top-n (token, count) pairs.
    Nothing is returned.
    """
    top_tokens = freq_dist_of_col(df, col).most_common(n)
    display(top_tokens)

In [None]:
def remove_punctuation(word_list, punctuation_list):
    """
    Filter punctuation tokens out of a token list.

    Parameters:
        word_list: iterable of tokens.
        punctuation_list: container of tokens to treat as punctuation.

    Returns:
        A new list with the tokens of word_list that are not in
        punctuation_list, in their original order.
    """
    kept = []
    for token in word_list:
        if token in punctuation_list:
            continue
        kept.append(token)
    return kept

In [None]:
def remove_single_characters(word_list, exception_list):
    """
    Drop tokens of length one or less unless they are explicitly allowed.

    Parameters:
        word_list: iterable of tokens.
        exception_list: container of short tokens that must be kept.

    Returns:
        A new list with every token longer than one character plus any
        token found in exception_list, in the original order.
    """
    def _keep(token):
        if len(token) > 1:
            return True
        return token in exception_list

    return list(filter(_keep, word_list))

In [None]:
def remove_words(word_list, words_to_remove):
    """
    Filter unwanted words out of a token list.

    Parameters:
        word_list: iterable of tokens.
        words_to_remove: container of tokens to discard (exact match).

    Returns:
        A new list with the remaining tokens in their original order.
    """
    return list(filter(lambda token: token not in words_to_remove, word_list))

#### Rough look at token frequency distribution

In [None]:

# Rough tokenizer for a first look at the raw text. A token is, in order of preference:
#   \w+              a run of word characters,
#   \$[\d\.]+        a dollar sign followed by digits/dots (e.g. "$1.5"),
#   \([@\w\d]+\)     a parenthesised run of word characters or '@' (e.g. "(Reuters)").
# All other punctuation is discarded.
tknzr = RegexpTokenizer(r'\w+|\$[\d\.]+|\([@\w\d]+\)')

In [None]:
df['text_tokens'] = df['text'].apply(tknzr.tokenize)

In [None]:
corpus_freq_dist = freq_dist_of_col(df, 'text_tokens')

The number of unique tokens in the corpus is 152402


In [None]:
corpus_freq_dist.most_common(150)

[('the', 771208),
 ('to', 457849),
 ('of', 372801),
 ('a', 339364),
 ('and', 336802),
 ('in', 282041),
 ('s', 198810),
 ('that', 195661),
 ('on', 162850),
 ('for', 144241),
 ('is', 138935),
 ('said', 122504),
 ('Trump', 121295),
 ('with', 98710),
 ('The', 98576),
 ('was', 96440),
 ('he', 95085),
 ('it', 88450),
 ('as', 82843),
 ('his', 80861),
 ('by', 79953),
 ('has', 78027),
 ('be', 71600),
 ('have', 70857),
 ('not', 68321),
 ('from', 67728),
 ('are', 60493),
 ('at', 59290),
 ('who', 58081),
 ('an', 56078),
 ('I', 51915),
 ('this', 50639),
 ('U', 50227),
 ('would', 49247),
 ('S', 47546),
 ('they', 47357),
 ('t', 44464),
 ('will', 42664),
 ('about', 41975),
 ('had', 40196),
 ('their', 38467),
 ('been', 36588),
 ('but', 36488),
 ('people', 34367),
 ('were', 34078),
 ('which', 33115),
 ('or', 32819),
 ('we', 32590),
 ('more', 31681),
 ('you', 31398),
 ('President', 31135),
 ('out', 29155),
 ('her', 28292),
 ('after', 28290),
 ('one', 27682),
 ('all', 26913),
 ('its', 26839),
 ('also', 26

In [None]:
# Number of tokens that occur exactly once in the corpus.
sum(1 for count in corpus_freq_dist.values() if count == 1)

67886

#### Investigate URLs in the text

In [None]:

# Case-insensitive URL matcher. A match must start with "http://" / "https://",
# "www" (optionally followed by up to three digits) and a dot, or a domain-like
# token ending in a 2-4 letter suffix followed by "/". The body may contain
# balanced parentheses, and the match may not end in common trailing punctuation.
# The first capture group wraps the whole URL; find_strings relies on that
# by taking element [0] of each findall result.
# NOTE(review): this looks like a variant of a widely circulated "liberal" URL
# pattern — confirm the source if attribution is required.
URL_REGEX = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

In [None]:
df['text_urls'] = df['text'].apply(lambda x: find_strings(x, URL_REGEX))

In [None]:
urls_in_text = concat_lists_of_strings(df, 'text_urls')

In [None]:
urls_in_text

['pic.twitter.com/4FPAe2KypA',
 'pic.twitter.com/XtZW5PdU2b',
 'pic.twitter.com/T2NY2psHCR',
 'https://t.co/zcbyc4Wp5b',
 'pic.twitter.com/fWfXsZupxy',
 '2017pic.twitter.com/ymsOBLjfxU',
 'pic.twitter.com/dWr5k8ZEZV',
 'pic.twitter.com/ulCFddhkdy',
 'https://t.co/Fg7VacxRtJ',
 'pic.twitter.com/5gEMcjQTbH',
 'https://t.co/zrWpyMXRcz',
 'pic.twitter.com/wiQSQNNzw0',
 'https://t.co/dkhw0AlHB4',
 'pic.twitter.com/oaZDT126B3',
 'https://t.co/ayBlGmk65Z',
 'pic.twitter.com/Z7dmyQ5smy',
 'pic.twitter.com/8TKtrMqRa1',
 'pic.twitter.com/hIxs3DciO8',
 'pic.twitter.com/E5bmcI83mU',
 'pic.twitter.com/a45En9Jwys',
 'pic.twitter.com/yLCBmhpNvG',
 'pic.twitter.com/3vMZUTEylx',
 'https://t.co/XrOvu32EV8',
 'pic.twitter.com/eMP9UX1bM8',
 'pic.twitter.com/XViyKFQCET',
 'https://t.co/HFYJRkefJ1',
 'https://t.co/65FhbQHuV4',
 'https://t.co/7lHYkIloyz',
 'https://t.co/g8SwgAKtfH',
 'https://t.co/9lCqpYujKN',
 'pic.twitter.com/NXEX9rGBgu',
 'pic.twitter.com/QePW9FtbSh',
 'pic.twitter.com/mUbKCIWGxB',
 'pic.

In [None]:

len(urls_in_text)

9648

The URLs appear to be almost entirely Twitter links (pic.twitter.com images, probably profile photos, and t.co short links).
We think the presence of a link in a story is potentially more significant than what the link points to, so we will convert all the URLs to the placeholder {link}.

In [None]:
# Start clean_text from the raw text, with every URL collapsed to one placeholder.
# NOTE(review): word_tokenize is applied to clean_text later and presumably splits
# '{link}' into '{', 'link', '}'; the single-character filter would then drop the
# braces, leaving a plain 'link' token indistinguishable from the word — confirm.
df['clean_text'] = df['text'].apply(lambda x: re.sub(URL_REGEX, '{link}', x))

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label,caps_in_title,text_tokens,text_urls,clean_text
0,0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,2017-12-31,fake,11,"[Donald, Trump, just, couldn, t, wish, all, Am...",[pic.twitter.com/4FPAe2KypA],Donald Trump just couldn t wish all Americans ...
1,1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,2017-12-31,fake,8,"[House, Intelligence, Committee, Chairman, Dev...",[],House Intelligence Committee Chairman Devin Nu...
2,2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,2017-12-30,fake,15,"[On, Friday, it, was, revealed, that, former, ...","[pic.twitter.com/XtZW5PdU2b, pic.twitter.com/T...","On Friday, it was revealed that former Milwauk..."
3,3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,2017-12-29,fake,19,"[On, Christmas, day, Donald, Trump, announced,...","[https://t.co/Fg7VacxRtJ, pic.twitter.com/5gEM...","On Christmas day, Donald Trump announced that ..."
4,4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,2017-12-25,fake,11,"[Pope, Francis, used, his, annual, Christmas, ...",[],Pope Francis used his annual Christmas Day mes...


#### Investigate URLs in title

In [None]:
df['title_urls'] = df['title'].apply(lambda x: find_strings(x, URL_REGEX))
urls_in_titles = concat_lists_of_strings(df, 'title_urls')
urls_in_titles

[]

There are no URLs in the title field

#### Investigate Twitter handles in Text

In [None]:
# Twitter handle matcher: '@' followed by 1-15 word characters, captured as group 1.
# The look-behind requires the '@' to sit at the start of the string or right after
# a non-word character, so e-mail style "name@host" text is not treated as a handle.
# The trailing \b means a run of more than 15 word characters after '@' does not match at all.
TWITTER_HANDLE_REGEX = r'(?<=^|(?<=[^\w]))(@\w{1,15})\b'

In [None]:
df['twitter_handles'] = df['clean_text'].apply(lambda x: re.findall(TWITTER_HANDLE_REGEX, x))

In [None]:
twitter_handles = concat_lists_of_strings(df, 'twitter_handles')

In [None]:
twitter_freq_dist = FreqDist(twitter_handles)

In [None]:

twitter_freq_dist.most_common(50)

[('@realDonaldTrump', 2997),
 ('@POTUS', 345),
 ('@21WIRE', 283),
 ('@FoxNews', 233),
 ('@HillaryClinton', 198),
 ('@seanhannity', 176),
 ('@CNN', 170),
 ('@nytimes', 126),
 ('@foxandfriends', 101),
 ('@elizabethforma', 98),
 ('@NBCNews', 93),
 ('@JordanUhl', 87),
 ('@tonyposnanski', 86),
 ('@ABC', 81),
 ('@AnnCoulter', 71),
 ('@bessbell', 69),
 ('@PressSec', 66),
 ('@FLOTUS', 65),
 ('@IvankaTrump', 59),
 ('@WalshFreedom', 53),
 ('@WhiteHouse', 52),
 ('@BernieSanders', 52),
 ('@KellyannePolls', 51),
 ('@PrisonPlanet', 49),
 ('@BraddJaffy', 49),
 ('@SpeakerRyan', 48),
 ('@realdonaldtrump', 48),
 ('@DonaldJTrumpJr', 47),
 ('@marcorubio', 46),
 ('@SarahPalinUSA', 45),
 ('@ABCPolitics', 44),
 ('@BarackObama', 43),
 ('@CNNPolitics', 39),
 ('@EricTrump', 39),
 ('@MSNBC', 39),
 ('@Morning_Joe', 39),
 ('@joshdcaplan', 39),
 ('@GOP', 38),
 ('@CBSNews', 38),
 ('@JackPosobiec', 38),
 ('@Cernovich', 37),
 ('@kylegriffin1', 36),
 ('@mike_pence', 36),
 ('@MatthewDicks', 36),
 ('@tedcruz', 36),
 ('@w

In [None]:
# total number of Twitter handles
len(twitter_handles)

22224

In [None]:
# Collapse every handle to one shared placeholder (presumably so that individual
# account names do not become features). The placeholder contains no digits, so
# the later number-stripping step leaves it intact.
df['clean_text'] = df['clean_text'].apply(lambda x: re.sub(TWITTER_HANDLE_REGEX, '@twitter-handle', x))

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label,caps_in_title,text_tokens,text_urls,clean_text,title_urls,twitter_handles
0,0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,2017-12-31,fake,11,"[Donald, Trump, just, couldn, t, wish, all, Am...",[pic.twitter.com/4FPAe2KypA],Donald Trump just couldn t wish all Americans ...,[],"[@realDonaldTrump, @TalbertSwan, @calvinstowel..."
1,1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,2017-12-31,fake,8,"[House, Intelligence, Committee, Chairman, Dev...",[],House Intelligence Committee Chairman Devin Nu...,[],[]
2,2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,2017-12-30,fake,15,"[On, Friday, it, was, revealed, that, former, ...","[pic.twitter.com/XtZW5PdU2b, pic.twitter.com/T...","On Friday, it was revealed that former Milwauk...",[],"[@SheriffClarke, @SheriffClarke, @KeithLeBlanc..."
3,3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,2017-12-29,fake,19,"[On, Christmas, day, Donald, Trump, announced,...","[https://t.co/Fg7VacxRtJ, pic.twitter.com/5gEM...","On Christmas day, Donald Trump announced that ...",[],"[@pbump, @_cingraham, @_cingraham, @_cingraham..."
4,4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,2017-12-25,fake,11,"[Pope, Francis, used, his, annual, Christmas, ...",[],Pope Francis used his annual Christmas Day mes...,[],[]


#### Capitalization

In [None]:
def lower_unless_all_caps(string_):
    """
    Lowercase each whitespace-separated word unless it is written in all caps.

    A word is kept as-is only when it is longer than one character and
    str.isupper() is true for it (e.g. "GOP", "U.S."); single-letter words
    such as "I" or "A" are always lowercased.

    The words are re-joined with single spaces, so any run of whitespace in
    the input (including newlines and leading/trailing whitespace) collapses.

    Parameters:
        string_: the text to normalise.

    Returns:
        The normalised text.
    """
    normalised = []
    for token in string_.split():
        is_shouted = len(token) > 1 and token.isupper()
        normalised.append(token if is_shouted else token.lower())
    return ' '.join(normalised)

In [None]:
df['clean_text'] = df['clean_text'].apply(lower_unless_all_caps)

In [None]:
df['clean_title'] = df['title'].apply(lower_unless_all_caps)

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label,caps_in_title,text_tokens,text_urls,clean_text,title_urls,twitter_handles,clean_title
0,0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,2017-12-31,fake,11,"[Donald, Trump, just, couldn, t, wish, all, Am...",[pic.twitter.com/4FPAe2KypA],donald trump just couldn t wish all americans ...,[],"[@realDonaldTrump, @TalbertSwan, @calvinstowel...",donald trump sends out embarrassing new year’s...
1,1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,2017-12-31,fake,8,"[House, Intelligence, Committee, Chairman, Dev...",[],house intelligence committee chairman devin nu...,[],[],drunk bragging trump staffer started russian c...
2,2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,2017-12-30,fake,15,"[On, Friday, it, was, revealed, that, former, ...","[pic.twitter.com/XtZW5PdU2b, pic.twitter.com/T...","on friday, it was revealed that former milwauk...",[],"[@SheriffClarke, @SheriffClarke, @KeithLeBlanc...",sheriff david clarke becomes an internet joke ...
3,3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,2017-12-29,fake,19,"[On, Christmas, day, Donald, Trump, announced,...","[https://t.co/Fg7VacxRtJ, pic.twitter.com/5gEM...","on christmas day, donald trump announced that ...",[],"[@pbump, @_cingraham, @_cingraham, @_cingraham...",trump is so obsessed he even has obama’s name ...
4,4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,2017-12-25,fake,11,"[Pope, Francis, used, his, annual, Christmas, ...",[],pope francis used his annual christmas day mes...,[],[],pope francis just called out donald trump duri...


#### Numbers

The only number/date that is important is 9/11, so we will first change it to nine-eleven; the remaining numbers can then be removed without losing it.
Also, we will replace the numbers with a space (rather than deleting them) because some of the sentences run together and end with a number. Replacing the number with a space will split those sentences.

In [None]:
# Protect the one meaningful numeric reference before all digits are stripped.
# The digit look-arounds stop the pattern from firing inside longer numbers or
# dates (e.g. "19/11", "2009/11", "9/110"), which would otherwise inject
# spurious 'nine-eleven' tokens.
df['clean_text'] = df['clean_text'].apply(lambda x: re.sub(r'(?<!\d)9/11(?!\d)', 'nine-eleven', x))

In [None]:

# Protect the one meaningful numeric reference before all digits are stripped.
# The digit look-arounds stop the pattern from firing inside longer numbers or
# dates (e.g. "19/11", "2009/11", "9/110"), which would otherwise inject
# spurious 'nine-eleven' tokens.
df['clean_title'] = df['clean_title'].apply(lambda x: re.sub(r'(?<!\d)9/11(?!\d)', 'nine-eleven', x))

In [None]:
df['clean_text'] = df['clean_text'].apply(lambda x: re.sub(r'\d+', ' ', x))

In [None]:
df['clean_title'] = df['clean_title'].apply(lambda x: re.sub(r'\d+', ' ', x))

#### Rough Look at the tokens from the current clean_text

In [None]:
# word_tokenize (used below) needs NLTK's Punkt tokenizer data.
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
# NOTE(review): newer NLTK releases reportedly look for the 'punkt_tab' resource
# instead of 'punkt' — confirm against the installed version.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\konst\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [None]:
df['clean_text_tokens'] = df['clean_text'].apply(word_tokenize)

In [None]:
review_freq_dis(df, 'clean_text_tokens', 150)

The number of unique tokens in the corpus is 201119


[('the', 859850),
 (',', 779608),
 ('.', 545038),
 ('to', 458899),
 ('of', 373564),
 ('a', 352458),
 ('and', 345523),
 ('in', 297929),
 ('that', 203739),
 ('s', 199176),
 ('on', 166851),
 ('for', 147198),
 ('is', 139896),
 ('said', 120761),
 ('trump', 117664),
 ('he', 117178),
 ('it', 110546),
 ('with', 100697),
 ('was', 96714),
 ('as', 88050),
 ('his', 84136),
 ('by', 80892),
 ('has', 78137),
 ('not', 72455),
 ('be', 71205),
 ('have', 71038),
 ('’', 70279),
 ('from', 68567),
 (')', 68006),
 ('(', 67765),
 ('this', 62810),
 ('at', 62594),
 ('are', 61005),
 ('who', 58866),
 ('an', 57936),
 ('they', 57188),
 ('“', 53766),
 ('”', 53489),
 ('but', 50924),
 ('we', 50799),
 (':', 49644),
 ('would', 49324),
 ('i', 46703),
 ('president', 45340),
 ('U.S.', 43292),
 ('will', 43158),
 ('about', 42409),
 ('t', 40472),
 ('had', 40273),
 ('their', 39205),
 ('you', 37857),
 ('been', 36559),
 ('people', 35603),
 ('-', 34196),
 ('were', 34161),
 ('which', 33413),
 ('or', 33116),
 ('more', 32885),
 ('af

#### Remove (reuters) from news stories 

Almost all of the True news stories have (Reuters) at their beginning. An ML model could simply learn that tag as the way to distinguish Fake from True, which would overfit potential models to this dataset, so we remove it.

In [None]:
# Strip the "(reuters)" wire tag. The pattern is lowercase because clean_text has
# already been through lower_unless_all_caps, which turns "(Reuters)" into "(reuters)".
# NOTE(review): an all-caps "(REUTERS)" is left uppercase by that step and would
# not be matched here — confirm none exist in the data.
df['clean_text'] = df['clean_text'].apply(lambda x: re.sub(r'\(reuters\)', ' ', x))

#### Update Tokens

In [None]:
df['clean_text_tokens'] = df['clean_text'].apply(word_tokenize)

In [None]:
review_freq_dis(df, 'clean_text_tokens', 150)

The number of unique tokens in the corpus is 201119


[('the', 859850),
 (',', 779608),
 ('.', 545038),
 ('to', 458899),
 ('of', 373564),
 ('a', 352458),
 ('and', 345523),
 ('in', 297929),
 ('that', 203739),
 ('s', 199176),
 ('on', 166851),
 ('for', 147198),
 ('is', 139896),
 ('said', 120761),
 ('trump', 117664),
 ('he', 117178),
 ('it', 110546),
 ('with', 100697),
 ('was', 96714),
 ('as', 88050),
 ('his', 84136),
 ('by', 80892),
 ('has', 78137),
 ('not', 72455),
 ('be', 71205),
 ('have', 71038),
 ('’', 70279),
 ('from', 68567),
 ('this', 62810),
 ('at', 62594),
 ('are', 61005),
 ('who', 58866),
 ('an', 57936),
 ('they', 57188),
 ('“', 53766),
 ('”', 53489),
 ('but', 50924),
 ('we', 50799),
 (':', 49644),
 ('would', 49324),
 (')', 46974),
 ('(', 46733),
 ('i', 46703),
 ('president', 45340),
 ('U.S.', 43292),
 ('will', 43158),
 ('about', 42409),
 ('t', 40472),
 ('had', 40273),
 ('their', 39205),
 ('you', 37857),
 ('been', 36559),
 ('people', 35603),
 ('-', 34196),
 ('were', 34161),
 ('which', 33413),
 ('or', 33116),
 ('more', 32885),
 ('af

#### Remove Punctuation and Single Letter Tokens from Text

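We remove the single-character punctuation tokens except for the exclamation point. Also, we will remove all the other single-character tokens except for i. Punctuation tokens longer than one character are not affected by this step.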

In [None]:
# Drop every one-character token (this covers single-character punctuation) while keeping 'i' and '!'.
# NOTE(review): punctuation tokens longer than one character (e.g. "--", "...")
# survive this filter; remove_punctuation is defined earlier in the notebook but is not
# applied here — confirm whether that is intended.
df['clean_text_tokens'] = df['clean_text_tokens'].apply(lambda x: remove_single_characters(x, ['i', '!']))

In [None]:
review_freq_dis(df, 'clean_text_tokens', 150)

The number of unique tokens in the corpus is 201028


[('the', 859850),
 ('to', 458899),
 ('of', 373564),
 ('and', 345523),
 ('in', 297929),
 ('that', 203739),
 ('on', 166851),
 ('for', 147198),
 ('is', 139896),
 ('said', 120761),
 ('trump', 117664),
 ('he', 117178),
 ('it', 110546),
 ('with', 100697),
 ('was', 96714),
 ('as', 88050),
 ('his', 84136),
 ('by', 80892),
 ('has', 78137),
 ('not', 72455),
 ('be', 71205),
 ('have', 71038),
 ('from', 68567),
 ('this', 62810),
 ('at', 62594),
 ('are', 61005),
 ('who', 58866),
 ('an', 57936),
 ('they', 57188),
 ('but', 50924),
 ('we', 50799),
 ('would', 49324),
 ('i', 46703),
 ('president', 45340),
 ('U.S.', 43292),
 ('will', 43158),
 ('about', 42409),
 ('had', 40273),
 ('their', 39205),
 ('you', 37857),
 ('been', 36559),
 ('people', 35603),
 ('were', 34161),
 ('which', 33413),
 ('or', 33116),
 ('more', 32885),
 ('after', 31622),
 ('she', 30883),
 ('one', 30122),
 ('if', 30087),
 ('her', 29022),
 ('state', 28335),
 ('out', 28327),
 ('all', 27998),
 ('what', 27420),
 ('its', 27295),
 ('also', 26968

#### Tokenize Clean Title

In [None]:
df['clean_title_tokens'] = df['clean_title'].apply(word_tokenize)

In [None]:
df.iloc[1000:1010]['clean_title_tokens']

1000    [stunning, new, poll, reveals, global, opinion...
1001    [former, GOP, rep, throws, support, behind, ob...
1002    [trump, moronically, claims, entire, russia, i...
1003    [WATCH, :, it, just, hit, a, trump, supporter,...
1004    [republicans, just, added, what, they, hated, ...
1005    [the, numbers, are, in, and, jon, ossoff, losi...
1006    [GOP, senator, lashes, into, kellyanne, conway...
1007    [cops, in, republican, senate, office, violent...
1008    [trump, is, giddy, about, his, upcoming, meeti...
1009    [catholic, sister, rips, ‘, pro-life, ’, repub...
Name: clean_title_tokens, dtype: object

In [None]:
review_freq_dis(df, 'clean_title_tokens', 150)

The number of unique tokens in the corpus is 31502


[('to', 14060),
 ('trump', 11939),
 (':', 10879),
 ('’', 10184),
 (',', 8742),
 ('in', 7536),
 ('of', 6083),
 ('for', 6043),
 ('on', 5630),
 ('the', 4638),
 ('s', 4576),
 ('U.S.', 4157),
 ("'s", 4120),
 ('(', 4101),
 (')', 4101),
 ('VIDEO', 3992),
 ('says', 3347),
 ('a', 3273),
 ('with', 3134),
 ('”', 3080),
 ('[', 3035),
 (']', 3034),
 ('and', 2943),
 ('“', 2847),
 ('‘', 2838),
 ("'", 2619),
 ('is', 2263),
 ('!', 2157),
 ('after', 2114),
 ('video', 1957),
 ('obama', 1807),
 ('house', 1794),
 ('at', 1774),
 ('as', 1763),
 ('over', 1694),
 ('from', 1551),
 ('he', 1533),
 ('his', 1528),
 ('by', 1484),
 ('about', 1455),
 ('white', 1377),
 ('new', 1368),
 ('it', 1356),
 ('will', 1312),
 ('not', 1309),
 ('WATCH', 1267),
 ('clinton', 1202),
 ('?', 1174),
 ('russia', 1142),
 ('just', 1133),
 ('t', 1125),
 ('be', 1122),
 ('president', 1117),
 ('hillary', 1059),
 ('bill', 1027),
 ('republican', 992),
 ('north', 986),
 ('korea', 912),
 ('that', 890),
 ('this', 888),
 ('*', 881),
 ('senate', 878)

#### Remove Punctuation and Single Letter Tokens from Clean Title

In [None]:
# Same filter as for the text tokens: drop one-character tokens, keeping 'i' and '!'.
# NOTE(review): multi-character punctuation tokens are not removed by this step.
df['clean_title_tokens'] = df['clean_title_tokens'].apply(lambda x: remove_single_characters(x, ['i', '!']))

In [None]:
review_freq_dis(df, 'clean_title_tokens', 150)

The number of unique tokens in the corpus is 31424


[('to', 14060),
 ('trump', 11939),
 ('in', 7536),
 ('of', 6083),
 ('for', 6043),
 ('on', 5630),
 ('the', 4638),
 ('U.S.', 4157),
 ("'s", 4120),
 ('VIDEO', 3992),
 ('says', 3347),
 ('with', 3134),
 ('and', 2943),
 ('is', 2263),
 ('!', 2157),
 ('after', 2114),
 ('video', 1957),
 ('obama', 1807),
 ('house', 1794),
 ('at', 1774),
 ('as', 1763),
 ('over', 1694),
 ('from', 1551),
 ('he', 1533),
 ('his', 1528),
 ('by', 1484),
 ('about', 1455),
 ('white', 1377),
 ('new', 1368),
 ('it', 1356),
 ('will', 1312),
 ('not', 1309),
 ('WATCH', 1267),
 ('clinton', 1202),
 ('russia', 1142),
 ('just', 1133),
 ('be', 1122),
 ('president', 1117),
 ('hillary', 1059),
 ('bill', 1027),
 ('republican', 992),
 ('north', 986),
 ('korea', 912),
 ('that', 890),
 ('this', 888),
 ('senate', 878),
 ('out', 876),
 ('state', 868),
 ('court', 863),
 ('china', 796),
 ('who', 788),
 ('him', 785),
 ('against', 782),
 ('up', 775),
 ('election', 763),
 ('you', 760),
 ('are', 758),
 ('has', 752),
 ('vote', 744),
 ('donald', 7

#### Remove 's

While the apostrophe appears to have been stripped from 's (frequently or always) in the fake news, it doesn't look like that was done to the true news. The 's token will need to be removed so that it doesn't become a false indicator of true news.

In [None]:
# Drop the standalone "'s" token from both the title and the text token lists.
df['clean_title_tokens'] = df['clean_title_tokens'].apply(lambda x: remove_words(x, ["'s"]))
df['clean_text_tokens'] = df['clean_text_tokens'].apply(lambda x: remove_words(x, ["'s"]))

#### Remove Date Words

In [None]:
# Lowercase day and month names to strip from the token lists.
# NOTE(review): 'may' and 'march' are also ordinary words (modal verb, verb/noun),
# so those uses are removed as well. All-caps forms such as 'MAY' are left
# uppercase by lower_unless_all_caps and will not match these entries.
date_words = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 
              'saturday', 'sunday', 'january', 'february', 'march', 'april',
             'may', 'june', 'july', 'august', 'september', 'october',
             'november', 'december']

In [None]:
df['clean_title_tokens'] = df['clean_title_tokens'].apply(lambda x: remove_words(x, date_words))
df['clean_text_tokens'] = df['clean_text_tokens'].apply(lambda x: remove_words(x, date_words))

In [None]:
# Persist the processed frame for the next stage.
# NOTE(review): to_csv writes the index by default, which is presumably how the
# 'Unnamed: 0' / 'Unnamed: 0.1' columns seen at load time accumulated — consider index=False.
# NOTE(review): the token, URL and handle columns hold Python lists; CSV stores their
# string form, so a downstream reader would need to parse them back — confirm.
df.to_csv('text_pre_processing.csv')