In [8]:
import pandas as pd
# load the data
df = pd.read_pickle("../data/binary/us-politicians.pickle")

In [9]:
df.sample(4)

Unnamed: 0,speaker_id,quote_id,quotation,speaker,party
395812,22686,2017-02-16-060234,"Let's go, set up a meeting. I would love to me...",Donald Trump,29468
557921,22686,2018-04-29-012323,Everyone is talking about the fact that the Wh...,Donald Trump,29468
964041,6294,2015-09-29-070971,Mrs. Clinton has never created a job. She has ...,Hillary Clinton,29552
1087229,170581,2019-10-02-122756,We must win thee hearts and minds of the Ameri...,Nancy Pelosi,29552


In [10]:
import re, string, contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

STOPWORDS = set(stopwords.words('english')).union({'i', ''})
STEMMER = PorterStemmer()

def preprocess_quote(quote):
    # to lowercase
    quote = quote.lower()

    # remove numbers and punctuation
    quote = re.sub(r'\d+', '', quote)
    quote = quote.translate(str.maketrans('', '', string.punctuation))

    # remove leading, trailing, and repeating spaces
    quote = re.sub(' +', ' ', quote)
    quote = quote.strip()

    return quote
    
def remove_stopwords(quote):
    quote = contractions.fix(quote)
    tokens = word_tokenize(quote)
    filtered_tokens = [token for token in tokens if token.lower() not in STOPWORDS]
    return " ".join(filtered_tokens)

def stem_quote(quote):
    stemmed_tokens = [STEMMER.stem(token) for token in word_tokenize(quote)]
    return " ".join(stemmed_tokens)

In [11]:
print(STOPWORDS)

{'when', "doesn't", '', 'does', 'him', 'ours', 'were', 'out', 'all', 'you', 'yourselves', 'some', "aren't", 'needn', "mustn't", 'has', 'at', 'after', 'by', "should've", 'below', 'in', 'too', 'during', 'again', "that'll", 'those', 'between', 'his', 'i', 'am', 'now', 'no', 'weren', 've', 'they', 'very', 'them', 'ma', "wasn't", 'not', 'hadn', 'there', 'her', 'here', "hadn't", "didn't", 'for', 'being', "isn't", 'with', 're', 'yourself', 'down', 'same', 'isn', "needn't", 'up', 'shouldn', 'mightn', 'but', 'been', 'how', 'over', "hasn't", "couldn't", 'each', 'until', 'did', 's', 'theirs', 'ain', 'which', 'above', 'my', 'both', 'most', "weren't", "you're", 'of', 'as', "you'd", 'what', 'nor', 'doesn', 'was', 'haven', 'just', 'herself', 'then', 'that', "shouldn't", 'me', 'be', 'because', "you'll", 'before', 'd', "she's", 'about', 'it', 'into', 'own', 'other', 'your', 'hasn', 'myself', 'o', 'don', 'shan', 'against', 'where', 'or', 'our', 'the', 'its', 'few', 'didn', "haven't", 'who', 'having', 't

In [12]:
sample = df.sample(100000)
sample_quotes = sample["quotation"]
sample_quotes

705032       I wouldn't charge it based on what we know now.
1180824    They're just saying `hell, no' to anything tha...
414810                  Who wrote @realDonaldTrump's speech?
1482084    I think that the comments made by my Republica...
556256                             It was all a show to him,
                                 ...                        
1203593                                    That's not civil?
1165970    I think what you'll find, I know what you'll f...
1068102    prevail unless we have the Affordable Care Act...
10121      expensive domestic agenda aimed at improving t...
101453     The high level of deceptive behaviors exhibite...
Name: quotation, Length: 100000, dtype: object

In [13]:
sample_quotes = sample_quotes.apply(lambda quote: remove_stopwords(quote))
sample_quotes

705032                             would charge based know .
1180824         saying ` hell , ' anything president wants ,
414810                   wrote @ realDonaldTrump 's speech ?
1482084    think comments made Republican colleagues outr...
556256                                                show ,
                                 ...                        
1203593                                              civil ?
1165970    think find , know find , starting next week go...
1068102       prevail unless Affordable Care Act protected .
10121      expensive domestic agenda aimed improving fort...
101453     high level deceptive behaviors exhibited Presi...
Name: quotation, Length: 100000, dtype: object

In [14]:
sample_quotes = sample_quotes.apply(lambda quote: preprocess_quote(quote))
sample_quotes

705032                               would charge based know
1180824                 saying hell anything president wants
414810                        wrote realdonaldtrump s speech
1482084    think comments made republican colleagues outr...
556256                                                  show
                                 ...                        
1203593                                                civil
1165970    think find know find starting next week going ...
1068102         prevail unless affordable care act protected
10121      expensive domestic agenda aimed improving fort...
101453     high level deceptive behaviors exhibited presi...
Name: quotation, Length: 100000, dtype: object

In [None]:
tokenized = sample_quotes.apply(lambda quote: word_tokenize(quote))

In [15]:
tokenized.explode().value_counts()[:50]

NameError: name 'tokenized' is not defined