# Text Preprocessing

In [1]:
%%capture

import pandas as pd
from mwdata.text.text_preprocessing import *
from mwdata.utilities.load_data import load_data

## Load Data

In [2]:
# Load the addresses from ../Data/Addresses (read as latin1), take the first
# element of what load_data returns, and convert it to a list with .tolist().
addresses = load_data('../Data/Addresses', encoding="latin1")[0].tolist()

In [3]:
# Sanity check: show the first 100 characters of the first address.
addresses[0][:100]

'\nFellow Citizens of the Senate and of the House of Representatives:\nThe assemblage of the representa'

In [4]:
# Keep only the first 10 addresses; the cells below all operate on this
# truncated list (presumably to keep the walkthrough fast — confirm).
addresses = addresses[:10]

## Tokenize

In [5]:
# Turn every address into a list of tokens (one list per document).
addresses_token = tokenize(addresses)
# Preview the first ten tokens of the first document.
addresses_token[0][:10]

['Fellow',
 'Citizens',
 'of',
 'the',
 'Senate',
 'and',
 'of',
 'the',
 'House',
 'of']

## Change to all lowercase

In [6]:
# Lowercase the tokenized documents; the preview below shows the same ten
# tokens as the tokenize step, now in lowercase.
addresses_lower = to_lower(addresses_token)
addresses_lower[0][:10]

['fellow',
 'citizens',
 'of',
 'the',
 'senate',
 'and',
 'of',
 'the',
 'house',
 'of']

## Remove punctuation

In [7]:
# Strip punctuation from the lowercased tokens. The first ten tokens contain
# no punctuation, so this preview looks identical to the lowercase one.
addresses_no_punct = remove_punct(addresses_lower)
addresses_no_punct[0][:10]

['fellow',
 'citizens',
 'of',
 'the',
 'senate',
 'and',
 'of',
 'the',
 'house',
 'of']

## Remove digits

In [8]:
# Toy example: per the output below, any token containing a digit ('3', '2c')
# comes back as an empty string rather than being dropped from the list.
remove_digits([['this', 'is', '3', 'a', 'test', '2c', 'if', 'it', 'works']])

[['this', 'is', '', 'a', 'test', '', 'if', 'it', 'works']]

## Remove single-character and whitespace-only tokens

In [9]:
# Toy example: per the output below, the single-character tokens ('a', 'b')
# and the whitespace-only tokens are removed from the list entirely.
remove_single_char_and_spaces([['this', 'is', '   ', 'a', 'test', '   ', 'b']])

[['this', 'is', 'test']]

## Remove stopwords

In [10]:
# Drop stopwords from the punctuation-free tokens; in the preview, words such
# as 'of', 'the' and 'and' from the earlier steps are gone.
addresses_no_stop = remove_stopwords(addresses_no_punct)
addresses_no_stop[0][:10]

['fellow',
 'citizens',
 'senate',
 'house',
 'representatives',
 'assemblage',
 'representatives',
 'union',
 'houses',
 'congress']

## Stem words

In [11]:
# Stem the stopword-free tokens. The output shows aggressively truncated
# stems (e.g. 'citizens' -> 'cit', 'union' -> 'un').
addresses_stemmed = stem(addresses_no_stop)
addresses_stemmed[0][:10]

['fellow',
 'cit',
 'sen',
 'hous',
 'repres',
 'assembl',
 'repres',
 'un',
 'hous',
 'congress']

## Lemmatize words

In [12]:
# Lemmatize the stopword-free tokens. Note the input is addresses_no_stop,
# not addresses_stemmed: stemming and lemmatizing are shown as alternatives.
addresses_lemmatized = lemmatize(addresses_no_stop)
addresses_lemmatized[0][:10]

['fellow',
 'citizen',
 'senate',
 'house',
 'representative',
 'assemblage',
 'representative',
 'union',
 'house',
 'congress']

## Convert each document back to a single string

In [13]:
# Join each document's lemmatized tokens back into one string per document.
addresses_docs = bag_of_words_to_docs(addresses_lemmatized)
# Same conversion, but with remove_digits applied to the tokens first; this
# variant feeds the TF-IDF matrix cell further down.
addresses_docs_no_digits = bag_of_words_to_docs(remove_digits(addresses_lemmatized))
# Preview the first 1000 characters of the first rebuilt document.
addresses_docs[0][:1000]

'fellow citizen senate house representative assemblage representative union house congress time occurs circumstance calling renewed homage grateful acknowledgment giver good exception incidental felicitous condition human existence continue highly favored element contribute individual comfort national prosperity survey extensive country generally observe abode health region plenty civil political relation peace without tranquillity within border people increasing unabated rapidity population wealth national resource whatever difference opinion exist among u regard mode mean shall turn beneficence heaven improvement condition yet spirit animating u suffer bounty providence showered upon u vain receive grateful heart apply unwearied hand advancement general good subject recommended congress last session definitively acted upon others left unfinished partly matured recur attention without needing renewal notice purpose communication present view general aspect public affair moment measure

## Create a document-term frequency matrix

In [14]:
# Build the matrix from addresses_docs, which still contains digit tokens —
# hence the numeric columns ('000', '01', ...) in the output. Show the first rows.
create_doc_term_matrix(addresses_docs).head()

Unnamed: 0,000,000m,01,019,02,024,028,029,03,030,...,you,young,youngest,youngstown,youth,zeal,zealous,zero,zimbabwe,zone
0,71,1,5,0,2,0,0,0,6,0,...,0,0,0,0,3,0,0,0,0,0
1,9,0,0,0,0,0,0,0,0,0,...,2,4,0,0,1,0,0,0,0,0
2,13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,30,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,9,1,1,0,0,0,2,0,0


## Create a TF-IDF matrix

In [15]:
# Build the TF-IDF matrix from the digit-free documents, so the columns in
# the output start at regular words ('abandon', ...). Show the first rows.
create_tfidf_matrix(addresses_docs_no_digits).head()

Unnamed: 0,abandon,abandoned,abandonment,abhorrence,abide,abiding,ability,able,aboard,abode,...,you,young,youngest,youngstown,youth,zeal,zealous,zero,zimbabwe,zone
0,0.008818,0.011336,0.013335,0.0,0.0,0.013335,0.0065,0.00592,0.0,0.013335,...,0.0,0.0,0.0,0.0,0.034009,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011898,0.013401,0.0,...,0.026803,0.035445,0.0,0.0,0.011392,0.0,0.0,0.0,0.0,0.0
2,0.005028,0.0,0.0,0.0,0.0,0.0,0.003707,0.013503,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.005656,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.019618,0.0,0.009563,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004897,0.0,0.0,...,0.0,0.065651,0.011032,0.011032,0.0,0.0,0.0,0.022064,0.0,0.0


## Run a full preprocessing pipeline in one line

In [16]:
# Run preprocess_texts on the raw addresses and join the tokens back into
# strings. lem=True presumably selects lemmatization (confirm against the
# library); the preview matches the step-by-step lemmatized result above.
bag_of_words_to_docs(preprocess_texts(addresses, lem=True))[0][:1000]

'fellow citizen senate house representative assemblage representative union house congress time occurs circumstance calling renewed homage grateful acknowledgment giver good exception incidental felicitous condition human existence continue highly favored element contribute individual comfort national prosperity survey extensive country generally observe abode health region plenty civil political relation peace without tranquillity within border people increasing unabated rapidity population wealth national resource whatever difference opinion exist among u regard mode mean shall turn beneficence heaven improvement condition yet spirit animating u suffer bounty providence showered upon u vain receive grateful heart apply unwearied hand advancement general good subject recommended congress last session definitively acted upon others left unfinished partly matured recur attention without needing renewal notice purpose communication present view general aspect public affair moment measure

## N-gram frequencies

In [17]:
# Count n-grams over the raw (unprocessed) addresses. The result displays as
# a FreqDist keyed by token tuples; with the defaults used here the output
# shows bigrams in their original casing.
n_grams = ngram_freq(text_docs=addresses)
n_grams

FreqDist({('United', 'States'): 254, ('per', 'cent'): 62, ('last', 'session'): 47, ('Great', 'Britain'): 31, ('last', 'year'): 28, ('fiscal', 'year'): 24, ('House', 'Representatives'): 23, ('Federal', 'Government'): 23, ('report', 'Secretary'): 22, ('session', 'Congress'): 21, ...})