# Text Preprocessing

In [1]:
import os
import sys

import pandas as pd

# Extend the import path with the parent directory before importing `mwdata`
# (presumably the package lives one level above this notebook; confirm).
sys.path.append("..")

# Explicit names instead of a wildcard import, so every helper called in this
# notebook can be traced back to the module it comes from.
from mwdata.text.text_preprocessing import (
    bag_of_words_to_docs,
    create_doc_term_matrix,
    create_ngrams,
    create_tfidf_matrix,
    lemmatize,
    preprocess_texts,
    remove_digits,
    remove_punct,
    remove_single_char_and_spaces,
    remove_stopwords,
    stem,
    to_lower,
    tokenize,
)
from mwdata.utilities.load_data import load_data

## Load Data

In [6]:
# Load the corpus, then convert element 0 of the result to a plain Python list.
addresses_loaded = load_data('../Data/Addresses')
addresses = addresses_loaded[0].tolist()

In [7]:
# Preview the opening 100 characters of the first address.
first_address = addresses[0]
first_address[:100]

'\n Fellow-Citizens of the Senate and House of Representatives:\nSince your last annual assembling anot'

In [8]:
# Restrict the corpus to its first ten entries; every later cell works on this subset.
N_ADDRESSES = 10
addresses = addresses[:N_ADDRESSES]

## Tokenize

In [9]:
# Tokenize every address, then display the first ten tokens of the first one.
addresses_token = tokenize(addresses)
first_address_tokens = addresses_token[0]
first_address_tokens[:10]

['Fellow-Citizens',
 'of',
 'the',
 'Senate',
 'and',
 'House',
 'of',
 'Representatives',
 ':',
 'Since']

## Change to all lowercase

In [10]:
# Lowercase the tokens, then display the first ten tokens of the first address.
addresses_lower = to_lower(addresses_token)
first_address_lower = addresses_lower[0]
first_address_lower[:10]

['fellow-citizens',
 'of',
 'the',
 'senate',
 'and',
 'house',
 'of',
 'representatives',
 ':',
 'since']

## Remove punctuation

In [11]:
# Drop punctuation tokens from the lowercased documents, then display the
# first ten remaining tokens of the first address.
addresses_no_punct = remove_punct(addresses_lower)
first_address_no_punct = addresses_no_punct[0]
first_address_no_punct[:10]

['fellow-citizens',
 'of',
 'the',
 'senate',
 'and',
 'house',
 'of',
 'representatives',
 'since',
 'your']

## Remove digits

In [12]:
# Toy document with a pure-digit token ('3') and a mixed token ('2c') to show
# what remove_digits does to each.
digit_demo_docs = [['this', 'is', '3', 'a', 'test', '2c', 'if', 'it', 'works']]
remove_digits(digit_demo_docs)

[['this', 'is', '', 'a', 'test', '', 'if', 'it', 'works']]

## Remove single characters and spaces

In [13]:
# Toy document mixing whitespace-only tokens and one-letter tokens ('a', 'b')
# with ordinary words.
single_char_demo_docs = [['this', 'is', '   ', 'a', 'test', '   ', 'b']]
remove_single_char_and_spaces(single_char_demo_docs)

[['this', 'is', 'test']]

## Remove stopwords

In [15]:
# Filter stopwords out of the punctuation-free documents, then display the
# first ten remaining tokens of the first address.
addresses_no_stop = remove_stopwords(addresses_no_punct)
first_address_no_stop = addresses_no_stop[0]
first_address_no_stop[:10]

['fellow-citizens',
 'senate',
 'house',
 'representatives',
 'since',
 'last',
 'annual',
 'assembling',
 'another',
 'year']

## Stem words

In [16]:
# Stem the stopword-free tokens, then display the first ten stems of the
# first address. Stems are not necessarily dictionary words.
addresses_stemmed = stem(addresses_no_stop)
first_address_stemmed = addresses_stemmed[0]
first_address_stemmed[:10]

['fellow-citizens',
 'sen',
 'hous',
 'repres',
 'sint',
 'last',
 'an',
 'assembl',
 'anoth',
 'year']

## Lemmatize words

In [17]:
# Lemmatize the same stopword-free tokens (an alternative to stemming), then
# display the first ten lemmas of the first address.
addresses_lemmatized = lemmatize(addresses_no_stop)
first_address_lemmatized = addresses_lemmatized[0]
first_address_lemmatized[:10]

['fellow-citizens',
 'senate',
 'house',
 'representative',
 'since',
 'last',
 'annual',
 'assembling',
 'another',
 'year']

## Convert each document back to a single string

In [18]:
# Join each lemmatized token list back into one string per document.
addresses_docs = bag_of_words_to_docs(addresses_lemmatized)

# Build a second variant with remove_digits applied first; the TF-IDF cell uses it.
addresses_lemmatized_no_digits = remove_digits(addresses_lemmatized)
addresses_docs_no_digits = bag_of_words_to_docs(addresses_lemmatized_no_digits)

# Display the first 1000 characters of the first rebuilt document.
first_address_doc = addresses_docs[0]
first_address_doc[:1000]

'fellow-citizens senate house representative since last annual assembling another year health bountiful harvest passed pleased almighty bless u return peace press guided best light give u trusting good time wise way yet well correspondence touching foreign affair taken place last year herewith submitted virtual compliance request effect made house representative near close last session congress condition relation nation le gratifying usually former period certainly satisfactory nation unhappily distracted might reasonably apprehended month june last ground expect maritime power beginning domestic difficulty unwisely unnecessarily think recognized insurgent belligerent would soon recede position proved le injurious country temporary revers afterwards befell national arm exaggerated disloyal citizen abroad hitherto delayed act simple justice civil war radically changed moment occupation habit american people necessarily disturbed social condition affected deeply prosperity nation carried

## Create a document-term frequency matrix

In [19]:
# Build the document-term matrix from the rebuilt documents (digits still
# included) and display its first rows.
doc_term_matrix = create_doc_term_matrix(addresses_docs)
doc_term_matrix.head()

Unnamed: 0,000,002,003,007,009,01,011,015,018,019,...,yedo,yet,yield,yielded,yielding,york,young,zeal,zealous,zealously
0,19,0,0,0,1,0,0,0,0,0,...,0,10,0,0,0,4,0,0,0,0
1,15,0,0,0,0,0,0,0,1,0,...,0,3,0,1,0,0,0,1,0,0
2,7,0,0,2,0,0,4,2,0,1,...,0,7,2,0,0,2,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,...,1,4,0,0,0,0,0,1,0,0
4,15,1,1,0,0,13,0,0,0,0,...,0,11,0,0,2,0,0,2,0,0


## Create a TF-IDF matrix

In [20]:
# Build the TF-IDF matrix from the digit-free documents and display its first rows.
tfidf_matrix = create_tfidf_matrix(addresses_docs_no_digits)
tfidf_matrix.head()

Unnamed: 0,abandon,abandoned,abandonment,abdication,abhors,abide,abideth,ability,able,ably,...,yedo,yet,yield,yielded,yielding,york,young,zeal,zealous,zealously
0,0.0,0.0,0.0,0.0,0.01238,0.010524,0.01238,0.0,0.01504,0.0,...,0.0,0.045772,0.0,0.0,0.0,0.032744,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00963,0.013134,0.016217,...,0.0,0.017987,0.0,0.013786,0.0,0.0,0.0,0.00963,0.0,0.0
2,0.0,0.0,0.015512,0.0,0.0,0.0,0.0,0.0,0.022168,0.0,...,0.0,0.047225,0.027142,0.0,0.0,0.024131,0.0,0.0,0.0,0.0
3,0.012807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006973,0.0,...,0.017219,0.025465,0.0,0.0,0.0,0.0,0.0,0.010225,0.0,0.0
4,0.019567,0.0,0.0,0.0,0.0,0.007455,0.0,0.010415,0.017757,0.0,...,0.0,0.035666,0.0,0.0,0.01491,0.0,0.0,0.010415,0.0,0.0


## Run a full preprocessing pipeline in one line

In [21]:
# Run the whole pipeline in one call with lemmatization enabled, rebuild the
# documents, and display the first 1000 characters of the first one.
addresses_preprocessed = preprocess_texts(addresses, lem=True)
addresses_preprocessed_docs = bag_of_words_to_docs(addresses_preprocessed)
addresses_preprocessed_docs[0][:1000]

'fellow-citizens senate house representative since last annual assembling another year health bountiful harvest passed pleased almighty bless u return peace press guided best light give u trusting good time wise way yet well correspondence touching foreign affair taken place last year herewith submitted virtual compliance request effect made house representative near close last session congress condition relation nation le gratifying usually former period certainly satisfactory nation unhappily distracted might reasonably apprehended month june last ground expect maritime power beginning domestic difficulty unwisely unnecessarily think recognized insurgent belligerent would soon recede position proved le injurious country temporary revers afterwards befell national arm exaggerated disloyal citizen abroad hitherto delayed act simple justice civil war radically changed moment occupation habit american people necessarily disturbed social condition affected deeply prosperity nation carried

## Create n-grams

In [22]:
# Build 4-grams straight from the raw addresses, so case and punctuation are
# preserved, then display the first ten.
NGRAM_SIZE = 4
n_grams = create_ngrams(addresses, NGRAM_SIZE)
n_grams[:10]

[('Fellow-Citizens', 'of', 'the', 'Senate'),
 ('of', 'the', 'Senate', 'and'),
 ('the', 'Senate', 'and', 'House'),
 ('Senate', 'and', 'House', 'of'),
 ('and', 'House', 'of', 'Representatives'),
 ('House', 'of', 'Representatives', ':'),
 ('of', 'Representatives', ':', 'Since'),
 ('Representatives', ':', 'Since', 'your'),
 (':', 'Since', 'your', 'last'),
 ('Since', 'your', 'last', 'annual')]