## Strings: Text Preprocessing & Normalization

### Basic String Operations

In [None]:
string1 = 'The FIFA World Cup'
string2 = "in Ballon d'Or"                  # double quotes let the apostrophe appear unescaped
string3 = "was Jessie's favorite match."    # same trick here

# glue the three pieces together with a single space between each
print(" ".join((string1, string2, string3)))

In [None]:
'mm' * 3  # str * int repeats the string; strings do not support division or subtraction

In [None]:
# Row widths: grow from 1 to 7, then shrink back to 1 -- this draws a diamond.
a = [*range(1, 8), *range(6, 0, -1)]
# Each row is indented two spaces per missing unit of width, then filled with 'oooo' blocks.
b = [' ' * (2 * (7 - width)) + 'oooo' * width for width in a]
print('\n'.join(b))

## Accessing substrings

In [None]:
string1[4:18]  # slice: characters at indices 4 through 17 (the end index is exclusive)

In [None]:
phrase = 'download webpage, strip HTML if necessary, trim to desired length'

# str.find returns -1 when the substring is absent, so it doubles as the membership test
position = phrase.find('down')
if position != -1:
    print('found "down"', "| position = ", position)

## Combining regex with string operations

### Extracting word pieces

In [None]:
import nltk
from nltk.tokenize import word_tokenize
import re

In [None]:
# Unique word types from NLTK's treebank corpus, in sorted order
wsj = sorted(set(nltk.corpus.treebank.words()))

# Collect every run of three or more consecutive lowercase vowels found inside those words
vowel_runs = [
    run
    for word in wsj
    for run in re.findall(r'[aeiou]{3,}', word)
]
fd = nltk.FreqDist(vowel_runs)
fd.most_common(12)                      # the twelve most frequent vowel runs

In [None]:
from nltk.corpus import words

wiki_words = words.words()
# Keep only entries that consist of exactly five word characters.
# Testing each entry with fullmatch (rather than regex-scanning str(list))
# avoids picking up five-letter fragments of hyphenated or apostrophised
# entries, and avoids building one huge string from the whole word list.
wiki_words = [w for w in wiki_words if re.fullmatch(r'\w{5}', w)]
# For each word that starts with a lowercase consonant followed by a vowel,
# keep that two-character prefix (words starting otherwise contribute nothing).
cvs = [cv for w in wiki_words for cv in re.findall(r'^[qwrtpsdfghjklzxcvbnm][aeiou]', w)]
# Each two-character string unpacks as a (condition, sample) pair:
# condition = the consonant, sample = the vowel that follows it.
cfd = nltk.ConditionalFreqDist(cvs)
cfd.tabulate()

### Regex for searching across multiple words in a text -- seeing words in context

In [None]:
from nltk.corpus import nps_chat

# Wrap the chat corpus tokens in an nltk.Text so token-level regex search is available.
chat = nltk.Text(nps_chat.words())
# Each <...> stands for one token: any two tokens followed by the token "bro".
chat.findall(r"<.*><.*><bro>")

you rule bro; telling you bro; u twizted bro


In [None]:
from nltk.corpus import brown
# Text built from the Brown corpus words in the 'hobbies' and 'learned' categories.
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
# Token pattern "X and other Ys": any word, "and", "other", then a word ending in "s".
hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")

speed and other activities; water and other liquids; tomb and other
landmarks; Statues and other monuments; pearls and other jewels;
charts and other items; roads and other features; figures and other
objects; military and other areas; demands and other factors;
abstracts and other compilations; iron and other metals


## Normalizing Text - pretrained and deterministic approaches

### Using regex to stem words

In [None]:
def stem(word):
    """Strip the first matching suffix from *word* and return what is left.

    A deliberately naive stemmer: the suffixes are tried in list order and the
    first one that *word* ends with is removed. If no suffix matches, *word*
    is returned unchanged.
    """
    # Order matters: 'ness' must be tried before 's' (and 'es' before 's'),
    # otherwise the shorter suffix always wins and the longer one is unreachable.
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ive', 'ness', 'es', 's', 'ment', 'en']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word

stem('forgiving')

'forgiv'

In [None]:
re.findall(r'(^.*)(ing|ly|ed|ious|ies|ive|es|s|ment|en|ness)$', 'processes' )  # greedy: .* grabs as much as it can, so only the shortest suffix ('s') is split off

[('processe', 's')]

In [None]:
re.findall(r'(^.*?)(ing|ly|ed|ious|ies|ive|es|s|ment|en|ness)$', 'processes' )  # non-greedy: .*? stops as early as it can, so the longer suffix ('es') is split off

[('process', 'es')]

In [None]:
def stem(word):
    """Return *word* with one known suffix removed, using a regular expression.

    The non-greedy prefix group keeps the stem as short as possible, so the
    longest suffix that fits is the one split off. The suffix group is
    optional, so a word without any listed suffix comes back unchanged.
    """
    pattern = r'(^.*?)(ing|ly|ed|ious|ies|ive|es|s|ment|en|ness)?$'
    # findall yields (stem, suffix) tuples; only the first match's stem is needed
    pairs = re.findall(pattern, word)
    return pairs[0][0]

# Sample paragraph used to try out the stemmer
raw = """As a young industry that champions innovation and rates its practitioners based
on their ability to apprehend (sorry, grok) the continual emergence of new technologies,
frameworks, protocols and data models, we are not particularly familiar with tradition. 
However, the practice of arranging type for optimal pleasure and comfort is a centuries
-old discipline. As long ago as 1927, the noted typographer Jan Tschichold spoke of the
typesetting 'methods and rules upon which it is impossible to improve' — a set of rules
it would be foolish to ignore."""

tokens = word_tokenize(raw)
# stem every token and show the first 25 results
list(map(stem, tokens))[:25]

['A',
 'a',
 'young',
 'industry',
 'that',
 'champion',
 'innovation',
 'and',
 'rat',
 'it',
 'practitioner',
 'bas',
 'on',
 'their',
 'ability',
 'to',
 'apprehend',
 '(',
 'sorry',
 ',',
 'grok',
 ')',
 'the',
 'continual',
 'emergence']

### Porter Stemmer & WordNet Lemmatizer

In [None]:
# Sample paragraph used as input for the tokenizing, stemming and lemmatizing cells.
raw = """As a young industry that champions innovation and rates its practitioners based
on their ability to apprehend (sorry, grok) the continual emergence of new technologies,
frameworks, protocols and data models, we are not particularly familiar with tradition. 
However, the practice of arranging type for optimal pleasure and comfort is a centuries
-old discipline. As long ago as 1927, the noted typographer Jan Tschichold spoke of the
typesetting 'methods and rules upon which it is impossible to improve' — a set of rules
it would be foolish to ignore."""

In [None]:
# tokenize the text and lowercase every token in one pass
tokens = list(map(str.lower, word_tokenize(raw)))
tokens[:30]

['as',
 'a',
 'young',
 'industry',
 'that',
 'champions',
 'innovation',
 'and',
 'rates',
 'its',
 'practitioners',
 'based',
 'on',
 'their',
 'ability',
 'to',
 'apprehend',
 '(',
 'sorry',
 ',',
 'grok',
 ')',
 'the',
 'continual',
 'emergence',
 'of',
 'new',
 'technologies',
 ',',
 'frameworks']

In [None]:
porter = nltk.PorterStemmer()
# run the Porter stemmer over every (lowercased) token
stemmed = list(map(porter.stem, tokens))
stemmed[:30]

['as',
 'a',
 'young',
 'industri',
 'that',
 'champion',
 'innov',
 'and',
 'rate',
 'it',
 'practition',
 'base',
 'on',
 'their',
 'abil',
 'to',
 'apprehend',
 '(',
 'sorri',
 ',',
 'grok',
 ')',
 'the',
 'continu',
 'emerg',
 'of',
 'new',
 'technolog',
 ',',
 'framework']

In [None]:
wnl = nltk.WordNetLemmatizer()
# lemmatize every token with the lemmatizer's default settings
lemma = list(map(wnl.lemmatize, tokens))
lemma[:30]

['a',
 'a',
 'young',
 'industry',
 'that',
 'champion',
 'innovation',
 'and',
 'rate',
 'it',
 'practitioner',
 'based',
 'on',
 'their',
 'ability',
 'to',
 'apprehend',
 '(',
 'sorry',
 ',',
 'grok',
 ')',
 'the',
 'continual',
 'emergence',
 'of',
 'new',
 'technology',
 ',',
 'framework']

### Regex for Deterministic Tokenization

In [None]:
print(re.split(r' ', raw))  # split on single space characters only; newlines and punctuation stay attached to tokens

['As', 'a', 'young', 'industry', 'that', 'champions', 'innovation', 'and', 'rates', 'its', 'practitioners', 'based\non', 'their', 'ability', 'to', 'apprehend', '(sorry,', 'grok)', 'the', 'continual', 'emergence', 'of', 'new', 'technologies,\nframeworks,', 'protocols', 'and', 'data', 'models,', 'we', 'are', 'not', 'particularly', 'familiar', 'with', 'tradition.', '\nHowever,', 'the', 'practice', 'of', 'arranging', 'type', 'for', 'optimal', 'pleasure', 'and', 'comfort', 'is', 'a', 'centuries\n-old', 'discipline.', 'As', 'long', 'ago', 'as', '1927,', 'the', 'noted', 'typographer', 'Jan', 'Tschichold', 'spoke', 'of', 'the\ntypesetting', "'methods", 'and', 'rules', 'upon', 'which', 'it', 'is', 'impossible', 'to', "improve'", '—', 'a', 'set', 'of', 'rules\nit', 'would', 'be', 'foolish', 'to', 'ignore.']


In [None]:
print(re.split(r'\W+', raw))  # split on runs of non-word characters: punctuation is discarded, and the final '.' leaves a trailing empty string

['As', 'a', 'young', 'industry', 'that', 'champions', 'innovation', 'and', 'rates', 'its', 'practitioners', 'based', 'on', 'their', 'ability', 'to', 'apprehend', 'sorry', 'grok', 'the', 'continual', 'emergence', 'of', 'new', 'technologies', 'frameworks', 'protocols', 'and', 'data', 'models', 'we', 'are', 'not', 'particularly', 'familiar', 'with', 'tradition', 'However', 'the', 'practice', 'of', 'arranging', 'type', 'for', 'optimal', 'pleasure', 'and', 'comfort', 'is', 'a', 'centuries', 'old', 'discipline', 'As', 'long', 'ago', 'as', '1927', 'the', 'noted', 'typographer', 'Jan', 'Tschichold', 'spoke', 'of', 'the', 'typesetting', 'methods', 'and', 'rules', 'upon', 'which', 'it', 'is', 'impossible', 'to', 'improve', 'a', 'set', 'of', 'rules', 'it', 'would', 'be', 'foolish', 'to', 'ignore', '']


In [None]:
print(re.findall(r'\w+|\S\w*', raw))
# match either a run of word characters, or a single non-whitespace character followed by any word characters

['As', 'a', 'young', 'industry', 'that', 'champions', 'innovation', 'and', 'rates', 'its', 'practitioners', 'based', 'on', 'their', 'ability', 'to', 'apprehend', '(sorry', ',', 'grok', ')', 'the', 'continual', 'emergence', 'of', 'new', 'technologies', ',', 'frameworks', ',', 'protocols', 'and', 'data', 'models', ',', 'we', 'are', 'not', 'particularly', 'familiar', 'with', 'tradition', '.', 'However', ',', 'the', 'practice', 'of', 'arranging', 'type', 'for', 'optimal', 'pleasure', 'and', 'comfort', 'is', 'a', 'centuries', '-old', 'discipline', '.', 'As', 'long', 'ago', 'as', '1927', ',', 'the', 'noted', 'typographer', 'Jan', 'Tschichold', 'spoke', 'of', 'the', 'typesetting', "'methods", 'and', 'rules', 'upon', 'which', 'it', 'is', 'impossible', 'to', 'improve', "'", '—', 'a', 'set', 'of', 'rules', 'it', 'would', 'be', 'foolish', 'to', 'ignore', '.']


In [None]:
print(re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", raw))
# \w+(?:[-']\w+)*  words, allowing word-internal hyphens and apostrophes
# '                a quote character on its own, kept separate from the text it encloses
# [-.(]+           runs of hyphens, periods or open parentheses (double hyphen, ellipsis) as one token
# \S\w*            fallback: any other non-whitespace character plus the word characters after it

['As', 'a', 'young', 'industry', 'that', 'champions', 'innovation', 'and', 'rates', 'its', 'practitioners', 'based', 'on', 'their', 'ability', 'to', 'apprehend', '(', 'sorry', ',', 'grok', ')', 'the', 'continual', 'emergence', 'of', 'new', 'technologies', ',', 'frameworks', ',', 'protocols', 'and', 'data', 'models', ',', 'we', 'are', 'not', 'particularly', 'familiar', 'with', 'tradition', '.', 'However', ',', 'the', 'practice', 'of', 'arranging', 'type', 'for', 'optimal', 'pleasure', 'and', 'comfort', 'is', 'a', 'centuries', '-', 'old', 'discipline', '.', 'As', 'long', 'ago', 'as', '1927', ',', 'the', 'noted', 'typographer', 'Jan', 'Tschichold', 'spoke', 'of', 'the', 'typesetting', "'", 'methods', 'and', 'rules', 'upon', 'which', 'it', 'is', 'impossible', 'to', 'improve', "'", '—', 'a', 'set', 'of', 'rules', 'it', 'would', 'be', 'foolish', 'to', 'ignore', '.']


### NLTK Regex Tokenizer

In [None]:
text = 'A plane ticket to Washington D.C. from S.L.C. costs $496.78 round-trip...'

# Verbose-mode tokenizer pattern; alternatives are tried left to right.
# The hyphen sits at the END of the punctuation class so it is a literal '-'.
# Written as ':-_' it formed a character range that left out '-' itself and
# accidentally matched ';', '<', '=', '>', '@', 'A'-'Z', '\' and '^'.
# ';' is listed explicitly because it was previously matched only via that range.
pattern = r'''(?x)          # verbose mode: whitespace and comments in the pattern are ignored
        (?:[A-Z]\.)+        # abbreviations, e.g., D.C., S.L.C.
    | \w+(?:-\w+)*          # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?    # currency and percentages
    | \.\.\.                # ellipsis
    | [][.,;:"'?()_`-]      # these are separate tokens; includes ], [
    '''
nltk.regexp_tokenize(text, pattern)  # tokenize text with the pattern; the result is shown as the cell output

['A',
 'plane',
 'ticket',
 'to',
 'Washington',
 'D.C.',
 'from',
 'S.L.C.',
 'costs',
 '$496.78',
 'round-trip',
 '...']

## Sentence Segmentation

In [None]:
# Load 'chesterton-thursday.txt' from NLTK's Gutenberg corpus as one raw string and print it in full.
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
print(text)

In [None]:
sents = nltk.sent_tokenize(text)  # split the raw text into a list of sentence strings
sents[5:10]  # show the sentences at indices 5 through 9

['Weak if we were and foolish, not thus we failed, not thus;\nWhen that black Baal blocked the heavens he had no hymns from us\nChildren we were--our forts of sand were even as weak as eve,\nHigh as they went we piled them up to break that bitter sea.',
 'Fools as we were in motley, all jangling and absurd,\nWhen all church bells were silent our cap and beds were heard.',
 'Not all unhelped we held the fort, our tiny flags unfurled;\nSome giants laboured in that cloud to lift it from the world.',
 'I find again the book we found, I feel the hour that flings\nFar out of fish-shaped Paumanok some cry of cleaner things;\nAnd the Green Carnation withered, as in forest fires that pass,\nRoared in the wind of all the world ten million leaves of grass;\nOr sane and sweet and sudden as a bird sings in the rain--\nTruth out of Tusitala spoke and pleasure out of pain.',
 'Yea, cool and clear and sudden as a bird sings in the grey,\nDunedin to Samoa spoke, and darkness unto day.']