# Working with Text

In [None]:
%matplotlib inline

Libraries for numerics

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats

Libraries for plotting

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

Libraries for string manipulation

In [None]:
import string
import re

Libraries for functional programming

In [None]:
from functools import reduce, partial
import itertools as it
import operator as op
import toolz as tz
import toolz.curried as c

## String methods

In [None]:
s = "  Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37\n"

### Removing leading and trailing whitespace

In [None]:
s.strip()

In [None]:
s.lstrip()

In [None]:
s.rstrip()

In [None]:
s = s.strip()

### Changing case

In [None]:
s.lower()

In [None]:
s.upper()

In [None]:
s.title()

### Checking conditions

In [None]:
s.startswith('Avoid')

In [None]:
s.endswith('37')

In [None]:
s.isalpha()

In [None]:
s.isnumeric()

In [None]:
s.isspace()

In [None]:
s.isprintable()

### Counting and indexing

In [None]:
s.count('a')

In [None]:
s.count('gambles')

In [None]:
s.find('gambles')

In [None]:
s[27:]

In [None]:
s.find('foobar')

In [None]:
s.index('gambles')

In [None]:
try:
    s.index('foobar')
except ValueError as e:
    print(e)

### Splitting and joining

In [None]:
s.split()

In [None]:
s.split(':')

In [None]:
'-'.join(s.split())

### Replacing

In [None]:
s.replace('gambles', 'risk')

### Translating

In [None]:
table = str.maketrans(string.ascii_lowercase, string.ascii_uppercase, string.punctuation)
s.translate(table)

In [None]:
table = str.maketrans('', '', string.punctuation)
s.translate(table)

**Exercise: Caesar Cipher**

A Caesar cipher with offset $k$ converts a letter into the letter $k$ positions further along the alphabet, looping around if this goes past `z`. Non-letter characters (numbers, spaces, punctuation) are left intact. For instance, with offset=3, `abcXYZ` is coded as `defABC`. Write a function `encode(k, s)` where `k` is the offset and `s` is the string to be coded. Write a `decode(k, s)` function that decodes the encoded text. Test it out on the fortune.

In [None]:
def encode(k, s):
    """Encode `s` with a Caesar cipher of offset `k`.

    Each ASCII letter is replaced by the letter `k` positions further along
    the alphabet, wrapping around past `z`/`Z` and preserving case. Every
    other character (digits, spaces, punctuation, ...) is left unchanged.

    Parameters
    ----------
    k : int
        Offset. May be negative or larger than the alphabet; it is reduced
        modulo 26 before use.
    s : str
        Text to encode.

    Returns
    -------
    str
        The encoded text, the same length as `s`.
    """
    lower = string.ascii_lowercase
    upper = string.ascii_uppercase
    # Reduce the offset first: slicing with |k| > 26 would silently produce
    # the unrotated alphabet instead of wrapping around.
    k %= len(lower)
    table = str.maketrans(
        lower + upper,
        lower[k:] + lower[:k] + upper[k:] + upper[:k])
    return s.translate(table)

In [None]:
encode(3, 'abcXYZ')

In [None]:
def decode(k, s):
    """Return `s` passed through `encode` with the negated offset `-k`.

    Shifting by `-k` reverses a shift of `k`, so this is intended to recover
    text that was produced by `encode(k, ...)`.
    """
    inverse_offset = -k
    return encode(inverse_offset, s)

In [None]:
code = encode(3, s)

In [None]:
code

In [None]:
decode(3, code)

## Counting words

To count words, we typically do the following preprocessing:
    
- Convert to lower (or upper) case
- Remove punctuation
- Split on blank space
- Count each word in list

In [None]:
s

### Preprocessing

In [None]:
words = s.lower().translate(str.maketrans('','',string.punctuation)).split()

### Using a Counter (bag)

In [None]:
from collections import Counter

In [None]:
Counter(words)

### Using a dictionary

In [None]:
# Tally how many times each word occurs, using only a plain dict.
counter = {}
for word in words:
    if word in counter:
        counter[word] += 1
    else:
        # First sighting of this word.
        counter[word] = 1

In [None]:
counter

### Using a `defaultdict`

In [None]:
from collections import defaultdict

In [None]:
d = defaultdict(int)

In [None]:
for word in words:
    d[word] += 1

In [None]:
d

### Using a functional pipe

In [None]:
tz.pipe(
    s,
    lambda s: s.lower(),
    lambda s: s.translate(str.maketrans('', '', string.punctuation)),
    lambda s: s.split(),
    tz.frequencies
)

### Modification for collection of strings

In [None]:
ss = [s, s, s]

In [None]:
ss

In [None]:
tz.pipe(
    ss,
    c.map(lambda s: s.lower()),
    c.map(lambda s: s.translate(str.maketrans('', '', string.punctuation))),
    c.mapcat(lambda s: s.split()),
    tz.frequencies
)

## String to vector

To analyze text, we typically need to convert it to a vector format. There are several ways to do so. Here we show the most obvious method, known as one-hot encoding.

### One-hot character encoding

We first encode the string 'abcabc' as the vector [0,1,2,0,1,2]. For one-hot encoding, we next convert this to the one-hot encoded matrix

```python
array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]])
```

In [None]:
# Map each distinct character of `s` to an integer code, numbering the
# characters in order of first appearance (0, 1, 2, ...).
idx = 0
index = {}
for ch in s:
    if ch not in index:
        index[ch] = idx
        idx += 1

In [None]:
index

#### Categorical encoding

In [None]:
nchars = len(index)

In [None]:
vs = np.array([index[ch] for ch in s])

In [None]:
vs

#### One-hot encoding

In [None]:
# One-hot matrix: one row per element of `vs`, one column per entry of
# `index`; all zeros except a single 1 per row.
n, p = len(vs), len(index)
i = np.arange(n)
m = np.zeros((n, p), dtype='int')
m[i, vs] = 1  # row r gets its 1 in column vs[r]
m

#### Reverse index lookup

In [None]:
reverse_index = dict(zip(index.values(), index.keys()))

In [None]:
''.join(reverse_index[v] for v in vs)

### One-hot encoding for words

In [None]:
words = ' '.join([s,s]).lower().translate(str.maketrans('', '', string.punctuation)).split()

In [None]:
# Give every distinct word an integer code, numbered in order of first
# appearance in `words`.
index = {}
pos = 0
for word in words:
    if word in index:
        continue  # already has a code
    index[word] = pos
    pos += 1

#### Categorical encoding

In [None]:
ws = np.array([index[word] for word in words])

In [None]:
ws

#### One-hot encoding

In [None]:
# One-hot matrix: one row per element of `ws`, one column per entry of
# `index`; all zeros except a single 1 per row.
n, p = len(ws), len(index)
i = np.arange(n)
m = np.zeros((n, p), dtype='int')
m[i, ws] = 1  # row r gets its 1 in column ws[r]
m

#### Reverse lookup

In [None]:
reverse_index = dict(zip(index.values(), index.keys()))

In [None]:
' '.join(reverse_index[w] for w in ws)

## Regular expressions

In [None]:
s

### Literal match

In [None]:
re.findall(r'gambles', s)

### Wildcard `.` and quantifiers `{m,n}`, `+`, `*`

In [None]:
re.findall(r'gam.les', s)

In [None]:
re.findall(r'g.*s', s)

### Non-greedy quantifier.

In [None]:
re.findall(r'g.*?s', s)

### Special characters

In [None]:
re.findall(r'\bg.*?s\b', s)

In [None]:
re.findall(r'\b\w+?\b', s)

In [None]:
re.findall(r'\b\d+?\b', s)

In [None]:
re.findall(r'\b[a-zA-Z]+?\b', s)

### Begin and end anchors

In [None]:
re.findall(r'\w+', s)

In [None]:
re.findall(r'^\w+', s)

In [None]:
re.findall(r'\w+$', s)

### Capture groups

In [None]:
pat = r'\b(\d)(\d)?\b'

In [None]:
re.findall(pat, s)

### Using search and match objects

In [None]:
re.search(pat, s)

In [None]:
m = re.search(pat, s)

In [None]:
m.string

In [None]:
m.group()

In [None]:
m.groups()

### Replacement using capture groups

In [None]:
rep = r'\2\1'
re.sub(pat, rep, s)

### Using compiled patterns

In [None]:
pat = re.compile(r'\b[a-zA-Z]+?\b')
pat.findall(s)