# Chapter 06

## Handling text

In [10]:
import re

### 6.1 Cleaning text

In [2]:
# Sample titles padded with stray leading/trailing spaces.
text = [
    '    Interrobang. By Aishwarya Henriette     ',
    'Parking and Going. By Karl Gautier',
    '     Abscense of the forms of the night. By Jarek Wouldasky  '
]

# str.strip trims whitespace from both ends of each title.
strip_whitespace = list(map(str.strip, text))
strip_whitespace

['Interrobang. By Aishwarya Henriette',
 'Parking and Going. By Karl Gautier',
 'Abscense of the forms of the night. By Jarek Wouldasky']

In [3]:
# Delete every '.' from the whitespace-stripped titles.
remove_periods = [title.replace('.', '') for title in strip_whitespace]
remove_periods

['Interrobang By Aishwarya Henriette',
 'Parking and Going By Karl Gautier',
 'Abscense of the forms of the night By Jarek Wouldasky']

In [6]:
def capitalizer(string: str) -> str:
    """Return ``string`` converted to upper case."""
    uppercased = string.upper()
    return uppercased

# Upper-case each period-free title.
list(map(capitalizer, remove_periods))

['INTERROBANG BY AISHWARYA HENRIETTE',
 'PARKING AND GOING BY KARL GAUTIER',
 'ABSCENSE OF THE FORMS OF THE NIGHT BY JAREK WOULDASKY']

In [14]:
def replace_letters_with_X(string: str) -> str:
    """Replace each ASCII letter (a-z, A-Z) in ``string`` with ``X``.

    Every other character (digits, spaces, punctuation, non-ASCII
    letters) is left as it is.
    """
    ascii_letter = re.compile('[a-zA-Z]')
    return ascii_letter.sub('X', string)

# Mask the letters of each period-free title.
list(map(replace_letters_with_X, remove_periods))

['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
 'XXXXXXXX XX XXX XXXXX XX XXX XXXXX XX XXXXX XXXXXXXXX']

### 6.2 Parsing and cleaning HTML

In [9]:
from bs4 import BeautifulSoup

html = "<div class='full_name'><span style='font-weight:bold'>Masego</span> Azra</div>"

# Name the parser explicitly. With no second argument Beautiful Soup picks
# whichever parser happens to be installed and emits GuessedAtParserWarning,
# so behaviour can differ between environments. 'html.parser' ships with
# Python and needs no extra dependency.
soup = BeautifulSoup(html, 'html.parser')

# Find the <div class="full_name"> element and read its text content.
soup.find('div', {'class': 'full_name'}).text

'Masego Azra'

### 6.3 Removing punctuation

In [13]:
import unicodedata
import sys

text = [
    'Hiiiiiii. This is a great song! Dont you think??',
    'It isssss! I fucking love it #Rad #Cool #StoryOfMyLife',
    'I knowwwwwwwwww #LanaDelRey'
]

# Translation table for str.translate: each punctuation code point maps to
# None, which tells translate() to delete that character. A character counts
# as punctuation when its Unicode general category starts with 'P'.
# The range stops at sys.maxunicode + 1 because range() excludes its stop
# value; without the + 1 the highest code point would never be examined.
punctuation = dict.fromkeys(
    code_point for code_point in range(sys.maxunicode + 1)
    if unicodedata.category(chr(code_point)).startswith('P')
)

# Strip punctuation from every string.
[string.translate(punctuation) for string in text]

['Hiiiiiii This is a great song Dont you think',
 'It isssss I fucking love it Rad Cool StoryOfMyLife',
 'I knowwwwwwwwww LanaDelRey']

### 6.4 Tokenizing text

In [21]:
from nltk.tokenize import word_tokenize

# Split the sentence into word tokens.
string = 'The science of today is the technology of tomorrow'

# preserve_line=True asks NLTK to treat the input as a single line rather
# than sentence-tokenizing it first.
word_tokenize(string, preserve_line=True)

['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow']

In [None]:
from nltk.tokenize import sent_tokenize

# Two sentences, to be split on sentence boundaries.
other_string = 'The science of today is the technology of tomorrow. Tomorrow is today.'

# NOTE(review): this cell has no recorded output; sent_tokenize presumably
# needs the NLTK Punkt tokenizer data to be downloaded first. Confirm.
sent_tokenize(other_string)

### 6.5 Removing stop words

In [26]:
from nltk.corpus import stopwords

tokenized_words = [
    'i',
    'am',
    'going',
    'to',
    'go',
    'to',
    'the',
    'store',
    'and',
    'park'
]

# NLTK's English stop-word list.
stop_words = stopwords.words('english')

# Testing membership against a list scans it for every token; a set gives
# constant-time lookups. stop_words itself is left untouched so any code
# that indexes or slices it keeps working.
stop_word_lookup = set(stop_words)

# Keep only the tokens that are not stop words, preserving their order.
[word for word in tokenized_words if word not in stop_word_lookup]


['going', 'go', 'store', 'park']

### 6.6 Stemming words

In [27]:
from nltk.stem.porter import PorterStemmer

tokenized_words = ['I', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']

# Reduce each token to its stem with the Porter stemmer.
porter = PorterStemmer()
list(map(porter.stem, tokenized_words))

['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

### 6.7 