# Chapter 06

## Handling text

In [None]:
import re

### 6.1 Cleaning text

In [None]:
# Sample titles padded with stray leading and trailing spaces.
text = [
    '    Interrobang. By Aishwarya Henriette     ',
    'Parking and Going. By Karl Gautier',
    '     Abscense of the forms of the night. By Jarek Wouldasky  ',
]

# Trim the surrounding whitespace from every entry.
strip_whitespace = list(map(str.strip, text))
strip_whitespace

In [None]:
# Delete every period from the already-trimmed strings.
remove_periods = [
    trimmed.replace('.', '')
    for trimmed in strip_whitespace
]
remove_periods

In [None]:
def capitalizer(string):
    """Return ``string`` converted to upper case."""
    uppercased = string.upper()
    return uppercased

# Upper-case each period-free string.
list(map(capitalizer, remove_periods))

In [None]:
def replace_letters_with_X(string):
    """Return ``string`` with every ASCII letter (a-z, A-Z) replaced by 'X'.

    Non-letter characters (digits, spaces, punctuation) are left as they are.
    """
    letter_pattern = re.compile('[a-zA-Z]')
    return letter_pattern.sub('X', string)

# Mask the letters of each period-free string.
list(map(replace_letters_with_X, remove_periods))

### 6.2 Parsing and cleaning HTML

In [None]:
from bs4 import BeautifulSoup

html = "<div class='full_name'><span style='font-weight:bold'>Masego</span> Azra</div>"

# Name the parser explicitly. Without it BeautifulSoup guesses based on what
# is installed and emits a GuessedAtParserWarning, so the parse could differ
# between environments. 'html.parser' ships with Python itself.
soup = BeautifulSoup(html, 'html.parser')

# Text content of the <div class="full_name"> element, nested tags stripped.
soup.find('div', {'class': 'full_name'}).text

### 6.3 Removing punctuation

In [None]:
import unicodedata
import sys

text = [
    'Hiiiiiii. This is a great song! Dont you think??',
    'It isssss! I fucking love it #Rad #Cool #StoryOfMyLife',
    'I knowwwwwwwwww #LanaDelRey'
]

# Translation table: the code point of every Unicode character whose general
# category starts with 'P' (punctuation) is mapped to None.
# range() excludes its stop value, so the stop is sys.maxunicode + 1 to make
# sure the very last code point is inspected as well.
punctuation = dict.fromkeys(
    i for i in range(sys.maxunicode + 1)
    if unicodedata.category(chr(i)).startswith('P')
)

# str.translate deletes every character that the table maps to None.
[string.translate(punctuation) for string in text]

### 6.4 Tokenizing text

In [None]:
from nltk.tokenize import word_tokenize

# Sentence to break into word tokens.
string = 'The science of today is the technology of tomorrow'

# preserve_line=True tokenizes the line as-is, without sentence splitting first.
word_tokenize(
    string,
    preserve_line=True,
)

In [None]:
from nltk.tokenize import sent_tokenize

# Two-sentence sample used to demonstrate sentence tokenization.
other_string = (
    'The science of today is the technology of tomorrow. Tomorrow is today.'
)
sent_tokenize(other_string)

### 6.5 Removing stop words

In [None]:
from nltk.corpus import stopwords

# Lower-cased tokens to filter (the stop-word comparison below is exact).
tokenized_words = [
    'i',
    'am',
    'going',
    'to',
    'go',
    'to',
    'the',
    'store',
    'and',
    'park',
]

# English stop words as provided by NLTK; kept as a list so existing code that
# indexes or slices `stop_words` keeps working.
stop_words = stopwords.words('english')

# Membership tests against a list scan it linearly for every token; a set
# gives constant-time lookups with the same filtering result.
stop_word_set = set(stop_words)

[word for word in tokenized_words if word not in stop_word_set]


### 6.6 Stemming words

In [None]:
from nltk.stem.porter import PorterStemmer

# Tokens whose stems we want to inspect.
tokenized_words = [
    'I',
    'am',
    'humbled',
    'by',
    'this',
    'traditional',
    'meeting',
]

porter = PorterStemmer()

# Stem each token with the Porter stemmer.
list(map(porter.stem, tokenized_words))

### 6.7 Tagging parts of speech

In [None]:
from nltk import pos_tag, word_tokenize

In [None]:
# Tokenize the sentence, then tag each token with its part of speech.
pos_tag(
    word_tokenize('Chris loved outdoor running', preserve_line=True),
    lang='eng',
)

### 6.8 Encoding text as a bag of words

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Three short documents to vectorize.
text = np.array(
    [
        'I love Brazil. Brazil!',
        'Sweden is the best place to live in',
        'Germany is also great',
    ]
)

# Learn the vocabulary and build the document-term count matrix in one step.
count = CountVectorizer()
bag_of_words = count.fit_transform(text)

# Convert to a dense array so the counts can be displayed.
bag_of_words.toarray()

In [None]:
# Show the feature names learned by the fitted CountVectorizer.
count.get_feature_names_out()

In [None]:
# Count unigrams and bigrams, ignore English stop words, and restrict the
# features to the single vocabulary term 'brazil'.
count_2gram = CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    vocabulary=['brazil'],
)

bag = count_2gram.fit_transform(text)

# Dense view of the resulting counts.
bag.toarray()

In [None]:
# Show the vocabulary mapping held by the fitted vectorizer.
count_2gram.vocabulary_

### 6.9 Weighting word importance

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Build a TF-IDF weighted feature matrix from the documents in `text`.
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(
    text
)
feature_matrix

In [None]:
# Convert the TF-IDF matrix to a dense array for display.
feature_matrix.toarray()

In [None]:
# Show the vocabulary mapping held by the fitted TF-IDF vectorizer.
tfidf.vocabulary_