# Text Preprocessing Assignments

## 0. Create a New Environment

In [54]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
df = pd.read_csv('../Data/childrens_books.csv')
df.head()
import spacy

## 1. Text Preprocessing with Pandas

1. Read the _childrens_books.csv_ file into a Jupyter Notebook
2. Within the Description column:
* Make all the text lowercase
* Remove all \xa0 characters
* Remove all punctuation

In [55]:
def replace_lower(series):
    output = series.str.lower()
    output = output.str.replace(r'\xa0', '', regex=True)
    output = output.str.replace(r'[^\w\s]','', regex=True)
    return output

In [56]:
df_data = df.copy()
df_data.head(1)

Unnamed: 0,Ranking,Title,Author,Year,Rating,Description
0,1,Where the Wild Things Are,Maurice Sendak,1963,4.25,"Where the Wild Things Are follows Max, a young boy who, after being sent to his room for misbehaving, imagines sailing to an island filled with wild creatures. As their king, Max tames the beasts and eventually returns home to find his supper waiting for him. This iconic book explores themes of imagination, adventure, and the complex emotions of childhood, all captured through Sendak's whimsical illustrations and story."


In [57]:
df_data.Description = replace_lower(df_data.Description)

## 2. Text Preprocessing with spaCy

In addition to the lowercasing and special character removal from the previous assignment, within the cleaned Description column:
* Tokenize the text
* Lemmatize the text
* Remove stop words

In [58]:
nlp = spacy.load('en_core_web_sm')

In [59]:
phrase = 'im selling lemons for $5 today'
doc = nlp(phrase)
doc

im selling lemons for $5 today

In [60]:
[token.text for token in doc]

['i', 'm', 'selling', 'lemons', 'for', '$', '5', 'today']

In [61]:
# normalize the languages to base form 
[token.lemma_ for token in doc]

['I', 'm', 'sell', 'lemon', 'for', '$', '5', 'today']

In [62]:
# stop words -> words without any significant meaning 

norm = [token.lemma_ for token in doc if not token.is_stop]
norm

['m', 'sell', 'lemon', '$', '5', 'today']

In [63]:
#speech tagging 

pos = [(token.text, token.pos_) for token in doc]
pos

[('i', 'PRON'),
 ('m', 'AUX'),
 ('selling', 'VERB'),
 ('lemons', 'NOUN'),
 ('for', 'ADP'),
 ('$', 'SYM'),
 ('5', 'NUM'),
 ('today', 'NOUN')]

In [64]:
phrase = df_data.Description[0]
phrase

'where the wild things arefollows max a young boy who after being sent to his room for misbehaving imagines sailing to an island filled with wild creatures as their king max tames the beasts and eventually returns home to find his supper waiting for him this iconic book explores themes of imagination adventure and the complex emotions of childhood all captured through sendaks whimsical illustrations and story'

In [65]:
doc = nlp(phrase)

In [66]:
# tokenize

[token for token in doc]

[where,
 the,
 wild,
 things,
 arefollows,
 max,
 a,
 young,
 boy,
 who,
 after,
 being,
 sent,
 to,
 his,
 room,
 for,
 misbehaving,
 imagines,
 sailing,
 to,
 an,
 island,
 filled,
 with,
 wild,
 creatures,
 as,
 their,
 king,
 max,
 tames,
 the,
 beasts,
 and,
 eventually,
 returns,
 home,
 to,
 find,
 his,
 supper,
 waiting,
 for,
 him,
 this,
 iconic,
 book,
 explores,
 themes,
 of,
 imagination,
 adventure,
 and,
 the,
 complex,
 emotions,
 of,
 childhood,
 all,
 captured,
 through,
 sendaks,
 whimsical,
 illustrations,
 and,
 story]

In [67]:
[token.text for token in doc]

['where',
 'the',
 'wild',
 'things',
 'arefollows',
 'max',
 'a',
 'young',
 'boy',
 'who',
 'after',
 'being',
 'sent',
 'to',
 'his',
 'room',
 'for',
 'misbehaving',
 'imagines',
 'sailing',
 'to',
 'an',
 'island',
 'filled',
 'with',
 'wild',
 'creatures',
 'as',
 'their',
 'king',
 'max',
 'tames',
 'the',
 'beasts',
 'and',
 'eventually',
 'returns',
 'home',
 'to',
 'find',
 'his',
 'supper',
 'waiting',
 'for',
 'him',
 'this',
 'iconic',
 'book',
 'explores',
 'themes',
 'of',
 'imagination',
 'adventure',
 'and',
 'the',
 'complex',
 'emotions',
 'of',
 'childhood',
 'all',
 'captured',
 'through',
 'sendaks',
 'whimsical',
 'illustrations',
 'and',
 'story']

In [68]:
[token.lemma_ for token in doc]

['where',
 'the',
 'wild',
 'thing',
 'arefollow',
 'max',
 'a',
 'young',
 'boy',
 'who',
 'after',
 'be',
 'send',
 'to',
 'his',
 'room',
 'for',
 'misbehave',
 'imago',
 'sail',
 'to',
 'an',
 'island',
 'fill',
 'with',
 'wild',
 'creature',
 'as',
 'their',
 'king',
 'max',
 'tame',
 'the',
 'beast',
 'and',
 'eventually',
 'return',
 'home',
 'to',
 'find',
 'his',
 'supper',
 'wait',
 'for',
 'he',
 'this',
 'iconic',
 'book',
 'explore',
 'theme',
 'of',
 'imagination',
 'adventure',
 'and',
 'the',
 'complex',
 'emotion',
 'of',
 'childhood',
 'all',
 'capture',
 'through',
 'sendak',
 'whimsical',
 'illustration',
 'and',
 'story']

In [69]:
list(nlp.Defaults.stop_words)[:10]

['twenty',
 'to',
 'becoming',
 'mostly',
 'within',
 'few',
 'any',
 'both',
 'be',
 'besides']

In [70]:
norm = [token.lemma_ for token in doc if not token.is_stop ]

In [71]:
' '.join(norm)

'wild thing arefollow max young boy send room misbehave imago sail island fill wild creature king max tame beast eventually return home find supper wait iconic book explore theme imagination adventure complex emotion childhood capture sendak whimsical illustration story'

In [72]:
def token_lemma_nonstop(text):
    doc = nlp(text)
    norm = [token.lemma_ for token in doc if not token.is_stop]
    output = ' '.join(norm)
    return output

In [73]:
# use apply to apply to the whole data frame
replace_lower(df_data.Description).apply(token_lemma_nonstop)

0                                   wild thing arefollow max young boy send room misbehave imago sail island fill wild creature king max tame beast eventually return home find supper wait iconic book explore theme imagination adventure complex emotion childhood capture sendak whimsical illustration story
1                     hungry caterpillartell story caterpillar eat variety food eventually butterfly eric carles use colorful collage illustration rhythmic text book beloved classic young reader simple engaging story introduce child day week counting concept metamorphosis staple early childhood education
2                                                   give treeis touching bittersweet story tree give boy course life boy grow take tree tree continue little left silverstein minimalist text illustration convey deep theme unconditional love selflessness passage time spark discussion relationship sacrifice
3             ingreen eggs ham samiam try convince reluctant character try dish gr

In [74]:
# speech tagging 
phrase2 = replace_lower(df_data.Description).apply(token_lemma_nonstop)[0]

In [75]:
doc2 = nlp(phrase2)
doc2

wild thing arefollow max young boy send room misbehave imago sail island fill wild creature king max tame beast eventually return home find supper wait iconic book explore theme imagination adventure complex emotion childhood capture sendak whimsical illustration story

In [76]:
nouns = [token.text for token in doc2 if token.pos_ in ['NOUN', 'PROPN']]
' '.join(nouns)

'thing arefollow max boy room misbehave imago sail island fill creature king max tame beast home find supper book theme imagination adventure emotion childhood sendak illustration story'

In [77]:
def filter_pos(text, pos_list=['NOUN', 'PROPN']):
    doc = nlp(text)
    output = [token.text for token in doc if token.pos_ in pos_list]
    output = ' '.join(output)
    return output

In [78]:
replace_lower(df_data.Description).apply(token_lemma_nonstop).apply(filter_pos)

0                          thing arefollow max boy room misbehave imago sail island fill creature king max tame beast home find supper book theme imagination adventure emotion childhood sendak illustration story
1                              caterpillartell story caterpillar eat variety food eric carles collage illustration text book reader story introduce child day week concept metamorphosis staple childhood education
2                                                               story tree boy course life boy tree tree silverstein text illustration theme love selflessness passage time spark discussion relationship sacrifice
3                                          ingreen eggs ham samiam convince character dish egg ham resistance repetition rhyme dr seusss story experience child thing comfort zone illustration dialogue fun reader
4                                      bedtime story little bunny goodnight room moon lady whispering structure comfort tone ideal child illustration cl

## NLP PIPELINE


In [79]:
def replace_lower(series):
    output = series.str.lower()
    output = output.str.replace(r'\xa0', '', regex=True)
    output = output.str.replace(r'[^\w\s]','', regex=True)
    return output

def token_lemma_nonstop(text):
    doc = nlp(text)
    norm = [token.lemma_ for token in doc if not token.is_stop]
    output = ' '.join(norm)
    return output

def filter_pos(text, pos_list=['NOUN', 'PROPN']):
    doc = nlp(text)
    output = [token.text for token in doc if token.pos_ in pos_list]
    output = ' '.join(output)
    return output

In [None]:
def nlp_pipeline(series):
    output = replace_lower(series)
    output = output.apply(token_lemma_nonstop)
    output = output.apply(filter_pos)
    return output

## 3. Count Vectorizer

1. Vectorize the cleaned and normalized text using Count Vectorizer with the default parameters
2. Modify the Count Vectorizer parameters to reduce the number of columns:
* Remove stop words
* Set a minimum document frequency of 10%
3. Use the updated Count Vectorizer to identify the:
* Top 10 most common terms
* Top 10 least common terms that appear in at least 10% of the documents
4. Create a horizontal bar chart of the top 10 most common terms

## 4. TF-IDF Vectorizer

1. Vectorize the cleaned and normalized text using TF-IDF Vectorizer with the default parameters
2. Modify the TF-IDF Vectorizer parameters to reduce the number of columns:
* Remove stop words
* Set a minimum document frequency of 10%
* Set a maximum document frequency of 50%
3. Using the updated TF-IDF Vectorizer, create a  horizontal bar chart of the top 10 most highly weighted terms
4. Compare the Count Vectorizer bar chart from the previous assignment with the TF-IDF Vectorizer bar chart and note the differences in the top term lists