# Create Data Sets

In [84]:
import pandas as pd

In [85]:
data = [
    "When life gives you lemons, make lemonade! 🙂",
    "She bought 2 lemons for $1 at Maven Market.",
    "A dozen lemons will make a gallon of lemonade. [AllRecipes]",
    "lemon, lemon, lemons, lemon, lemon, lemons",
    "He's running to the market to get a lemon — there's a great sale today.",
    "Does Maven Market carry Eureka lemons or Meyer lemons?",
    "An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",
    "iced tea is my favorite"
]

In [86]:
pd.set_option("display.max_colwidth", None)

In [87]:
dataDf = pd.DataFrame(data, columns=["sentence"])
dataDf

Unnamed: 0,sentence
0,"When life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]"
7,iced tea is my favorite


Create a test series of sentences for later on

In [88]:
test = [
    "We're going to start this course with traditional NLP applications.",
    "Then we'll move on to modern NLP theory.",
    "Finally, we'll wrap things up with modern NLP applications."
]
testSeries = pd.Series(test)
testSeries

0    We're going to start this course with traditional NLP applications.
1                               Then we'll move on to modern NLP theory.
2            Finally, we'll wrap things up with modern NLP applications.
dtype: object

# Pandas Text Preprocessing

In [89]:
df = dataDf.copy()
df

Unnamed: 0,sentence
0,"When life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]"
7,iced tea is my favorite


## Make lowercase

In [90]:
df["sentenceClean"] = df["sentence"].str.lower()
df

Unnamed: 0,sentence,sentenceClean
0,"When life gives you lemons, make lemonade! 🙂","when life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for $1 at maven market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade. [allrecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons","lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.,he's running to the market to get a lemon — there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]","an arnold palmer is half lemonade, half iced tea. [wikipedia]"
7,iced tea is my favorite,iced tea is my favorite


## Remove [ text ]

In [91]:
df["sentenceClean"] = df["sentenceClean"].str.replace(r"\[.*?\]", "", regex=True)
df

Unnamed: 0,sentence,sentenceClean
0,"When life gives you lemons, make lemonade! 🙂","when life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for $1 at maven market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade.
3,"lemon, lemon, lemons, lemon, lemon, lemons","lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — there's a great sale today.,he's running to the market to get a lemon — there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]","an arnold palmer is half lemonade, half iced tea."
7,iced tea is my favorite,iced tea is my favorite


## Remove punctuation

In [92]:
df["sentenceClean"] = df["sentenceClean"].str.replace(r"[^\w\s]", "", regex=True)
df

Unnamed: 0,sentence,sentenceClean
0,"When life gives you lemons, make lemonade! 🙂",when life gives you lemons make lemonade
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for 1 at maven market
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade
3,"lemon, lemon, lemons, lemon, lemon, lemons",lemon lemon lemons lemon lemon lemons
4,He's running to the market to get a lemon — there's a great sale today.,hes running to the market to get a lemon theres a great sale today
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",an arnold palmer is half lemonade half iced tea
7,iced tea is my favorite,iced tea is my favorite


## Function wrap

In [93]:
def lower_replace(series):
    output = series.str.lower()
    output = output.str.replace(r"\[.*?\]", "", regex=True)
    output = output.str.replace(r"[^\w\s]", "", regex=True)
    return output

In [94]:
lower_replace(testSeries)

0    were going to start this course with traditional nlp applications
1                               then well move on to modern nlp theory
2             finally well wrap things up with modern nlp applications
dtype: object

In [95]:
lower_replace(df.sentence)

0                              when life gives you lemons make lemonade 
1                              she bought 2 lemons for 1 at maven market
2                         a dozen lemons will make a gallon of lemonade 
3                                  lemon lemon lemons lemon lemon lemons
4    hes running to the market to get a lemon  theres a great sale today
5                  does maven market carry eureka lemons or meyer lemons
6                       an arnold palmer is half lemonade half iced tea 
7                                                iced tea is my favorite
Name: sentence, dtype: object

## Assignment

1. Read the _childrens_books.csv_ file into a Jupyter Notebook
2. Within the Description column:
* Make all the text lowercase
* Remove all \xa0 (*non-breaking space*) characters
* Remove all punctuation

In [96]:
booksDf = pd.read_csv("../Materials/Data/childrens_books.csv")
booksDf.head()

Unnamed: 0,Ranking,Title,Author,Year,Rating,Description
0,1,Where the Wild Things Are,Maurice Sendak,1963,4.25,"Where the Wild Things Are follows Max, a young boy who, after being sent to his room for misbehaving, imagines sailing to an island filled with wild creatures. As their king, Max tames the beasts and eventually returns home to find his supper waiting for him. This iconic book explores themes of imagination, adventure, and the complex emotions of childhood, all captured through Sendak's whimsical illustrations and story."
1,2,The Very Hungry Caterpillar,Eric Carle,1969,4.34,"The Very Hungry Caterpillar tells the story of a caterpillar who eats through a variety of foods before eventually becoming a butterfly. Eric Carle’s use of colorful collage illustrations and rhythmic text has made this book a beloved classic for young readers. The simple, engaging story introduces children to days of the week, counting, and the concept of metamorphosis. It’s a staple in early childhood education."
2,3,The Giving Tree,Shel Silverstein,1964,4.38,"The Giving Tree is a touching and bittersweet story about a tree that gives everything it has to a boy over the course of his life. As the boy grows up, he takes more from the tree, and the tree continues to give, even when it has little left. Silverstein’s minimalist text and illustrations convey deep themes of unconditional love, selflessness, and the passage of time. It has sparked much discussion about relationships and sacrifice."
3,4,Green Eggs and Ham,Dr. Seuss,1960,4.31,"In Green Eggs and Ham, Sam-I-Am tries to convince a reluctant character to try a dish of green eggs and ham, despite his resistance. Through repetition and rhyme, Dr. Seuss’s classic story about being open to new experiences encourages children to be adventurous and try things outside their comfort zone. The playful illustrations and humorous dialogue make it a fun and educational read for young readers."
4,5,Goodnight Moon,Margaret Wise Brown,1947,4.31,"Goodnight Moon is a gentle, rhythmic bedtime story where a little bunny says goodnight to everything in his room, from the moon to the ""quiet old lady whispering hush."" Its repetitive structure and comforting tone make it ideal for young children. The simple illustrations by Clement Hurd complement the soothing nature of the story, making it a beloved classic for sleep-time reading."


In [97]:
booksDf["DescriptionClean"] = booksDf["Description"].str.lower()
booksDf

Unnamed: 0,Ranking,Title,Author,Year,Rating,Description,DescriptionClean
0,1,Where the Wild Things Are,Maurice Sendak,1963,4.25,"Where the Wild Things Are follows Max, a young boy who, after being sent to his room for misbehaving, imagines sailing to an island filled with wild creatures. As their king, Max tames the beasts and eventually returns home to find his supper waiting for him. This iconic book explores themes of imagination, adventure, and the complex emotions of childhood, all captured through Sendak's whimsical illustrations and story.","where the wild things are follows max, a young boy who, after being sent to his room for misbehaving, imagines sailing to an island filled with wild creatures. as their king, max tames the beasts and eventually returns home to find his supper waiting for him. this iconic book explores themes of imagination, adventure, and the complex emotions of childhood, all captured through sendak's whimsical illustrations and story."
1,2,The Very Hungry Caterpillar,Eric Carle,1969,4.34,"The Very Hungry Caterpillar tells the story of a caterpillar who eats through a variety of foods before eventually becoming a butterfly. Eric Carle’s use of colorful collage illustrations and rhythmic text has made this book a beloved classic for young readers. The simple, engaging story introduces children to days of the week, counting, and the concept of metamorphosis. It’s a staple in early childhood education.","the very hungry caterpillar tells the story of a caterpillar who eats through a variety of foods before eventually becoming a butterfly. eric carle’s use of colorful collage illustrations and rhythmic text has made this book a beloved classic for young readers. the simple, engaging story introduces children to days of the week, counting, and the concept of metamorphosis. it’s a staple in early childhood education."
2,3,The Giving Tree,Shel Silverstein,1964,4.38,"The Giving Tree is a touching and bittersweet story about a tree that gives everything it has to a boy over the course of his life. As the boy grows up, he takes more from the tree, and the tree continues to give, even when it has little left. Silverstein’s minimalist text and illustrations convey deep themes of unconditional love, selflessness, and the passage of time. It has sparked much discussion about relationships and sacrifice.","the giving tree is a touching and bittersweet story about a tree that gives everything it has to a boy over the course of his life. as the boy grows up, he takes more from the tree, and the tree continues to give, even when it has little left. silverstein’s minimalist text and illustrations convey deep themes of unconditional love, selflessness, and the passage of time. it has sparked much discussion about relationships and sacrifice."
3,4,Green Eggs and Ham,Dr. Seuss,1960,4.31,"In Green Eggs and Ham, Sam-I-Am tries to convince a reluctant character to try a dish of green eggs and ham, despite his resistance. Through repetition and rhyme, Dr. Seuss’s classic story about being open to new experiences encourages children to be adventurous and try things outside their comfort zone. The playful illustrations and humorous dialogue make it a fun and educational read for young readers.","in green eggs and ham, sam-i-am tries to convince a reluctant character to try a dish of green eggs and ham, despite his resistance. through repetition and rhyme, dr. seuss’s classic story about being open to new experiences encourages children to be adventurous and try things outside their comfort zone. the playful illustrations and humorous dialogue make it a fun and educational read for young readers."
4,5,Goodnight Moon,Margaret Wise Brown,1947,4.31,"Goodnight Moon is a gentle, rhythmic bedtime story where a little bunny says goodnight to everything in his room, from the moon to the ""quiet old lady whispering hush."" Its repetitive structure and comforting tone make it ideal for young children. The simple illustrations by Clement Hurd complement the soothing nature of the story, making it a beloved classic for sleep-time reading.","goodnight moon is a gentle, rhythmic bedtime story where a little bunny says goodnight to everything in his room, from the moon to the ""quiet old lady whispering hush."" its repetitive structure and comforting tone make it ideal for young children. the simple illustrations by clement hurd complement the soothing nature of the story, making it a beloved classic for sleep-time reading."
...,...,...,...,...,...,...,...
95,96,Stone Soup,Jon J. Muth,2003,4.18,"Stone Soup is a classic folktale retold by Jon J. Muth, in which three soldiers arrive in a village, claiming they can make soup from stones. The villagers, initially hesitant, eventually contribute ingredients to the pot, and together they create a delicious meal. This story is a heartwarming lesson about sharing, community, and cooperation, enhanced by Muth’s beautiful watercolor illustrations.","stone soup is a classic folktale retold by jon j. muth, in which three soldiers arrive in a village, claiming they can make soup from stones. the villagers, initially hesitant, eventually contribute ingredients to the pot, and together they create a delicious meal. this story is a heartwarming lesson about sharing, community, and cooperation, enhanced by muth’s beautiful watercolor illustrations."
96,97,A Light in the Attic,Shel Silverstein,1981,4.36,"A Light in the Attic is a collection of quirky poems and illustrations by Shel Silverstein, exploring a wide range of topics with wit, humor, and imagination. The poems are funny, thoughtful, and sometimes absurd, but they all carry a sense of wonder and creativity. Silverstein’s distinctive voice and whimsical illustrations make this book a timeless collection that continues to entertain children and adults alike.","a light in the attic is a collection of quirky poems and illustrations by shel silverstein, exploring a wide range of topics with wit, humor, and imagination. the poems are funny, thoughtful, and sometimes absurd, but they all carry a sense of wonder and creativity. silverstein’s distinctive voice and whimsical illustrations make this book a timeless collection that continues to entertain children and adults alike."
97,98,"Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)",J.K. Rowling,1999,4.58,"Harry Potter and the Prisoner of Azkaban is the third book in the Harry Potter series, where Harry returns to Hogwarts for his third year and uncovers secrets about his past. With the arrival of the mysterious Sirius Black, Harry must navigate dark truths and face his fears. This thrilling installment explores themes of loyalty, friendship, and identity, marking a turning point in the magical world of Harry Potter.","harry potter and the prisoner of azkaban is the third book in the harry potter series, where harry returns to hogwarts for his third year and uncovers secrets about his past. with the arrival of the mysterious sirius black, harry must navigate dark truths and face his fears. this thrilling installment explores themes of loyalty, friendship, and identity, marking a turning point in the magical world of harry potter."
98,99,"Harry Potter and the Chamber of Secrets (Harry Potter, #2)",J.K. Rowling,1998,4.43,"Harry Potter and the Chamber of Secrets is the second book in the Harry Potter series, where Harry returns to Hogwarts for his second year and uncovers a hidden chamber within the school. As mysterious events unfold, Harry and his friends Ron and Hermione uncover dark secrets about the school’s past. Themes of courage, friendship, and standing up for what’s right are explored in this gripping magical adventure.","harry potter and the chamber of secrets is the second book in the harry potter series, where harry returns to hogwarts for his second year and uncovers a hidden chamber within the school. as mysterious events unfold, harry and his friends ron and hermione uncover dark secrets about the school’s past. themes of courage, friendship, and standing up for what’s right are explored in this gripping magical adventure."


In [98]:
booksDf["Description"].iloc[0]

"Where the Wild Things Are\xa0follows Max, a young boy who, after being sent to his room for misbehaving, imagines sailing to an island filled with wild creatures. As their king, Max tames the beasts and eventually returns home to find his supper waiting for him. This iconic book explores themes of imagination, adventure, and the complex emotions of childhood, all captured through Sendak's whimsical illustrations and story."

In [99]:
booksDf["DescriptionClean"] = booksDf["DescriptionClean"].str.replace("\xa0", " ")
booksDf.head()

Unnamed: 0,Ranking,Title,Author,Year,Rating,Description,DescriptionClean
0,1,Where the Wild Things Are,Maurice Sendak,1963,4.25,"Where the Wild Things Are follows Max, a young boy who, after being sent to his room for misbehaving, imagines sailing to an island filled with wild creatures. As their king, Max tames the beasts and eventually returns home to find his supper waiting for him. This iconic book explores themes of imagination, adventure, and the complex emotions of childhood, all captured through Sendak's whimsical illustrations and story.","where the wild things are follows max, a young boy who, after being sent to his room for misbehaving, imagines sailing to an island filled with wild creatures. as their king, max tames the beasts and eventually returns home to find his supper waiting for him. this iconic book explores themes of imagination, adventure, and the complex emotions of childhood, all captured through sendak's whimsical illustrations and story."
1,2,The Very Hungry Caterpillar,Eric Carle,1969,4.34,"The Very Hungry Caterpillar tells the story of a caterpillar who eats through a variety of foods before eventually becoming a butterfly. Eric Carle’s use of colorful collage illustrations and rhythmic text has made this book a beloved classic for young readers. The simple, engaging story introduces children to days of the week, counting, and the concept of metamorphosis. It’s a staple in early childhood education.","the very hungry caterpillar tells the story of a caterpillar who eats through a variety of foods before eventually becoming a butterfly. eric carle’s use of colorful collage illustrations and rhythmic text has made this book a beloved classic for young readers. the simple, engaging story introduces children to days of the week, counting, and the concept of metamorphosis. it’s a staple in early childhood education."
2,3,The Giving Tree,Shel Silverstein,1964,4.38,"The Giving Tree is a touching and bittersweet story about a tree that gives everything it has to a boy over the course of his life. As the boy grows up, he takes more from the tree, and the tree continues to give, even when it has little left. Silverstein’s minimalist text and illustrations convey deep themes of unconditional love, selflessness, and the passage of time. It has sparked much discussion about relationships and sacrifice.","the giving tree is a touching and bittersweet story about a tree that gives everything it has to a boy over the course of his life. as the boy grows up, he takes more from the tree, and the tree continues to give, even when it has little left. silverstein’s minimalist text and illustrations convey deep themes of unconditional love, selflessness, and the passage of time. it has sparked much discussion about relationships and sacrifice."
3,4,Green Eggs and Ham,Dr. Seuss,1960,4.31,"In Green Eggs and Ham, Sam-I-Am tries to convince a reluctant character to try a dish of green eggs and ham, despite his resistance. Through repetition and rhyme, Dr. Seuss’s classic story about being open to new experiences encourages children to be adventurous and try things outside their comfort zone. The playful illustrations and humorous dialogue make it a fun and educational read for young readers.","in green eggs and ham, sam-i-am tries to convince a reluctant character to try a dish of green eggs and ham, despite his resistance. through repetition and rhyme, dr. seuss’s classic story about being open to new experiences encourages children to be adventurous and try things outside their comfort zone. the playful illustrations and humorous dialogue make it a fun and educational read for young readers."
4,5,Goodnight Moon,Margaret Wise Brown,1947,4.31,"Goodnight Moon is a gentle, rhythmic bedtime story where a little bunny says goodnight to everything in his room, from the moon to the ""quiet old lady whispering hush."" Its repetitive structure and comforting tone make it ideal for young children. The simple illustrations by Clement Hurd complement the soothing nature of the story, making it a beloved classic for sleep-time reading.","goodnight moon is a gentle, rhythmic bedtime story where a little bunny says goodnight to everything in his room, from the moon to the ""quiet old lady whispering hush."" its repetitive structure and comforting tone make it ideal for young children. the simple illustrations by clement hurd complement the soothing nature of the story, making it a beloved classic for sleep-time reading."


In [100]:
booksDf["DescriptionClean"] = booksDf["DescriptionClean"].str.replace(r"[^\w\s]", " ", regex=True)
booksDf.head()

Unnamed: 0,Ranking,Title,Author,Year,Rating,Description,DescriptionClean
0,1,Where the Wild Things Are,Maurice Sendak,1963,4.25,"Where the Wild Things Are follows Max, a young boy who, after being sent to his room for misbehaving, imagines sailing to an island filled with wild creatures. As their king, Max tames the beasts and eventually returns home to find his supper waiting for him. This iconic book explores themes of imagination, adventure, and the complex emotions of childhood, all captured through Sendak's whimsical illustrations and story.",where the wild things are follows max a young boy who after being sent to his room for misbehaving imagines sailing to an island filled with wild creatures as their king max tames the beasts and eventually returns home to find his supper waiting for him this iconic book explores themes of imagination adventure and the complex emotions of childhood all captured through sendak s whimsical illustrations and story
1,2,The Very Hungry Caterpillar,Eric Carle,1969,4.34,"The Very Hungry Caterpillar tells the story of a caterpillar who eats through a variety of foods before eventually becoming a butterfly. Eric Carle’s use of colorful collage illustrations and rhythmic text has made this book a beloved classic for young readers. The simple, engaging story introduces children to days of the week, counting, and the concept of metamorphosis. It’s a staple in early childhood education.",the very hungry caterpillar tells the story of a caterpillar who eats through a variety of foods before eventually becoming a butterfly eric carle s use of colorful collage illustrations and rhythmic text has made this book a beloved classic for young readers the simple engaging story introduces children to days of the week counting and the concept of metamorphosis it s a staple in early childhood education
2,3,The Giving Tree,Shel Silverstein,1964,4.38,"The Giving Tree is a touching and bittersweet story about a tree that gives everything it has to a boy over the course of his life. As the boy grows up, he takes more from the tree, and the tree continues to give, even when it has little left. Silverstein’s minimalist text and illustrations convey deep themes of unconditional love, selflessness, and the passage of time. It has sparked much discussion about relationships and sacrifice.",the giving tree is a touching and bittersweet story about a tree that gives everything it has to a boy over the course of his life as the boy grows up he takes more from the tree and the tree continues to give even when it has little left silverstein s minimalist text and illustrations convey deep themes of unconditional love selflessness and the passage of time it has sparked much discussion about relationships and sacrifice
3,4,Green Eggs and Ham,Dr. Seuss,1960,4.31,"In Green Eggs and Ham, Sam-I-Am tries to convince a reluctant character to try a dish of green eggs and ham, despite his resistance. Through repetition and rhyme, Dr. Seuss’s classic story about being open to new experiences encourages children to be adventurous and try things outside their comfort zone. The playful illustrations and humorous dialogue make it a fun and educational read for young readers.",in green eggs and ham sam i am tries to convince a reluctant character to try a dish of green eggs and ham despite his resistance through repetition and rhyme dr seuss s classic story about being open to new experiences encourages children to be adventurous and try things outside their comfort zone the playful illustrations and humorous dialogue make it a fun and educational read for young readers
4,5,Goodnight Moon,Margaret Wise Brown,1947,4.31,"Goodnight Moon is a gentle, rhythmic bedtime story where a little bunny says goodnight to everything in his room, from the moon to the ""quiet old lady whispering hush."" Its repetitive structure and comforting tone make it ideal for young children. The simple illustrations by Clement Hurd complement the soothing nature of the story, making it a beloved classic for sleep-time reading.",goodnight moon is a gentle rhythmic bedtime story where a little bunny says goodnight to everything in his room from the moon to the quiet old lady whispering hush its repetitive structure and comforting tone make it ideal for young children the simple illustrations by clement hurd complement the soothing nature of the story making it a beloved classic for sleep time reading


# SpaCy Text Preprocessing

In [101]:
df

Unnamed: 0,sentence,sentenceClean
0,"When life gives you lemons, make lemonade! 🙂",when life gives you lemons make lemonade
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for 1 at maven market
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade
3,"lemon, lemon, lemons, lemon, lemon, lemons",lemon lemon lemons lemon lemon lemons
4,He's running to the market to get a lemon — there's a great sale today.,hes running to the market to get a lemon theres a great sale today
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",an arnold palmer is half lemonade half iced tea
7,iced tea is my favorite,iced tea is my favorite


## Load the SpaCy language model (eng) 

In [102]:
import spacy

`pip install pip-system-certs` to deal with the SSL certificates errors


In [103]:
nlp = spacy.load("en_core_web_sm")

## Get the string and turn it into a SpaCy object

In [104]:
phrase = df["sentenceClean"][0]
phrase

'when life gives you lemons make lemonade '

In [105]:
doc = nlp(phrase)
doc

when life gives you lemons make lemonade 

## Tokenize

In [106]:
[token.text for token in doc]

['when', 'life', 'gives', 'you', 'lemons', 'make', 'lemonade']

## Lemmatize

In [107]:
[token.lemma_ for token in doc]

['when', 'life', 'give', 'you', 'lemon', 'make', 'lemonade']

## Stop Words

In [108]:
list(nlp.Defaults.stop_words)[:10]

["n't",
 '‘s',
 'a',
 'across',
 'beside',
 'give',
 'whole',
 'thence',
 'except',
 'if']

In [109]:
norm = [token.lemma_ for token in doc if not token.is_stop]
norm

['life', 'give', 'lemon', 'lemonade']

In [110]:
" ".join(norm)

'life give lemon lemonade'

## Function wrap

In [111]:
def token_lemma_nostop(text):
    doc = nlp(text)
    output = [token.lemma_ for token in doc if not token.is_stop]
    output = " ".join(output)
    return output

In [112]:
testSeries

0    We're going to start this course with traditional NLP applications.
1                               Then we'll move on to modern NLP theory.
2            Finally, we'll wrap things up with modern NLP applications.
dtype: object

In [113]:
testSeries.apply(token_lemma_nostop)

0    go start course traditional NLP application .
1                              modern NLP theory .
2    finally , wrap thing modern NLP application .
dtype: object

## Assignment

In [114]:
booksDf.head(2)

Unnamed: 0,Ranking,Title,Author,Year,Rating,Description,DescriptionClean
0,1,Where the Wild Things Are,Maurice Sendak,1963,4.25,"Where the Wild Things Are follows Max, a young boy who, after being sent to his room for misbehaving, imagines sailing to an island filled with wild creatures. As their king, Max tames the beasts and eventually returns home to find his supper waiting for him. This iconic book explores themes of imagination, adventure, and the complex emotions of childhood, all captured through Sendak's whimsical illustrations and story.",where the wild things are follows max a young boy who after being sent to his room for misbehaving imagines sailing to an island filled with wild creatures as their king max tames the beasts and eventually returns home to find his supper waiting for him this iconic book explores themes of imagination adventure and the complex emotions of childhood all captured through sendak s whimsical illustrations and story
1,2,The Very Hungry Caterpillar,Eric Carle,1969,4.34,"The Very Hungry Caterpillar tells the story of a caterpillar who eats through a variety of foods before eventually becoming a butterfly. Eric Carle’s use of colorful collage illustrations and rhythmic text has made this book a beloved classic for young readers. The simple, engaging story introduces children to days of the week, counting, and the concept of metamorphosis. It’s a staple in early childhood education.",the very hungry caterpillar tells the story of a caterpillar who eats through a variety of foods before eventually becoming a butterfly eric carle s use of colorful collage illustrations and rhythmic text has made this book a beloved classic for young readers the simple engaging story introduces children to days of the week counting and the concept of metamorphosis it s a staple in early childhood education


In [115]:
booksDf["DescriptionClean"] = booksDf["DescriptionClean"].apply(token_lemma_nostop)
booksDf.head(2)

Unnamed: 0,Ranking,Title,Author,Year,Rating,Description,DescriptionClean
0,1,Where the Wild Things Are,Maurice Sendak,1963,4.25,"Where the Wild Things Are follows Max, a young boy who, after being sent to his room for misbehaving, imagines sailing to an island filled with wild creatures. As their king, Max tames the beasts and eventually returns home to find his supper waiting for him. This iconic book explores themes of imagination, adventure, and the complex emotions of childhood, all captured through Sendak's whimsical illustrations and story.",wild thing follow max young boy send room misbehave imago sailing island fill wild creature king max tame beast eventually return home find supper wait iconic book explore theme imagination adventure complex emotion childhood capture sendak s whimsical illustration story
1,2,The Very Hungry Caterpillar,Eric Carle,1969,4.34,"The Very Hungry Caterpillar tells the story of a caterpillar who eats through a variety of foods before eventually becoming a butterfly. Eric Carle’s use of colorful collage illustrations and rhythmic text has made this book a beloved classic for young readers. The simple, engaging story introduces children to days of the week, counting, and the concept of metamorphosis. It’s a staple in early childhood education.",hungry caterpillar tell story caterpillar eat variety food eventually butterfly eric carle s use colorful collage illustration rhythmic text book beloved classic young reader simple engage story introduce child day week counting concept metamorphosis s staple early childhood education


# POS Tagging

Parts Of Speech

In [116]:
phrase2 = lower_replace(df["sentence"]).apply(token_lemma_nostop)[0]
phrase2

'life give lemon lemonade'

In [117]:
doc2 = nlp(phrase2)
doc2

life give lemon lemonade

In [118]:
[(token.text, token.pos_) for token in doc2]

[('life', 'NOUN'), ('give', 'VERB'), ('lemon', 'NOUN'), ('lemonade', 'PROPN')]

POS Tagging often works better on raw text instead of clean text.<br>
Ther reason is it takes content into account.<br>
Here it thinks "lemonade" is a *proper noun*, which is not the case.<br>

In [119]:
[(token.text, token.pos_) for token in doc2 if token.pos_ in ["NOUN", "PROPN"]]

[('life', 'NOUN'), ('lemon', 'NOUN'), ('lemonade', 'PROPN')]

In [120]:
nouns = [token.text for token in doc2 if token.pos_ in ["NOUN", "PROPN"]]
nouns

['life', 'lemon', 'lemonade']

In [121]:
" ".join(nouns)

'life lemon lemonade'

## Function wrap

In [122]:
def filter_pos(text, posList=["NOUN", "PROPN"]):
    doc = nlp(text)
    output = [token.text for token in doc if token.pos_ in posList]
    output = " ".join(output)
    return output

In [123]:
testSeries

0    We're going to start this course with traditional NLP applications.
1                               Then we'll move on to modern NLP theory.
2            Finally, we'll wrap things up with modern NLP applications.
dtype: object

In [124]:
testSeries.apply(filter_pos)

0    course NLP applications
1                 NLP theory
2    things NLP applications
dtype: object

# Pandas and SpaCy together

In [125]:
df["sentence"]

0                               When life gives you lemons, make lemonade! 🙂
1                                She bought 2 lemons for $1 at Maven Market.
2                A dozen lemons will make a gallon of lemonade. [AllRecipes]
3                                 lemon, lemon, lemons, lemon, lemon, lemons
4    He's running to the market to get a lemon — there's a great sale today.
5                     Does Maven Market carry Eureka lemons or Meyer lemons?
6              An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]
7                                                    iced tea is my favorite
Name: sentence, dtype: object

In [126]:
lower_replace(df["sentence"]).apply(token_lemma_nostop)

0                       life give lemon lemonade
1                     buy 2 lemon 1 maven market
2                    dozen lemon gallon lemonade
3            lemon lemon lemon lemon lemon lemon
4        s run market lemon   s great sale today
5    maven market carry eureka lemon meyer lemon
6       arnold palmer half lemonade half ice tea
7                               ice tea favorite
Name: sentence, dtype: object

In [127]:
lower_replace(df["sentence"]).apply(token_lemma_nostop).apply(filter_pos)

0                    life lemon lemonade
1                     lemon maven market
2            dozen lemon gallon lemonade
3    lemon lemon lemon lemon lemon lemon
4              s market lemon sale today
5        maven market eureka lemon lemon
6           palmer lemonade half ice tea
7                       ice tea favorite
Name: sentence, dtype: object

The order in which these pre-processing techniques are applied does matter.<br>
Let's say we first filtered on nouns and then we did the tokenization.

In [128]:
lower_replace(df["sentence"]).apply(filter_pos).apply(token_lemma_nostop)

0              life lemon lemonade
1               lemon maven market
2      dozen lemon gallon lemonade
3    lemon lemon lemon lemon lemon
4          market lemon sale today
5        market eureka lemon lemon
6              palmer lemonade tea
7                              tea
Name: sentence, dtype: object

The reason for the difference is linguistic analysis techniques within Spacy take word order into
account.<br>
So when it does those parts of speech labels, it looks at the words around it to determine what part of speech it is.<br>
So down here, because we did the POS filtering first, *iced tea* was the term and only *tea* here was labeled as a noun.<br>
But up there it wasn't the case. It saw both *ice* and *tea* as nouns.

Now you might be wondering which one is the correct approach. And the answer is that it depends.<br>
This is something you would want to test out on your specific text dataset, and see which order makes the most sense for your text, so that you're keeping the words you want to keep and removing the words that you don't.

# NLP Pipeline

Use the defined functions as *helper functions* for the `nlp_pipeline` function

In [129]:
def nlp_pipeline(series):
    output = lower_replace(series)
    output = output.apply(token_lemma_nostop)
    output = output.apply(filter_pos)
    return output

In [130]:
nlp_pipeline(testSeries)

0          nlp application
1               nlp theory
2    thing nlp application
dtype: object

In [131]:
textClean = nlp_pipeline(df["sentence"])
textClean

0                    life lemon lemonade
1                     lemon maven market
2            dozen lemon gallon lemonade
3    lemon lemon lemon lemon lemon lemon
4              s market lemon sale today
5        maven market eureka lemon lemon
6           palmer lemonade half ice tea
7                       ice tea favorite
Name: sentence, dtype: object

In [132]:
pd.to_pickle(textClean, "textClean.pkl")

Why not just save this output as a flat file, like a CSV file?<br>
First, a **pickle file is really fast and efficient to read and write**.<br>
And then also **it works really well with Python objects**.<br>
So when we create a pickle file from a DataFrame, it saves all the information about that Dataframe.<br>
On the other hand, pickle files are not readable by humans.<br>
So if you want to see what your output actually looks like, you may want to export to a CSV file instead.<br>

In [133]:
pd.to_pickle(booksDf, "booksClean.pkl")