In [44]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire as a
import prepare as p

# Prepare Exercises

1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

    Lowercase everything
    Normalize unicode characters
    Remove anything that is not a letter, number, whitespace, or a single quote (i.e., replace it with an empty string).

2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

    This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.


In [2]:
# Get some text to play with: load the news articles through the acquire module (imported as `a`).
# presumably this returns a list of dicts that each have a 'content' key — TODO confirm in acquire.py
news_articles = a.get_news_articles()

cached file found and read


In [3]:
# First sample string: the 'content' field of the first news article.
test_text1 = news_articles[0]['content']
test_text1

'Benchmark indices Sensex and Nifty ended at record closing highs on Wednesday. Sensex ended 195 points higher at 63,523 while the Nifty ended at 18,856.85, up 40 points. The gains were led by stocks like HDFC, Reliance Industries and TCS. During the intraday trade, Sensex rose to its fresh record high level of 63,588. '

In [4]:
# Second sample string, written by hand: it contains a hyphenated word, punctuation,
# and singular/plural pairs (tomatoes / tomato) to exercise the cleaning steps.
test_text2 = 'I had a bologna sandwich for lunch. It was super-delicious! I do wish it had cheesey cheese and maybe some mustard, mayo, relish, tomatoes, tomato, salt, pepper, butter, with a side of Hostess cakes.'

In [5]:
# Step 1: make all text lowercase, so the a-z character filter applied later keeps every letter.
test = test_text1.lower()
test

'benchmark indices sensex and nifty ended at record closing highs on wednesday. sensex ended 195 points higher at 63,523 while the nifty ended at 18,856.85, up 40 points. the gains were led by stocks like hdfc, reliance industries and tcs. during the intraday trade, sensex rose to its fresh record high level of 63,588. '

In [6]:
# Step 2: ensure text is "normalized" (remove inconsistencies in unicode character encoding)
# AND make all characters ASCII characters, ignoring any errors in conversion (i.e. drop those chars).
# The bytes produced by encode() are decoded back so that `test` is a str again.
test = unicodedata.normalize('NFKD', test).encode('ascii', 'ignore').decode('utf-8', 'ignore')
test

'benchmark indices sensex and nifty ended at record closing highs on wednesday. sensex ended 195 points higher at 63,523 while the nifty ended at 18,856.85, up 40 points. the gains were led by stocks like hdfc, reliance industries and tcs. during the intraday trade, sensex rose to its fresh record high level of 63,588. '

In [7]:
# Step 3: drop anything that is not a letter, number, whitespace, or a single quote.
# Side effect visible in the output: separators inside numbers are removed too,
# so '63,523' becomes '63523' and '18,856.85' becomes '1885685'.
test = re.sub(r"[^a-z0-9\s']", '', test)
test

'benchmark indices sensex and nifty ended at record closing highs on wednesday sensex ended 195 points higher at 63523 while the nifty ended at 1885685 up 40 points the gains were led by stocks like hdfc reliance industries and tcs during the intraday trade sensex rose to its fresh record high level of 63588 '

In [8]:
def basic_clean(article):
    """
    Apply basic text cleaning to a string and return the cleaned string.

    Args:
        article (str): The text to be cleaned.

    Returns:
        str: The cleaned text.

    Cleaning steps, in order:
    1. Lowercase the text.
    2. Normalize unicode (NFKD) and keep only characters that survive conversion
       to ASCII; anything that cannot be converted is dropped.
    3. Replace a hyphen that sits directly between two word characters with a
       space, so 'super-delicious' becomes 'super delicious'.
    4. Remove every character that is not a lowercase letter, digit, whitespace,
       or single quote.

    Example:
        article = "Hello, World! This is an example article."
        cleaned_article = basic_clean(article)
        print(cleaned_article)
        # Output: "hello world this is an example article"
    """
    lowered = article.lower()

    # decompose accented characters, then throw away whatever is not plain ASCII
    decomposed = unicodedata.normalize('NFKD', lowered)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')

    # split hyphenated words (look-behind / look-ahead require a word character on both sides)
    dehyphenated = re.sub(r'(?<=\w)-(?=\w)', ' ', ascii_text)

    # finally strip everything outside the allowed character set
    return re.sub(r"[^a-z0-9\s']", '', dehyphenated)

In [9]:
# Found this on ChatGPT. The pattern uses a positive look-behind (?<=\w) and a positive
# look-ahead (?=\w), so only a hyphen with a word character on both sides is replaced by a space.
test = 'super-delicious'
re.sub(r"(?<=\w)-(?=\w)", ' ', test)

'super delicious'

In [10]:
# Show both raw sample strings for comparison with the cleaned versions.
(test_text1, test_text2)

('Benchmark indices Sensex and Nifty ended at record closing highs on Wednesday. Sensex ended 195 points higher at 63,523 while the Nifty ended at 18,856.85, up 40 points. The gains were led by stocks like HDFC, Reliance Industries and TCS. During the intraday trade, Sensex rose to its fresh record high level of 63,588. ',
 'I had a bologna sandwich for lunch. It was super-delicious! I do wish it had cheesey cheese and maybe some mustard, mayo, relish, tomatoes, tomato, salt, pepper, butter, with a side of Hostess cakes.')

In [11]:
# Run both samples through basic_clean and display the results.
basic1 = basic_clean(test_text1)
basic2 = basic_clean(test_text2)
(basic1, basic2)

('benchmark indices sensex and nifty ended at record closing highs on wednesday sensex ended 195 points higher at 63523 while the nifty ended at 1885685 up 40 points the gains were led by stocks like hdfc reliance industries and tcs during the intraday trade sensex rose to its fresh record high level of 63588 ',
 'i had a bologna sandwich for lunch it was super delicious i do wish it had cheesey cheese and maybe some mustard mayo relish tomatoes tomato salt pepper butter with a side of hostess cakes')

In [12]:
# Try the ToktokTokenizer directly on the cleaned samples.
# With return_str=True the output shown below is a single string per sample.
tokenizer = nltk.tokenize.ToktokTokenizer()
token1 = tokenizer.tokenize(basic1, return_str=True)
token2 = tokenizer.tokenize(basic2, return_str=True)
(token1, token2)

('benchmark indices sensex and nifty ended at record closing highs on wednesday sensex ended 195 points higher at 63523 while the nifty ended at 1885685 up 40 points the gains were led by stocks like hdfc reliance industries and tcs during the intraday trade sensex rose to its fresh record high level of 63588',
 'i had a bologna sandwich for lunch it was super delicious i do wish it had cheesey cheese and maybe some mustard mayo relish tomatoes tomato salt pepper butter with a side of hostess cakes')

In [13]:
def tokenize(article):
    """
    Tokenize a string with nltk's ToktokTokenizer.

    This function will
    - accept a string, article, which has been processed with basic_clean
    - break the words (and any punctuation left over) into discrete units
    - return the tokenized text as a single string (return_str=True)
    """
    # build the tokenizer and hand back its string output in one step
    return nltk.tokenize.ToktokTokenizer().tokenize(article, return_str=True)

In [14]:
# Same tokenization as the scratch cell, now through the tokenize() function.
token1 = tokenize(basic1)
token2 = tokenize(basic2)
(token1, token2)

('benchmark indices sensex and nifty ended at record closing highs on wednesday sensex ended 195 points higher at 63523 while the nifty ended at 1885685 up 40 points the gains were led by stocks like hdfc reliance industries and tcs during the intraday trade sensex rose to its fresh record high level of 63588',
 'i had a bologna sandwich for lunch it was super delicious i do wish it had cheesey cheese and maybe some mustard mayo relish tomatoes tomato salt pepper butter with a side of hostess cakes')

In [15]:
def stem(article):
    """
    Apply Porter stemming to every word of a string.

    This function will
    - accept a string, article, that has been processed with basic_clean and tokenize
    - split it on single spaces and stem each piece with nltk's PorterStemmer
    - return the stems joined back together with single spaces
    """
    porter = nltk.porter.PorterStemmer()
    # stem word by word, then stitch the result back into one string
    return ' '.join(map(porter.stem, article.split(' ')))

In [16]:
# Scratch version of the stemming step: create the nltk stemmer object and
# build a list of stems for each tokenized sample.
# NOTE: stems1 / stems2 are lists here; the following cell reassigns them to the strings returned by stem().
ps = nltk.porter.PorterStemmer()
stems1 = [ps.stem(word) for word in token1.split(' ')]
stems2 = [ps.stem(word) for word in token2.split(' ')]

In [17]:
# Stem both tokenized samples with the stem() function and display the results.
stems1 = stem(token1)
stems2 = stem(token2)
(stems1, stems2)

('benchmark indic sensex and nifti end at record close high on wednesday sensex end 195 point higher at 63523 while the nifti end at 1885685 up 40 point the gain were led by stock like hdfc relianc industri and tc dure the intraday trade sensex rose to it fresh record high level of 63588',
 'i had a bologna sandwich for lunch it wa super delici i do wish it had cheesey chees and mayb some mustard mayo relish tomato tomato salt pepper butter with a side of hostess cake')

In [18]:
def lemmatize(article):
    """
    Apply WordNet lemmatization to every word of a string.

    This function will
    - accept a string, article, that has been processed with basic_clean and tokenize
    - split it on single spaces and lemmatize each piece with nltk.stem.WordNetLemmatizer
    - return the lemmas joined back together with single spaces
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = []
    # collect the lemma of each word in order
    for token in article.split(' '):
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

In [19]:
# Lemmatize both tokenized samples and display the results.
lemmas1 = lemmatize(token1)
lemmas2 = lemmatize(token2)
(lemmas1, lemmas2)

('benchmark index sensex and nifty ended at record closing high on wednesday sensex ended 195 point higher at 63523 while the nifty ended at 1885685 up 40 point the gain were led by stock like hdfc reliance industry and tc during the intraday trade sensex rose to it fresh record high level of 63588',
 'i had a bologna sandwich for lunch it wa super delicious i do wish it had cheesey cheese and maybe some mustard mayo relish tomato tomato salt pepper butter with a side of hostess cake')

In [56]:
def remove_stopwords(article, extra_words=None, exclude_words=None):
    """
    Remove English stopwords from a string.

    This function will
    - accept a string, article
    - accept two optional collections of words, extra_words and exclude_words
      (a single word given as a plain string is treated as one word, not as
      a sequence of characters)
    - remove the stopwords from the string (ex: 'a', 'an', 'the', etc.)
        - any words in extra_words are removed as well
        - words in exclude_words are never removed, even if they are in the
          default stopwords list or in extra_words
    - return the remaining words joined with single spaces

    Args:
        article (str): text whose words are separated by single spaces.
        extra_words (iterable of str, str, or None): additional words to remove.
        exclude_words (iterable of str, str, or None): words to keep.

    Returns:
        str: the article without stopwords.
    """
    def _as_word_set(words):
        # None -> no words; a bare string -> that one word; anything else -> its items
        if words is None:
            return set()
        if isinstance(words, str):
            return {words}
        return set(words)

    # start from nltk's English list, add the extras, then subtract the exclusions;
    # a set keeps the per-word membership test cheap
    stopword_set = set(stopwords.words('english'))
    stopword_set |= _as_word_set(extra_words)
    stopword_set -= _as_word_set(exclude_words)

    # only keep words that are not stopwords
    kept_words = [word for word in article.split(' ') if word not in stopword_set]
    return ' '.join(kept_words)

In [21]:
# Remove stopwords from both lemmatized samples and display the results.
# NOTE(review): the "Removed N stopwords." lines in the saved output come from a print
# that is commented out in the current remove_stopwords definition — re-run to refresh.
final1 = remove_stopwords(lemmas1)
final2 = remove_stopwords(lemmas2)
(final1, final2)

Removed 17 stopwords.
Removed 14 stopwords.


('benchmark index sensex nifty ended record closing high wednesday sensex ended 195 point higher 63523 nifty ended 1885685 40 point gain led stock like hdfc reliance industry tc intraday trade sensex rose fresh record high level 63588',
 'bologna sandwich lunch wa super delicious wish cheesey cheese maybe mustard mayo relish tomato tomato salt pepper butter side hostess cake')

In [22]:
# Scratch work: the cells below are the step-by-step investigation that was used to build remove_stopwords

In [23]:
# Get nltk's standard English stopwords and peek at the first ten.
stopword_list = stopwords.words('english')
stopword_list[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [24]:
# How many stopwords are in the default list?
len(stopword_list)

179

In [25]:
# Define some extra words to treat as stopwords (extra_words)
# and some default stopwords to leave in the text (exclude_words).
extra_words=['tomato', 'salt']
exclude_words=['a', 'it']

In [26]:
# One way to "subtract" one list of words from another: convert both to sets and
# take the difference (the list built from the result has no guaranteed order).
stopword_list = list (set(stopword_list) - set(exclude_words))

# Another way is list.remove(), which removes words one at a time. ALSO,
#  it raises an error if the word isn't already in the list you want to remove from.
#stopword_list.remove(exclude_words[0])

In [27]:
# Sort for a stable view; 'a' no longer appears at the front because it was excluded above.
sorted(stopword_list)[:10]

['about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and']

In [28]:
# To add in words, either append them one at a time or do the set thing again;
# sets are combined with | (union) instead of the + used for lists.
stopword_list = list (set(stopword_list) | set(extra_words) )
stopword_list[:10]

['hadn', 'in', 'you', 'will', 'our', 'the', 'had', 'who', 'now', 'can']

In [29]:
# Length is unchanged from the default list: two words were excluded ('a', 'it') and two were added ('tomato', 'salt').
len(stopword_list)

179

In [30]:
# Check that subtracting an empty list (the default for exclude_words) leaves the stopwords unchanged.
stopword_list = list( set(stopword_list) - set([]) )

In [31]:
# Confirm the length did not change after subtracting the empty list.
len(stopword_list)

179

# Exercises
## 6 acquire a dataframe of the news articles named news_df
## 7 acquire a dataframe of the codeup blogs called codeup_df
## 8 for each df, produce the following columns
    * title
    * original
    * clean
    * stemmed
    * lemmatized

In [32]:
# Get the news_df: load the articles via the acquire module and put them in a DataFrame.
news_list = a.get_news_articles()
news_df = pd.DataFrame(news_list)
# Rename 'content' to 'original' — the column name the get_*_column helpers read from.
news_df = news_df.rename(columns = {'content':'original'})
news_df.head()

cached file found and read


Unnamed: 0,category,title,original
0,business,"Sensex, Nifty end at fresh closing highs",Benchmark indices Sensex and Nifty ended at re...
1,business,TIME releases list of the world's 100 most inf...,TIME magazine has released its annual list of ...
2,business,Which are the world's top 10 airlines accordin...,Singapore Airlines is the world's best airline...
3,business,"Loves India, is a fan of PM: Paytm Founder on ...",Paytm Founder Vijay Shekhar Sharma shared a vi...
4,business,UK's net debt passes 100% of GDP for the first...,The United Kingdom's public sector net debt in...


In [53]:
# Get the codeup_df: load the Codeup blog posts via the acquire module and put them in a DataFrame.
codeup_list = a.get_all_codeup_blogs()
codeup_df = pd.DataFrame(codeup_list)
# Rename 'content' to 'original' — the column name the get_*_column helpers read from.
codeup_df = codeup_df.rename(columns = {'content':'original'})
codeup_df.head()

cached file found and read


Unnamed: 0,title,original
0,Spotlight on APIDA Voices: Celebrating Heritag...,Spotlight on APIDA Voices: Celebrating Heritag...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...
2,Women in tech: Panelist Spotlight - Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...
3,Women in Tech: Panelist Spotlight - Sarah Mell...,Women in Tech: Panelist Spotlight – Sarah Mell...
4,Women in Tech: Panelist Spotlight - Madeleine ...,Women in Tech: Panelist Spotlight – Madeleine ...


In [52]:
def get_clean_column(df):
    """
    Add a 'clean' column of fully prepared text to a DataFrame.

    This function will
    - accept a df with at least one column (named 'original') of text to process
    - process the text in each row of that column with basic_clean, then
      tokenize, then remove_stopwords (default stopword settings)
    - store the result in a column named 'clean' (added or overwritten)
    - return the same df object; the frame is modified in place, not copied

    Args:
        df (pandas.DataFrame): frame with an 'original' text column.

    Returns:
        pandas.DataFrame: the input frame, now with a 'clean' column.
    """
    # Series.apply keeps the frame's own index, so every cleaned string lands on
    # the row it came from even when the index is not the default 0..n-1
    df['clean'] = (
        df['original']
        .apply(basic_clean)
        .apply(tokenize)
        .apply(remove_stopwords)
    )

    return df

In [None]:
# Equivalent one-liner: df['clean'] = df.original.apply(basic_clean).apply(tokenize).apply(remove_stopwords)

In [57]:
# Build the 'clean' column for the Codeup blog posts and display the frame.
codeup_df = get_clean_column(codeup_df)
codeup_df

Unnamed: 0,title,original,clean
0,Spotlight on APIDA Voices: Celebrating Heritag...,Spotlight on APIDA Voices: Celebrating Heritag...,spotlight apida voices celebrating heritage in...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...,women tech panelist spotlight magdalena rahn m...
2,Women in tech: Panelist Spotlight - Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...,women tech panelist spotlight rachel robbins m...
3,Women in Tech: Panelist Spotlight - Sarah Mell...,Women in Tech: Panelist Spotlight – Sarah Mell...,women tech panelist spotlight sarah mellor mar...
4,Women in Tech: Panelist Spotlight - Madeleine ...,Women in Tech: Panelist Spotlight – Madeleine ...,women tech panelist spotlight madeleine capper...
...,...,...,...
265,Why Isn't the San Antonio Tech Scene Growing F...,Why Isn’t the San Antonio Tech Scene Growing F...,isnt san antonio tech scene growing faster aug...
266,Why People Can't Learn Programming on Their Ow...,Why People Can’t Learn Programming on Their Ow...,people cant learn programming aug 14 2018 code...
267,What is Our Noble Cause? - Codeup,"What is Our Noble Cause? Aug 14, 2018 | Codeup...",noble cause aug 14 2018 codeup news tedx san a...
268,Scholarships for Women: Why We're Doing It - C...,Scholarships for Women: Why We’re Doing It Aug...,scholarships women aug 14 2018 codeup news hot...


In [36]:
def get_stemmed_column(df):
    """
    Add a 'stemmed' column to a DataFrame.

    This function will
    - accept a df with at least one column (named 'clean') of text to process
    - process the text in each row of that column with the stem function
    - store the result in a column named 'stemmed' (added or overwritten)
    - return the same df object; the frame is modified in place, not copied

    Args:
        df (pandas.DataFrame): frame with a 'clean' text column.

    Returns:
        pandas.DataFrame: the input frame, now with a 'stemmed' column.
    """
    # apply() preserves the frame's index, so rows stay aligned for any index
    df['stemmed'] = df['clean'].apply(stem)

    return df

In [47]:
# Add the 'stemmed' column to the Codeup blog frame and display it.
# NOTE(review): this calls get_stemmed_column from the prepare module (imported as `p`),
# not the copy defined in this notebook — confirm the two are in sync.
codeup_df = p.get_stemmed_column(codeup_df)
codeup_df

Unnamed: 0,title,original,clean,stemmed
0,Spotlight on APIDA Voices: Celebrating Heritag...,Spotlight on APIDA Voices: Celebrating Heritag...,spotlight on apida voices celebrating heritage...,spotlight on apida voic celebr heritag and ins...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...,women in tech panelist spotlight magdalena ra...,women in tech panelist spotlight magdalena ra...
2,Women in tech: Panelist Spotlight - Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...,women in tech panelist spotlight rachel robbi...,women in tech panelist spotlight rachel robbi...
3,Women in Tech: Panelist Spotlight - Sarah Mell...,Women in Tech: Panelist Spotlight – Sarah Mell...,women in tech panelist spotlight sarah mellor...,women in tech panelist spotlight sarah mellor...
4,Women in Tech: Panelist Spotlight - Madeleine ...,Women in Tech: Panelist Spotlight – Madeleine ...,women in tech panelist spotlight madeleine ca...,women in tech panelist spotlight madelein cap...
...,...,...,...,...
265,Why Isn't the San Antonio Tech Scene Growing F...,Why Isn’t the San Antonio Tech Scene Growing F...,why isnt the san antonio tech scene growing fa...,whi isnt the san antonio tech scene grow faste...
266,Why People Can't Learn Programming on Their Ow...,Why People Can’t Learn Programming on Their Ow...,why people cant learn programming on their own...,whi peopl cant learn program on their own aug ...
267,What is Our Noble Cause? - Codeup,"What is Our Noble Cause? Aug 14, 2018 | Codeup...",what is our noble cause aug 14 2018 codeup ne...,what is our nobl caus aug 14 2018 codeup news...
268,Scholarships for Women: Why We're Doing It - C...,Scholarships for Women: Why We’re Doing It Aug...,scholarships for women why were doing it aug 1...,scholarship for women whi were do it aug 14 20...


In [38]:
def get_lemmatized_column(df):
    """
    Add a 'lemmatized' column to a DataFrame.

    This function will
    - accept a df with at least one column (named 'clean') of text to process
    - process the text in each row of that column with the lemmatize function
    - store the result in a column named 'lemmatized' (added or overwritten)
    - return the same df object; the frame is modified in place, not copied

    Args:
        df (pandas.DataFrame): frame with a 'clean' text column.

    Returns:
        pandas.DataFrame: the input frame, now with a 'lemmatized' column.
    """
    # apply() preserves the frame's index, so rows stay aligned for any index
    df['lemmatized'] = df['clean'].apply(lemmatize)

    return df

In [48]:
# Add the 'lemmatized' column to the Codeup blog frame and display it.
# NOTE(review): this calls get_lemmatized_column from the prepare module (imported as `p`),
# not the copy defined in this notebook — confirm the two are in sync.
codeup_df = p.get_lemmatized_column(codeup_df)
codeup_df

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Spotlight on APIDA Voices: Celebrating Heritag...,Spotlight on APIDA Voices: Celebrating Heritag...,spotlight on apida voices celebrating heritage...,spotlight on apida voic celebr heritag and ins...,spotlight on apida voice celebrating heritage ...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...,women in tech panelist spotlight magdalena ra...,women in tech panelist spotlight magdalena ra...,woman in tech panelist spotlight magdalena ra...
2,Women in tech: Panelist Spotlight - Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...,women in tech panelist spotlight rachel robbi...,women in tech panelist spotlight rachel robbi...,woman in tech panelist spotlight rachel robbi...
3,Women in Tech: Panelist Spotlight - Sarah Mell...,Women in Tech: Panelist Spotlight – Sarah Mell...,women in tech panelist spotlight sarah mellor...,women in tech panelist spotlight sarah mellor...,woman in tech panelist spotlight sarah mellor...
4,Women in Tech: Panelist Spotlight - Madeleine ...,Women in Tech: Panelist Spotlight – Madeleine ...,women in tech panelist spotlight madeleine ca...,women in tech panelist spotlight madelein cap...,woman in tech panelist spotlight madeleine ca...
...,...,...,...,...,...
265,Why Isn't the San Antonio Tech Scene Growing F...,Why Isn’t the San Antonio Tech Scene Growing F...,why isnt the san antonio tech scene growing fa...,whi isnt the san antonio tech scene grow faste...,why isnt the san antonio tech scene growing fa...
266,Why People Can't Learn Programming on Their Ow...,Why People Can’t Learn Programming on Their Ow...,why people cant learn programming on their own...,whi peopl cant learn program on their own aug ...,why people cant learn programming on their own...
267,What is Our Noble Cause? - Codeup,"What is Our Noble Cause? Aug 14, 2018 | Codeup...",what is our noble cause aug 14 2018 codeup ne...,what is our nobl caus aug 14 2018 codeup news...,what is our noble cause aug 14 2018 codeup ne...
268,Scholarships for Women: Why We're Doing It - C...,Scholarships for Women: Why We’re Doing It Aug...,scholarships for women why were doing it aug 1...,scholarship for women whi were do it aug 14 20...,scholarship for woman why were doing it aug 14...


In [40]:
# Add the 'clean' column to the news frame and preview the first rows.
# NOTE(review): the saved preview still contains stopwords ("and", "at", ...), so it looks like it
# was produced before get_clean_column also removed stopwords — re-run to refresh.
news_df = get_clean_column(news_df)
news_df.head()

Unnamed: 0,category,title,original,clean
0,business,"Sensex, Nifty end at fresh closing highs",Benchmark indices Sensex and Nifty ended at re...,benchmark indices sensex and nifty ended at re...
1,business,TIME releases list of the world's 100 most inf...,TIME magazine has released its annual list of ...,time magazine has released its annual list of ...
2,business,Which are the world's top 10 airlines accordin...,Singapore Airlines is the world's best airline...,singapore airlines is the world's best airline...
3,business,"Loves India, is a fan of PM: Paytm Founder on ...",Paytm Founder Vijay Shekhar Sharma shared a vi...,paytm founder vijay shekhar sharma shared a vi...
4,business,UK's net debt passes 100% of GDP for the first...,The United Kingdom's public sector net debt in...,the united kingdom's public sector net debt in...


In [41]:
# Add the 'stemmed' column to the news frame (built from 'clean') and preview it.
news_df = get_stemmed_column(news_df)
news_df.head()

Unnamed: 0,category,title,original,clean,stemmed
0,business,"Sensex, Nifty end at fresh closing highs",Benchmark indices Sensex and Nifty ended at re...,benchmark indices sensex and nifty ended at re...,benchmark indic sensex and nifti end at record...
1,business,TIME releases list of the world's 100 most inf...,TIME magazine has released its annual list of ...,time magazine has released its annual list of ...,time magazin ha releas it annual list of the w...
2,business,Which are the world's top 10 airlines accordin...,Singapore Airlines is the world's best airline...,singapore airlines is the world's best airline...,singapor airlin is the world' best airlin acco...
3,business,"Loves India, is a fan of PM: Paytm Founder on ...",Paytm Founder Vijay Shekhar Sharma shared a vi...,paytm founder vijay shekhar sharma shared a vi...,paytm founder vijay shekhar sharma share a vid...
4,business,UK's net debt passes 100% of GDP for the first...,The United Kingdom's public sector net debt in...,the united kingdom's public sector net debt in...,the unit kingdom' public sector net debt in ma...


In [42]:
# Add the 'lemmatized' column to the news frame (built from 'clean') and preview it.
news_df = get_lemmatized_column(news_df)
news_df.head()

Unnamed: 0,category,title,original,clean,stemmed,lemmatized
0,business,"Sensex, Nifty end at fresh closing highs",Benchmark indices Sensex and Nifty ended at re...,benchmark indices sensex and nifty ended at re...,benchmark indic sensex and nifti end at record...,benchmark index sensex and nifty ended at reco...
1,business,TIME releases list of the world's 100 most inf...,TIME magazine has released its annual list of ...,time magazine has released its annual list of ...,time magazin ha releas it annual list of the w...,time magazine ha released it annual list of th...
2,business,Which are the world's top 10 airlines accordin...,Singapore Airlines is the world's best airline...,singapore airlines is the world's best airline...,singapor airlin is the world' best airlin acco...,singapore airline is the world's best airline ...
3,business,"Loves India, is a fan of PM: Paytm Founder on ...",Paytm Founder Vijay Shekhar Sharma shared a vi...,paytm founder vijay shekhar sharma shared a vi...,paytm founder vijay shekhar sharma share a vid...,paytm founder vijay shekhar sharma shared a vi...
4,business,UK's net debt passes 100% of GDP for the first...,The United Kingdom's public sector net debt in...,the united kingdom's public sector net debt in...,the unit kingdom' public sector net debt in ma...,the united kingdom's public sector net debt in...


# NEED TO MAKE A FUNCTION THAT DOES ALL OF THE ABOVE, clean_df

# Exercise 
## 9. Ask yourself:

    * If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
        * Probably doesn't matter
    * If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
        * Still probably doesn't matter, but I know stemmed will run faster
    * If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?
        * Stemmed: in the length comparison below, the lemmatized text was longer than the stemmed text for every article, so stemmed text means fewer megabytes to pay for.


In [43]:
# Compare the character length of the stemmed and lemmatized text, article by article.
# Pairing the two columns with zip walks the rows in order, so the loop does not rely on
# news_df having a default 0..n-1 index the way label lookups with [i] do.
for stemmed_text, lemmatized_text in zip(news_df['stemmed'], news_df['lemmatized']):
    print(f'stem_len: {len(stemmed_text)}, lemma_len: {len(lemmatized_text)}')

stem_len: 287, lemma_len: 298
stem_len: 339, lemma_len: 369
stem_len: 319, lemma_len: 343
stem_len: 311, lemma_len: 329
stem_len: 329, lemma_len: 359
stem_len: 333, lemma_len: 372
stem_len: 287, lemma_len: 305
stem_len: 310, lemma_len: 336
stem_len: 321, lemma_len: 350
stem_len: 309, lemma_len: 341
stem_len: 309, lemma_len: 337
stem_len: 332, lemma_len: 380
stem_len: 338, lemma_len: 364
stem_len: 345, lemma_len: 367
stem_len: 313, lemma_len: 338
stem_len: 320, lemma_len: 339
stem_len: 339, lemma_len: 361
stem_len: 297, lemma_len: 316
stem_len: 318, lemma_len: 362
stem_len: 339, lemma_len: 372
stem_len: 284, lemma_len: 308
stem_len: 313, lemma_len: 358
stem_len: 324, lemma_len: 361
stem_len: 319, lemma_len: 359
stem_len: 308, lemma_len: 350
stem_len: 325, lemma_len: 352
stem_len: 256, lemma_len: 290
stem_len: 320, lemma_len: 337
stem_len: 353, lemma_len: 368
stem_len: 326, lemma_len: 363
stem_len: 322, lemma_len: 345
stem_len: 330, lemma_len: 342
stem_len: 299, lemma_len: 309
stem_len: 