In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/craigcalzado/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# define categories
categories = ["business", "sports", "technology", "entertainment"]

# use get_all_new_article function from acquire.py file 

news_df = acquire.get_all_news_articles(categories)

In [4]:
# look at the head of dataframe
news_df.head()

Unnamed: 0,title,content,category
0,Rupee closes at all-time low of 77.50 against ...,The Indian rupee weakened further on Monday to...,business
1,When are you coming to deliver 1st Tesla? Payt...,Paytm CEO Vijay Shekhar Sharma took to Twitter...,business
2,Layout of 'world's first Bitcoin City' in El S...,El Salvador's President Nayib Bukele has share...,business
3,"Bitcoin briefly drops below $30,000 for first ...","Bitcoin, in the early hours of Tuesday, fell b...",business
4,Musk's $44 bn Twitter deal at risk of being re...,Elon Musk's $44 billion offer to buy Twitter c...,business


In [5]:
# lets use the content of first news item as 'article' to test my functions

article = news_df.content[0]
article

'The Indian rupee weakened further on Monday to close at a new all-time low of 77.50 against the US dollar, 60 paise over its previous close. During the trading session, the rupee touched its lifetime low of 77.52. The currency was weighed down by elevated crude oil prices and a widening trade deficit.'

1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote

In [6]:
def basic_clean(string):
    '''
    This function takes in a string and
    returns the string normalized.
    '''
    string = unicodedata.normalize('NFKD', string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
    string = re.sub(r'[^\w\s]', '', string).lower()
    return string

In [7]:
# use the function to clean the article
article_clean = basic_clean(article)

In [8]:
# look at the cleaned article
article_clean

'the indian rupee weakened further on monday to close at a new alltime low of 7750 against the us dollar 60 paise over its previous close during the trading session the rupee touched its lifetime low of 7752 the currency was weighed down by elevated crude oil prices and a widening trade deficit'

2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [9]:
def tokenize(string):
    '''
    This function takes in a string and returns a tokenized string.
    
    '''
    # create the tokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()
    # Use the tokenizer
    string = tokenizer.tokenize(string, return_str=True)
    
    return string

In [10]:
# Use the function defined above

tokenize(article)

'The Indian rupee weakened further on Monday to close at a new all-time low of 77.50 against the US dollar , 60 paise over its previous close. During the trading session , the rupee touched its lifetime low of 77.52. The currency was weighed down by elevated crude oil prices and a widening trade deficit .'

3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [11]:
def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.
    '''
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    
    # Use the stemmer to stem each word in the list of words we created by using split.
    stems = [ps.stem(word) for word in string.split()]
    
    # Join our lists of words into a string again and assign to a variable.
    string = ' '.join(stems)
    
    return string

In [12]:
# Use the function defined above
stem(article)

'the indian rupe weaken further on monday to close at a new all-tim low of 77.50 against the us dollar, 60 pais over it previou close. dure the trade session, the rupe touch it lifetim low of 77.52. the currenc wa weigh down by elev crude oil price and a widen trade deficit.'

4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [13]:
import nltk

In [14]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/craigcalzado/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
def lemmatize(string):
    '''
    This function takes in string for and
    returns a string with words lemmatized.
    '''
    # Create the lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    
    # Use the lemmatizer on each word in the list of words we created by using split.
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    
    # Join our list of words into a string again and assign to a variable.
    string = ' '.join(lemmas)
    
    return string

In [16]:
# Use the function defined above
lemmatize(article)

'The Indian rupee weakened further on Monday to close at a new all-time low of 77.50 against the US dollar, 60 paisa over it previous close. During the trading session, the rupee touched it lifetime low of 77.52. The currency wa weighed down by elevated crude oil price and a widening trade deficit.'

5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words.

These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [17]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # Create stopword_list.
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep these in my text.
    stopword_list = set(stopword_list) - set(exclude_words)
    
    # Add in 'extra_words' to stopword_list.
    stopword_list = stopword_list.union(set(extra_words))
    
    # Split words in string.
    words = string.split()
    
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    
    # Join words in the list back into strings and assign to a variable.
    string_without_stopwords = ' '.join(filtered_words)
    
    return string_without_stopwords

In [18]:
# Use the function defined above
remove_stopwords(article)

'The Indian rupee weakened Monday close new all-time low 77.50 US dollar, 60 paise previous close. During trading session, rupee touched lifetime low 77.52. The currency weighed elevated crude oil prices widening trade deficit.'

6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [19]:
# check news_df dataframe:
news_df.head()

Unnamed: 0,title,content,category
0,Rupee closes at all-time low of 77.50 against ...,The Indian rupee weakened further on Monday to...,business
1,When are you coming to deliver 1st Tesla? Payt...,Paytm CEO Vijay Shekhar Sharma took to Twitter...,business
2,Layout of 'world's first Bitcoin City' in El S...,El Salvador's President Nayib Bukele has share...,business
3,"Bitcoin briefly drops below $30,000 for first ...","Bitcoin, in the early hours of Tuesday, fell b...",business
4,Musk's $44 bn Twitter deal at risk of being re...,Elon Musk's $44 billion offer to buy Twitter c...,business


In [20]:
# use all the functions to see if they work on news_df's content column

news_df['content'].apply(basic_clean)\
.apply(tokenize)\
.apply(lemmatize)\
.apply(remove_stopwords)

0     indian rupee weakened monday close new alltime...
1     paytm ceo vijay shekhar sharma took twitter re...
2     el salvador president nayib bukele ha shared l...
3     bitcoin early hour tuesday fell 30000 first ti...
4     elon musk 44 billion offer buy twitter could g...
                            ...                        
95    actress mrunal thakur speaking 10 year acting ...
96    actor abhishek bachchan led star football club...
97    doctor strange multiverse madness screenwriter...
98    actress sarika said ran money covid19 pandemic...
99    actor sanjay dutt revealed also whistled yashs...
Name: content, Length: 100, dtype: object

7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [21]:
# codeup_df = acquire.get_blogs()

IndexError: list index out of range

In [None]:
# codeup_df.head()

8. For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

In [22]:
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['stemmed'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(stem)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['lemmatized'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(lemmatize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    return df[['title', column,'clean', 'stemmed', 'lemmatized']]

In [23]:
# use the function defined above for news_df's content column.

prep_article_data(news_df, 'content', extra_words = ['ha'], exclude_words = ['no']).head()

Unnamed: 0,title,content,clean,stemmed,lemmatized
0,Rupee closes at all-time low of 77.50 against ...,The Indian rupee weakened further on Monday to...,indian rupee weakened monday close new alltime...,indian rupe weaken monday close new alltim low...,indian rupee weakened monday close new alltime...
1,When are you coming to deliver 1st Tesla? Payt...,Paytm CEO Vijay Shekhar Sharma took to Twitter...,paytm ceo vijay shekhar sharma took twitter re...,paytm ceo vijay shekhar sharma took twitter re...,paytm ceo vijay shekhar sharma took twitter re...
2,Layout of 'world's first Bitcoin City' in El S...,El Salvador's President Nayib Bukele has share...,el salvadors president nayib bukele shared lay...,el salvador presid nayib bukel share layout wo...,el salvador president nayib bukele shared layo...
3,"Bitcoin briefly drops below $30,000 for first ...","Bitcoin, in the early hours of Tuesday, fell b...",bitcoin early hours tuesday fell 30000 first t...,bitcoin earli hour tuesday fell 30000 first ti...,bitcoin early hour tuesday fell 30000 first ti...
4,Musk's $44 bn Twitter deal at risk of being re...,Elon Musk's $44 billion offer to buy Twitter c...,elon musks 44 billion offer buy twitter could ...,elon musk 44 billion offer buy twitter could g...,elon musk 44 billion offer buy twitter could g...


Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?

I would lemmatize it so that the words that are returned are real words. The dataset is small, so I don't see a waste of resources doing this method over stemming

- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?

I would still lemmatize it...25MB isn't to large

- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?

Stemmed, I'll work with what I get before I have to pay, would be very expensive