In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

In [2]:
# Fetch the NLTK stopwords corpus; remove_stopwords reads it via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/craigcalzado/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Define the news categories to acquire
categories = ["business", "sports", "technology", "entertainment"]

# Use the get_all_news_articles function from the local acquire module (acquire.py)

news_df = acquire.get_all_news_articles(categories)

In [4]:
# Preview the first rows of the acquired news dataframe
news_df.head()

Unnamed: 0,title,content,category
0,Rupee closes at all-time low of 77.50 against ...,The Indian rupee weakened further on Monday to...,business
1,India's biggest IPO of LIC subscribed nearly 3...,"LIC's IPO, India's biggest IPO which opened on...",business
2,"Office as we know it, is over: Airbnb CEO on l...",After Airbnb allowed its employees to work rem...,business
3,Twitter will comply with EU content rules afte...,Tesla CEO Elon Musk has said that Twitter will...,business
4,When are you coming to deliver 1st Tesla? Payt...,Paytm CEO Vijay Shekhar Sharma took to Twitter...,business


In [5]:
# Use the content of the first news item as 'article', the sample text
# that the cells below pass to each function.

article = news_df.content[0]
article

'The Indian rupee weakened further on Monday to close at a new all-time low of 77.50 against the US dollar, 60 paise over its previous close. During the trading session, the rupee touched its lifetime low of 77.52. The currency was weighed down by elevated crude oil prices and a widening trade deficit.'

1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Remove (replace with an empty string) anything that is not a letter, number, whitespace, or a single quote

In [6]:
def basic_clean(string):
    '''
    Lowercase a string, reduce it to ASCII, and strip unwanted characters.

    The text is unicode-normalized (NFKD) and encoded to ASCII with errors
    ignored, which drops every character that has no ASCII form after
    decomposition. It is then lowercased, and every character that is not
    a letter, a digit, whitespace, or a single quote is removed.

    Parameters
    ----------
    string : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, lowercased, ASCII-only text.
    '''
    ascii_text = unicodedata.normalize('NFKD', string)\
                 .encode('ascii', 'ignore')\
                 .decode('utf-8', 'ignore')
    # Lowercase before filtering so the whitelist below only needs a-z.
    lowered = ascii_text.lower()
    # Explicit whitelist instead of r'[^\w\s]': \w also matches '_', and
    # that pattern stripped single quotes (turning "it's" into "its").
    return re.sub(r"[^a-z0-9'\s]", '', lowered)

In [7]:
# Run basic_clean on the sample article and keep the result
article_clean = basic_clean(article)

In [8]:
# Display the cleaned article to check the result of basic_clean
article_clean

'the indian rupee weakened further on monday to close at a new alltime low of 7750 against the us dollar 60 paise over its previous close during the trading session the rupee touched its lifetime low of 7752 the currency was weighed down by elevated crude oil prices and a widening trade deficit'

2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [9]:
def tokenize(string):
    '''
    Tokenize text with NLTK's ToktokTokenizer.

    Takes the text as a single string and returns the tokenizer's
    string-form result (return_str=True) rather than a list of tokens.
    '''
    return nltk.tokenize.ToktokTokenizer().tokenize(string, return_str=True)

In [10]:
# Tokenize the raw article (not article_clean) to see how punctuation is split off

tokenize(article)

'The Indian rupee weakened further on Monday to close at a new all-time low of 77.50 against the US dollar , 60 paise over its previous close. During the trading session , the rupee touched its lifetime low of 77.52. The currency was weighed down by elevated crude oil prices and a widening trade deficit .'

3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [11]:
def stem(string):
    '''
    Apply Porter stemming to every whitespace-separated word in a string.

    Parameters
    ----------
    string : str
        Text to stem. It is split on whitespace, so punctuation attached
        to a word stays part of that token.

    Returns
    -------
    str
        The stems joined back together with single spaces.
    '''
    # Reach the stemmer through the nltk.stem package, the same way
    # lemmatize reaches WordNetLemmatizer, instead of the undocumented
    # top-level nltk.porter shortcut.
    ps = nltk.stem.PorterStemmer()

    # Stem word by word and rebuild a single string.
    return ' '.join(ps.stem(word) for word in string.split())

In [12]:
# Stem the raw article (it has not been through basic_clean or tokenize)
stem(article)

'the indian rupe weaken further on monday to close at a new all-tim low of 77.50 against the us dollar, 60 pais over it previou close. dure the trade session, the rupe touch it lifetim low of 77.52. the currenc wa weigh down by elev crude oil price and a widen trade deficit.'

4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [15]:
# NOTE(review): nltk is already imported in the first cell; this re-import is redundant.
import nltk

In [16]:
# Fetch the WordNet corpus needed by the WordNetLemmatizer used in lemmatize below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/craigcalzado/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [17]:
def lemmatize(string):
    '''
    Lemmatize each whitespace-separated word of a string.

    Every word is passed through NLTK's WordNetLemmatizer and the results
    are joined back into one string with single spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, string.split()))

In [18]:
# Lemmatize the raw article (it has not been through basic_clean or tokenize)
lemmatize(article)

'The Indian rupee weakened further on Monday to close at a new all-time low of 77.50 against the US dollar, 60 paisa over it previous close. During the trading session, the rupee touched it lifetime low of 77.52. The currency wa weighed down by elevated crude oil price and a widening trade deficit.'

5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

In [20]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    '''
    Remove English stopwords from a string.

    Parameters
    ----------
    string : str
        Text to filter. It is split on whitespace and each word is
        compared to the stopword set exactly as written, so the match is
        case-sensitive and punctuation attached to a word is part of it.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords. Defaults to none.
    exclude_words : iterable of str, optional
        Words to take out of the stopword set so they stay in the text.
        Defaults to none.

    Returns
    -------
    str
        The remaining words joined with single spaces.
    '''
    # None sentinels replace the mutable [] defaults, which are shared
    # between calls; omitting either argument behaves as before.
    if extra_words is None:
        extra_words = ()
    if exclude_words is None:
        exclude_words = ()

    # Start from NLTK's English stopwords, drop the words to keep, then
    # add the caller's extra stopwords (same order as the original, so a
    # word listed in both ends up removed from the text).
    stopword_set = set(stopwords.words('english'))
    stopword_set -= set(exclude_words)
    stopword_set |= set(extra_words)

    # Keep only the words that are not in the final stopword set.
    kept_words = [word for word in string.split() if word not in stopword_set]

    return ' '.join(kept_words)

In [21]:
# Remove stopwords from the raw article with the default extra_words / exclude_words
remove_stopwords(article)

'The Indian rupee weakened Monday close new all-time low 77.50 US dollar, 60 paise previous close. During trading session, rupee touched lifetime low 77.52. The currency weighed elevated crude oil prices widening trade deficit.'