In [1]:
import unicodedata
import re
import json

import nltk
nltk.download('stopwords')
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gregmaggard/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Acquiring Data:

# Exercises:

## 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [2]:
def basic_clean(string):
    string = string.lower()
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    string = re.sub(r"[^\da-z\'\s]", '', string)
    return string

In [3]:
basic_clean("į åm ällergic to pótatòēs. It makes me sad.")

'i am allergic to potatoes it makes me sad'

## 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [4]:
def tokenize(string):
    tokenizer = nltk.tokenize.ToktokTokenizer()
    string = tokenizer.tokenize(string, return_str=True)
    return string

In [5]:
tokenize("I'm taking my favorite puppy to the grooming salon.")

"I ' m taking my favorite puppy to the grooming salon ."

## 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [6]:
def stem(string):
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in string.split()]
    string_stemmed = ' '.join(stems)
    return string_stemmed

In [7]:
stem("I'm taking my favorite puppy to the grooming salon.")

"i'm take my favorit puppi to the groom salon."

## 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [8]:
def lemmatize(string):
    ps = nltk.porter.PorterStemmer()
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    return lemmas

In [9]:
lemmatize("I took my favorite puppy to the grooming salon.")

['I', 'took', 'my', 'favorite', 'puppy', 'to', 'the', 'grooming', 'salon.']

##  5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [22]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    stopword_list = stopwords.words('english')
    words = string.split()
    filtered_words = [w for w in words if w not in stopword_list]
    string_without_stopwords = ' '.join(filtered_words)
    return string_without_stopwords

In [11]:
remove_stopwords("We are such stuff as dreams are made on.")

'We stuff dreams made on.'

## 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [14]:
categories = ['business', 'sports', 'technology', 'entertainment']

news_df = acquire.get_shorts_articles(categories, refresh = True)

In [15]:
news_df.head()

Unnamed: 0,title,contents,category
0,Rupee hits 80 per US dollar for the first time...,The Indian rupee touched 80 per US dollar for ...,business
1,ED arrests ex-Mumbai Police chief Sanjay Pande...,The Enforcement Directorate (ED) on Tuesday ar...,business
2,Who are now the world's 10 richest people as A...,Gautam Adani has overtaken Bill Gates to becom...,business
3,Gautam Adani overtakes Bill Gates to become wo...,Gautam Adani has overtaken Bill Gates to becom...,business
4,List of items exempt from GST when sold loose ...,Amid criticism over pre-packaged and pre-label...,business


## 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [None]:
urls = ['https://codeup.com/data-science/jobs-after-a-coding-bootcamp-part-1-data-science/', 
        'https://codeup.com/featured/what-jobs-can-you-get-after-a-coding-bootcamp-part-2-cloud-administration/',
        'https://codeup.com/tips-for-prospective-students/is-our-cloud-administration-program-right-for-you/',
        'https://codeup.com/tips-for-prospective-students/mental-health-first-aid-training/',
        'https://codeup.com/codeup-news/inclusion-at-codeup-during-pride-month-and-always/']

blog_df = acquire.get_blog_articles(urls, refresh = True)

## 8. For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

In [33]:
def prep_article_data(df, column, extra_words = [], exclude_words = []):
    '''
    Takes in a DataFrame and a column name in string format, optionally 
    including lists of extra_words and exclude_words. Returns
    a DataFrame with columns listing the title, body of text, 
    stemmed text, lemmatized text, and the cleaned, tokenized, 
    and lemmatized text with stopwords removed. 
    '''
    
    #Creating a column named 'cleaned' that has the following cleaning functions applied to the text:
    df['cleaned'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords,
                                  extra_words=extra_words,
                                  exclude_words=exclude_words)
    
    #Creating a separate column that has the stemmed version of the cleaned text:
    df['stemmed'] = df['cleaned'].apply(stem)
    
    #Creating a separate column that has the lemmatized version of the cleaned text:
    df['lemmatized'] = df['cleaned'].apply(lemmatize)
    
    #returning the DataFrame's new columns:
    return df[['title', column, 'cleaned', 'stemmed', 'lemmatized']]

In [34]:
prep_article_data(news_df, 'contents')

Unnamed: 0,title,contents,cleaned,stemmed,lemmatized
0,Rupee hits 80 per US dollar for the first time...,The Indian rupee touched 80 per US dollar for ...,indian rupee touched 80 per us dollar first ti...,indian rupe touch 80 per us dollar first time ...,"[indian, rupee, touched, 80, per, u, dollar, f..."
1,ED arrests ex-Mumbai Police chief Sanjay Pande...,The Enforcement Directorate (ED) on Tuesday ar...,enforcement directorate ed tuesday arrested fo...,enforc director ed tuesday arrest former mumba...,"[enforcement, directorate, ed, tuesday, arrest..."
2,Who are now the world's 10 richest people as A...,Gautam Adani has overtaken Bill Gates to becom...,gautam adani overtaken bill gates become world...,gautam adani overtaken bill gate becom world '...,"[gautam, adani, overtaken, bill, gate, become,..."
3,Gautam Adani overtakes Bill Gates to become wo...,Gautam Adani has overtaken Bill Gates to becom...,gautam adani overtaken bill gates become world...,gautam adani overtaken bill gate becom world '...,"[gautam, adani, overtaken, bill, gate, become,..."
4,List of items exempt from GST when sold loose ...,Amid criticism over pre-packaged and pre-label...,amid criticism prepackaged prelabelled food it...,amid critic prepackag prelabel food item get c...,"[amid, criticism, prepackaged, prelabelled, fo..."
...,...,...,...,...,...
20,Joe Russo arrives in Mumbai ahead of 'The Gray...,Filmmaker Joe Russo has arrived in Mumbai ahea...,filmmaker joe russo arrived mumbai ahead premi...,filmmak joe russo arriv mumbai ahead premier u...,"[filmmaker, joe, russo, arrived, mumbai, ahead..."
21,I salute Sushmita Sen for living life on her o...,Filmmaker Mahesh Bhatt defended Sushmita Sen a...,filmmaker mahesh bhatt defended sushmita sen t...,filmmak mahesh bhatt defend sushmita sen troll...,"[filmmaker, mahesh, bhatt, defended, sushmita,..."
22,Fans erect 75-feet banner of 'Liger' actor Vij...,"Fans of actor Vijay Deverakonda, who will be s...",fans actor vijay deverakonda seen film ' liger...,fan actor vijay deverakonda seen film ' liger ...,"[fan, actor, vijay, deverakonda, seen, film, '..."
23,Told Salman 'You won't look back after this fi...,Discussing the film 'Partner' where he starred...,discussing film ' partner ' starred actor salm...,discuss film ' partner ' star actor salman kha...,"[discussing, film, ', partner, ', starred, act..."


## 9. Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?