In [1]:
import pandas as pd
import numpy as np

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import acquire


In [2]:
original = "The price of a domestic 14.2 kg-LPG cylinder has been increased by \u20b950/unit and it will now cost \
\u20b91,103/unit in Delhi. The price of a commercial 19 kg-LPG cylinder has also been hiked by \u20b9350.50/unit, \
raising the cost to \u20b92,119.50/unit in the national capital. This is the second time the price of commercial LPG\
cylinders has been hiked this year."

#### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [3]:
def basic_clean(original):
    original = original.lower()
    original = unicodedata.normalize('NFKD', original).encode('ascii', 'ignore').decode('utf-8')
    original = re.sub(r'[^a-z0-9\s]', '', original)
    return original
    

In [4]:
cleaned = basic_clean(original)
cleaned

'the price of a domestic 142 kglpg cylinder has been increased by 50unit and it will now cost 1103unit in delhi the price of a commercial 19 kglpg cylinder has also been hiked by 35050unit raising the cost to 211950unit in the national capital this is the second time the price of commercial lpgcylinders has been hiked this year'

#### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [5]:
def tokenize(original):
    tokenize = nltk.tokenize.ToktokTokenizer()
    original = tokenize.tokenize(original)
    return original

In [6]:
tokenized = tokenize(cleaned)
tokenized

['the',
 'price',
 'of',
 'a',
 'domestic',
 '142',
 'kglpg',
 'cylinder',
 'has',
 'been',
 'increased',
 'by',
 '50unit',
 'and',
 'it',
 'will',
 'now',
 'cost',
 '1103unit',
 'in',
 'delhi',
 'the',
 'price',
 'of',
 'a',
 'commercial',
 '19',
 'kglpg',
 'cylinder',
 'has',
 'also',
 'been',
 'hiked',
 'by',
 '35050unit',
 'raising',
 'the',
 'cost',
 'to',
 '211950unit',
 'in',
 'the',
 'national',
 'capital',
 'this',
 'is',
 'the',
 'second',
 'time',
 'the',
 'price',
 'of',
 'commercial',
 'lpgcylinders',
 'has',
 'been',
 'hiked',
 'this',
 'year']

#### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [7]:
def stem(original):
    ps = nltk.porter.PorterStemmer()
    stem = [ps.stem(word) for word in original]
    return ''.join(stem)

In [8]:
stem(cleaned)

'the price of a domestic 142 kglpg cylinder has been increased by 50unit and it will now cost 1103unit in delhi the price of a commercial 19 kglpg cylinder has also been hiked by 35050unit raising the cost to 211950unit in the national capital this is the second time the price of commercial lpgcylinders has been hiked this year'

#### 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [9]:
def lemmatize(original):
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word)for word in original]
    return ''.join(lemmas)


In [10]:
lemmatize(cleaned)

'the price of a domestic 142 kglpg cylinder has been increased by 50unit and it will now cost 1103unit in delhi the price of a commercial 19 kglpg cylinder has also been hiked by 35050unit raising the cost to 211950unit in the national capital this is the second time the price of commercial lpgcylinders has been hiked this year'

#### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [11]:
def remove_stopwords(original, extra_words=[], exclude_words=[]):
    stopword_list = stopwords.words('english')
#     stopword_list.remove('the') # takes in one at time
#     stopword_list.append('jpt') 
#     stopword_list.extend(['jpt','sd']) 
    filtered_words = [w for w in original.split()if w not in stopword_list]
    return filtered_words
    
    

In [12]:
# stopword_list = stopwords.words('english')
# stopword_list 

In [13]:
remove_stopwords(cleaned)

['price',
 'domestic',
 '142',
 'kglpg',
 'cylinder',
 'increased',
 '50unit',
 'cost',
 '1103unit',
 'delhi',
 'price',
 'commercial',
 '19',
 'kglpg',
 'cylinder',
 'also',
 'hiked',
 '35050unit',
 'raising',
 'cost',
 '211950unit',
 'national',
 'capital',
 'second',
 'time',
 'price',
 'commercial',
 'lpgcylinders',
 'hiked',
 'year']

#### 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [14]:
news_df = pd.DataFrame(acquire.get_news_articles(['business', 'sports', 'technology', 'entertainment']))

In [15]:
news_df.head()

Unnamed: 0,category,title,content
0,business,All Adani stocks end higher for the first time...,All 10 Adani Group stocks closed higher on Wed...
1,business,"Bill Gates meets Ratan Tata, N Chandrasekaran;...",Microsoft Co-founder Bill Gates met with Tata ...
2,business,SoftBank sells shares worth ₹954 crore in logi...,SoftBank sold shares worth ₹954 crore in logis...
3,business,Smriti Irani's 2011 tweet on LPG price hike re...,Hours after the central government raised the ...
4,business,"Indian-Americans Renjen, Subramaniam to be mem...",Indian-Americans Punit Renjen and Rajesh Subra...


#### 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [16]:
codeup_df = pd.DataFrame(acquire.get_blog_articles())

In [17]:
codeup_df.head()

Unnamed: 0,title,content
0,Black Excellence in Tech: Panelist Spotlight –...,\nBlack excellence in tech: Panelist Spotlight...
1,Black excellence in tech: Panelist Spotlight –...,\nBlack excellence in tech: Panelist Spotlight...
2,Black excellence in tech: Panelist Spotlight –...,\nBlack excellence in tech: Panelist Spotlight...
3,Black excellence in tech: Panelist Spotlight –...,\nBlack excellence in tech: Panelist Spotlight...
4,Coding Bootcamp or Self-Learning? Which is Bes...,\nIf you’re interested in embarking on a caree...


#### 8. For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
clean to hold the normalized and tokenized original with the stopwords removed.
stemmed to hold the stemmed version of the cleaned data.
lemmatized to hold the lemmatized version of the cleaned data.

In [18]:
codeup_df.rename(columns={'content': 'original'}, inplace=True)

In [19]:
codeup_df['clean'] = codeup_df['original'].apply(basic_clean)

In [20]:
codeup_df['stemmed'] = codeup_df['clean'].apply(stem)

In [21]:
codeup_df['lemmatized'] = codeup_df['clean'].apply(lemmatize)

In [22]:
codeup_df.head()

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Black Excellence in Tech: Panelist Spotlight –...,\nBlack excellence in tech: Panelist Spotlight...,\nblack excellence in tech panelist spotlight ...,\nblack excellence in tech panelist spotlight ...,\nblack excellence in tech panelist spotlight ...
1,Black excellence in tech: Panelist Spotlight –...,\nBlack excellence in tech: Panelist Spotlight...,\nblack excellence in tech panelist spotlight ...,\nblack excellence in tech panelist spotlight ...,\nblack excellence in tech panelist spotlight ...
2,Black excellence in tech: Panelist Spotlight –...,\nBlack excellence in tech: Panelist Spotlight...,\nblack excellence in tech panelist spotlight ...,\nblack excellence in tech panelist spotlight ...,\nblack excellence in tech panelist spotlight ...
3,Black excellence in tech: Panelist Spotlight –...,\nBlack excellence in tech: Panelist Spotlight...,\nblack excellence in tech panelist spotlight ...,\nblack excellence in tech panelist spotlight ...,\nblack excellence in tech panelist spotlight ...
4,Coding Bootcamp or Self-Learning? Which is Bes...,\nIf you’re interested in embarking on a caree...,\nif youre interested in embarking on a career...,\nif youre interested in embarking on a career...,\nif youre interested in embarking on a career...


In [27]:
news_df.rename(columns={'content': 'original'}, inplace=True)

In [28]:
news_df['clean'] = news_df['original'].apply(basic_clean)

In [29]:
news_df['stemmed'] = news_df['clean'].apply(stem)

In [30]:
news_df['lemmatized'] = news_df['clean'].apply(lemmatize)

In [31]:
news_df.head()

Unnamed: 0,category,title,original,clean,stemmed,lemmatized
0,business,All Adani stocks end higher for the first time...,All 10 Adani Group stocks closed higher on Wed...,all 10 adani group stocks closed higher on wed...,all 10 adani group stocks closed higher on wed...,all 10 adani group stocks closed higher on wed...
1,business,"Bill Gates meets Ratan Tata, N Chandrasekaran;...",Microsoft Co-founder Bill Gates met with Tata ...,microsoft cofounder bill gates met with tata s...,microsoft cofounder bill gates met with tata s...,microsoft cofounder bill gates met with tata s...
2,business,SoftBank sells shares worth ₹954 crore in logi...,SoftBank sold shares worth ₹954 crore in logis...,softbank sold shares worth 954 crore in logist...,softbank sold shares worth 954 crore in logist...,softbank sold shares worth 954 crore in logist...
3,business,Smriti Irani's 2011 tweet on LPG price hike re...,Hours after the central government raised the ...,hours after the central government raised the ...,hours after the central government raised the ...,hours after the central government raised the ...
4,business,"Indian-Americans Renjen, Subramaniam to be mem...",Indian-Americans Punit Renjen and Rajesh Subra...,indianamericans punit renjen and rajesh subram...,indianamericans punit renjen and rajesh subram...,indianamericans punit renjen and rajesh subram...


#### 9. Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?

In [None]:
lemmatized

- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?

In [None]:
lemmatized

- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?

In [None]:
stemmed