In [2]:
import pandas as pd
import numpy as np

import os
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from acquire import get_news_articles

## Exercise 1
Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

Normalization is when you perform a series of tasks like making all text lowercase, removing punctuation, expanding contractions, removing anything that's not an ASCII character, etc.

In [3]:
df = get_news_articles(cached=True)
df.head()

Unnamed: 0,topic,title,author,content
0,business,"Lakshmi Vilas Bank withdrawals capped at ₹25,0...",Pragya Swastik,The Centre has imposed a 30-day moratorium on ...
1,business,Shutting Delhi markets may prove counterproduc...,Sakshita Khosla,Traders' body CAIT on Tuesday said a proposal ...
2,business,Pfizer shares drop 4.5% as Moderna says its va...,Krishna Veera Vanamali,Pfizer’s shares fell as much as 4.5% on Monday...
3,business,How does Moderna's COVID-19 vaccine candidate ...,Pragya Swastik,Moderna's initial results of late-stage trial ...
4,business,"Musk gets $15bn richer in 2 hours, becomes wor...",Krishna Veera Vanamali,Billionaire Elon Musk added $15 billion to his...


In [4]:
articles = df.content
print(type(articles))
articles[:5]

<class 'pandas.core.series.Series'>


0    The Centre has imposed a 30-day moratorium on ...
1    Traders' body CAIT on Tuesday said a proposal ...
2    Pfizer’s shares fell as much as 4.5% on Monday...
3    Moderna's initial results of late-stage trial ...
4    Billionaire Elon Musk added $15 billion to his...
Name: content, dtype: object

In [5]:
article = articles[0]
print(type(article))
article

<class 'str'>


'The Centre has imposed a 30-day moratorium on Lakshmi Vilas Bank effective from Tuesday. A withdrawal limit of ₹25,000 with certain exceptions for unforeseen expenses has been imposed for depositors. The RBI said, "The financial position of the bank has undergone a steady decline with continuous losses over the last three years."'

In [7]:
# Make all characters in string lowercase.
# string = string.lower()

article.lower()

'the centre has imposed a 30-day moratorium on lakshmi vilas bank effective from tuesday. a withdrawal limit of ₹25,000 with certain exceptions for unforeseen expenses has been imposed for depositors. the rbi said, "the financial position of the bank has undergone a steady decline with continuous losses over the last three years."'

In [8]:
# # Remove inconsistencies in unicode character encoding.
# string = unicodedata.normalize(form, unistr)
# # Convert string to ASCII character set and drop non-ASCII characters.
# string = string.encode('ascii', 'ignore')
# # Convert the bytes back into a string object.
# string = string.decode('utf-8', 'ignore')


# I have to reassign to my variable if I want to save the changes.

unicodedata.normalize('NFKC', article).encode('ascii', 'ignore').decode('utf-8', 'ignore')

'The Centre has imposed a 30-day moratorium on Lakshmi Vilas Bank effective from Tuesday. A withdrawal limit of 25,000 with certain exceptions for unforeseen expenses has been imposed for depositors. The RBI said, "The financial position of the bank has undergone a steady decline with continuous losses over the last three years."'

In [9]:
# # Remove characters that are not letters, underscores, or spaces.
# string = re.sub(r'[^\w\s]', '', string)

# # Remove characters that are not letters, numbers, single quotes, or spaces.
# string = re.sub(r"[^a-z0-9'\s]", '', string)

re.sub(r'[^\w\s]', '', article)

'The Centre has imposed a 30day moratorium on Lakshmi Vilas Bank effective from Tuesday A withdrawal limit of 25000 with certain exceptions for unforeseen expenses has been imposed for depositors The RBI said The financial position of the bank has undergone a steady decline with continuous losses over the last three years'

In [10]:
def basic_clean(string):
    '''
    This function takes in a string and
    returns the string normalized.
    '''
    string = unicodedata.normalize('NFKC', string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
    string = re.sub(r'[^\w\s]', '', string).lower()
    return string

In [11]:
basic_clean(article)

'the centre has imposed a 30day moratorium on lakshmi vilas bank effective from tuesday a withdrawal limit of 25000 with certain exceptions for unforeseen expenses has been imposed for depositors the rbi said the financial position of the bank has undergone a steady decline with continuous losses over the last three years'

## Exercise 2
Define a function named tokenize. It should take in a string and tokenize all the words in the string.

Tokenization - is when you split larger strings of text into smaller pieces or tokens by setting a boundary. You might chunk a sentence into words using a space as a boundary or a paragraph into sentences using punctuation as a boundary.

In [12]:
# # Create the tokenizer
tokenizer = nltk.tokenize.ToktokTokenizer()

# Use the tokenizer on my string; assign to variable to save changes

tokenizer.tokenize(article, return_str=True)

'The Centre has imposed a 30-day moratorium on Lakshmi Vilas Bank effective from Tuesday. A withdrawal limit of ₹ 25,000 with certain exceptions for unforeseen expenses has been imposed for depositors. The RBI said , " The financial position of the bank has undergone a steady decline with continuous losses over the last three years . "'

In [13]:
def tokenize(string):
    '''
    This function takes in a string and
    returns a tokenized string.
    '''
    # Create tokenizer.
    tokenizer = nltk.tokenize.ToktokTokenizer()
    
    # Use tokenizer
    string = tokenizer.tokenize(string, return_str=True)
    
    return string

In [14]:
tokenize(article)

'The Centre has imposed a 30-day moratorium on Lakshmi Vilas Bank effective from Tuesday. A withdrawal limit of ₹ 25,000 with certain exceptions for unforeseen expenses has been imposed for depositors. The RBI said , " The financial position of the bank has undergone a steady decline with continuous losses over the last three years . "'

## Exercise 3
Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

Stemming is when you reduce related words in your text to their common stem. It can make it easier when you are searching for a particular word in your text to search for their common stem rather than every form of the word. Stemmers aren't that sophisticated in the way they chop off word endings at their common stems; Spacy, another python NLP library, doesn't even include a stemmer in their library. Spacy only offers the more sophisticated lemmatizer, which we will look at in NLTK next.


In [15]:
# Create porter stemmer.

ps = nltk.porter.PorterStemmer()

In [16]:
# Apply the stemmer to each word in our string.

stems = [ps.stem(word) for word in article.split()]
stems[:10]

['the',
 'centr',
 'ha',
 'impos',
 'a',
 '30-day',
 'moratorium',
 'on',
 'lakshmi',
 'vila']

In [17]:
# Join our lists of words into a string again; assign to a variable to save changes

' '.join(stems)

'the centr ha impos a 30-day moratorium on lakshmi vila bank effect from tuesday. A withdraw limit of ₹25,000 with certain except for unforeseen expens ha been impos for depositors. the rbi said, "the financi posit of the bank ha undergon a steadi declin with continu loss over the last three years."'

In [18]:
def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.
    '''
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    
    # Use the stemmer to stem each word in the list of words we created by using split.
    stems = [ps.stem(word) for word in string.split()]
    
    # Join our lists of words into a string again and assign to a variable.
    string = ' '.join(stems)
    
    return string

In [19]:
stem(article)

'the centr ha impos a 30-day moratorium on lakshmi vila bank effect from tuesday. A withdraw limit of ₹25,000 with certain except for unforeseen expens ha been impos for depositors. the rbi said, "the financi posit of the bank ha undergon a steadi declin with continu loss over the last three years."'

## Exercise 4
Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

Lemmatization - is when you reduce related words in your text to their lemma or word base by applying a morphological analysis to your text. Like stemming, this is done to reduce the number of forms you have of the same word, so they can be analyzed as a single item. While stemming might create tokens that are not actually words anymore after they have been chopped off at their base, lemmatization will leave you with real words. A drawback to lemmatization is that it takes longer than stemming; you can try both to see which gives you better results as you analyze a given text.


In [20]:
# Create the Lemmatizer.

wnl = nltk.stem.WordNetLemmatizer()

In [21]:
# Check lemmatizer. It works.

wnl.lemmatize('Calls')

'Calls'

In [22]:
# Use the lemmatizer on each word in the list of words we created by using split.

lemmas = [wnl.lemmatize(word) for word in article.split()]
lemmas[:10]

['The',
 'Centre',
 'ha',
 'imposed',
 'a',
 '30-day',
 'moratorium',
 'on',
 'Lakshmi',
 'Vilas']

In [23]:
# Join our list of words into a string again; assign to a variable to save changes.

' '.join(lemmas)

'The Centre ha imposed a 30-day moratorium on Lakshmi Vilas Bank effective from Tuesday. A withdrawal limit of ₹25,000 with certain exception for unforeseen expense ha been imposed for depositors. The RBI said, "The financial position of the bank ha undergone a steady decline with continuous loss over the last three years."'

In [25]:
def lemmatize(string):
    '''
    This function takes in string for and
    returns a string with words lemmatized.
    '''
    # Create the lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    
    # Use the lemmatizer on each word in the list of words we created by using split.
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    
    # Join our list of words into a string again and assign to a variable.
    string = ' '.join(lemmas)
    
    return string

In [26]:
lemmatize(article)

'The Centre ha imposed a 30-day moratorium on Lakshmi Vilas Bank effective from Tuesday. A withdrawal limit of ₹25,000 with certain exception for unforeseen expense ha been imposed for depositors. The RBI said, "The financial position of the bank ha undergone a steady decline with continuous loss over the last three years."'

## Exercise 5
Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

Stopwords - are words which are filtered out during the preparation of your text for analyzation and modeling. Stopwords are those that offer little to the meaning of your text and are basically just adding noise to your analysis. Or, as Ryan Orsinger would say, "Stopwords aren't the real story of the document." Words such as 'the', 'and', 'a', and the like can be removed, so you can better focus on the good stuff.

In [27]:
# Create the list of stopwords.

stopword_list = stopwords.words('english')
len(stopword_list)

179

In [28]:
# Split words in lemmatized column.

words = article.split()
words[:10]

['The',
 'Centre',
 'has',
 'imposed',
 'a',
 '30-day',
 'moratorium',
 'on',
 'Lakshmi',
 'Vilas']

In [29]:
# Create a list of words from my string with stopwords removed and assign to variable.

filtered_words = [word for word in words if word not in stopword_list]
filtered_words[:10]

['The',
 'Centre',
 'imposed',
 '30-day',
 'moratorium',
 'Lakshmi',
 'Vilas',
 'Bank',
 'effective',
 'Tuesday.']

In [30]:
# Join words in the list back into strings; assign to a variable to keep changes.

' '.join(filtered_words)

'The Centre imposed 30-day moratorium Lakshmi Vilas Bank effective Tuesday. A withdrawal limit ₹25,000 certain exceptions unforeseen expenses imposed depositors. The RBI said, "The financial position bank undergone steady decline continuous losses last three years."'

In [31]:
def remove_stopwords(string, extra_words=[], exclude_words=[]):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # Create stopword_list.
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep these in my text.
    stopword_list = set(stopword_list) - set(exclude_words)

    # Add in 'extra_words' to stopword_list.
    stopword_list = stopword_list.union(set(extra_words))
    
    # Split words in string.
    words = string.split()
    
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    
    # Join words in the list back into strings and assign to a variable.
    string_without_stopwords = ' '.join(filtered_words)
    
    return string_without_stopwords

In [32]:
remove_stopwords(article)

'The Centre imposed 30-day moratorium Lakshmi Vilas Bank effective Tuesday. A withdrawal limit ₹25,000 certain exceptions unforeseen expenses imposed depositors. The RBI said, "The financial position bank undergone steady decline continuous losses last three years."'

In [39]:
# Test my function for adding extra_words to my stopword list and removing exclude_words. 

remove_stopwords(article, extra_words=['Centre'], exclude_words=['i', 'me'])

'The imposed 30-day moratorium Lakshmi Vilas Bank effective Tuesday. A withdrawal limit ₹25,000 certain exceptions unforeseen expenses imposed depositors. The RBI said, "The financial position bank undergone steady decline continuous losses last three years."'

## Exercise 6
Define a function named prep_article that takes in the dictionary representing an article and returns a dictionary that looks like this:

{
    'title': 'the original title'.
    'original': original,
    'stemmed': article_stemmed,
    'lemmatized': article_lemmatized,
    'clean': article_without_stopwords
}
Note that if the orignal dictionary has a title property, it should remain unchanged (same goes for the category property).

In [40]:
# I'm checking my code before I throw it in my function; always check it first!

df['content'].apply(basic_clean)\
             .apply(tokenize)\
             .apply(remove_stopwords)\
             .apply(lemmatize)

0     centre imposed 30day moratorium lakshmi vila b...
1     trader body cait tuesday said proposal impose ...
2     pfizers share fell much 45 monday rival modern...
3     modernas initial result latestage trial show c...
4     billionaire elon musk added 15 billion wealth ...
                            ...                        
94    recent interview singer amaal mallik mentioned...
95    actor suriya sivakumar said belief indian film...
96    actor aditya roy kapur start shooting upcoming...
97    sharing picture set upcoming film jug jugg jee...
98    actor sonu sood appointed state icon punjab el...
Name: content, Length: 99, dtype: object

In [41]:
df = get_news_articles(cached=True)
df.head(2)

Unnamed: 0,topic,title,author,content
0,business,"Lakshmi Vilas Bank withdrawals capped at ₹25,0...",Pragya Swastik,The Centre has imposed a 30-day moratorium on ...
1,business,Shutting Delhi markets may prove counterproduc...,Sakshita Khosla,Traders' body CAIT on Tuesday said a proposal ...


In [42]:
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)\
                            .apply(lemmatize)
    
    df['stemmed'] = df[column].apply(basic_clean).apply(stem)
    
    df['lemmatized'] = df[column].apply(basic_clean).apply(lemmatize)
    
    return df[['title', column, 'stemmed', 'lemmatized', 'clean']]

In [43]:
df = prep_article_data(df, 'content')
df.head()

Unnamed: 0,title,content,stemmed,lemmatized,clean
0,"Lakshmi Vilas Bank withdrawals capped at ₹25,0...",The Centre has imposed a 30-day moratorium on ...,the centr ha impos a 30day moratorium on laksh...,the centre ha imposed a 30day moratorium on la...,centre imposed 30day moratorium lakshmi vila b...
1,Shutting Delhi markets may prove counterproduc...,Traders' body CAIT on Tuesday said a proposal ...,trader bodi cait on tuesday said a propos to i...,trader body cait on tuesday said a proposal to...,trader body cait tuesday said proposal impose ...
2,Pfizer shares drop 4.5% as Moderna says its va...,Pfizer’s shares fell as much as 4.5% on Monday...,pfizer share fell as much as 45 on monday afte...,pfizers share fell a much a 45 on monday after...,pfizers share fell much 45 monday rival modern...
3,How does Moderna's COVID-19 vaccine candidate ...,Moderna's initial results of late-stage trial ...,moderna initi result of latestag trial show it...,modernas initial result of latestage trial sho...,modernas initial result latestage trial show c...
4,"Musk gets $15bn richer in 2 hours, becomes wor...",Billionaire Elon Musk added $15 billion to his...,billionair elon musk ad 15 billion to hi wealt...,billionaire elon musk added 15 billion to his ...,billionaire elon musk added 15 billion wealth ...


In [44]:
prep_article_data(df, 'content', extra_words=['centre', 'bank'], exclude_words=['i', 'me'])

Unnamed: 0,title,content,stemmed,lemmatized,clean
0,"Lakshmi Vilas Bank withdrawals capped at ₹25,0...",The Centre has imposed a 30-day moratorium on ...,the centr ha impos a 30day moratorium on laksh...,the centre ha imposed a 30day moratorium on la...,imposed 30day moratorium lakshmi vila effectiv...
1,Shutting Delhi markets may prove counterproduc...,Traders' body CAIT on Tuesday said a proposal ...,trader bodi cait on tuesday said a propos to i...,trader body cait on tuesday said a proposal to...,trader body cait tuesday said proposal impose ...
2,Pfizer shares drop 4.5% as Moderna says its va...,Pfizer’s shares fell as much as 4.5% on Monday...,pfizer share fell as much as 45 on monday afte...,pfizers share fell a much a 45 on monday after...,pfizers share fell much 45 monday rival modern...
3,How does Moderna's COVID-19 vaccine candidate ...,Moderna's initial results of late-stage trial ...,moderna initi result of latestag trial show it...,modernas initial result of latestage trial sho...,modernas initial result latestage trial show c...
4,"Musk gets $15bn richer in 2 hours, becomes wor...",Billionaire Elon Musk added $15 billion to his...,billionair elon musk ad 15 billion to hi wealt...,billionaire elon musk added 15 billion to his ...,billionaire elon musk added 15 billion wealth ...
...,...,...,...,...,...
94,"I'm a romantic at heart, that influences my so...","During a recent interview, singer Amaal Mallik...",dure a recent interview singer amaal mallik me...,during a recent interview singer amaal mallik ...,recent interview singer amaal mallik mentioned...
95,"Heroes can be larger than life, but emotions h...",Actor Suriya Sivakumar said he believes that I...,actor suriya sivakumar said he believ that ind...,actor suriya sivakumar said he belief that ind...,actor suriya sivakumar said belief indian film...
96,Aditya to start shooting for upcoming action f...,Actor Aditya Roy Kapur will start shooting for...,actor aditya roy kapur will start shoot for hi...,actor aditya roy kapur will start shooting for...,actor aditya roy kapur start shooting upcoming...
97,Feeling a little scared: Neetu on shooting for...,Sharing a picture from the sets of her upcomin...,share a pictur from the set of her upcom film ...,sharing a picture from the set of her upcoming...,sharing picture set upcoming film jug jugg jee...
