In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

# Plan for parsing the text data:

- Convert all text to lowercase for consistency.


- Remove accents and drop any remaining non-ASCII characters.


- Remove special characters.


- Stem or lemmatize the words.


- Remove stopwords.


- Store the clean text and the original text for use in future notebooks.

# Acquire df

In [2]:
# Fetch the article data through the project's acquire module.
original = acquire.get_article_text()
# Check the column dtypes before any cleaning is applied.
original.dtypes

Unnamed: 0     int64
body          object
title         object
dtype: object

## Drop Unnamed: 0 column

In [3]:
# Work on a copy so `original` stays untouched, and leave out the 'Unnamed: 0' column.
prepped = original.copy().drop(columns='Unnamed: 0')
prepped.dtypes

body     object
title    object
dtype: object

## Normalize Using .lower() and .strip()

In [4]:
# Lowercase the body text, then trim surrounding whitespace, in one chained expression.
prepped['body'] = prepped.body.str.lower().str.strip()

In [5]:
# Inspect the frame after lowercasing and stripping the body text.
prepped

Unnamed: 0,body,title
0,the rumors are true! the time has arrived. cod...,Codeup’s Data Science Career Accelerator is He...
1,by dimitri antoniou and maggie giust\ndata sci...,Data Science Myths - Codeup
2,"by dimitri antoniou\na week ago, codeup launch...",Data Science VS Data Analytics: What’s The Dif...
3,10 tips to crush it at the sa tech job fair\ns...,10 Tips to Crush It at the SA Tech Job Fair - ...
4,competitor bootcamps are closing. is the model...,Competitor Bootcamps Are Closing. Is the Model...


## Normalize Function Using NFKD

In [6]:
def normalize(string):
    """Strip accents and drop every character that has no ASCII form.

    The text is NFKD-decomposed so an accented letter becomes a base letter
    followed by combining marks; encoding to ASCII with ``'ignore'`` then
    discards the marks together with any other non-ASCII code point.
    """
    decomposed = unicodedata.normalize('NFKD', string)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')
    

## Remove Special Characters

In [7]:
def remove_special_characters(string):
    """Delete every character that is not a-z, 0-9, an apostrophe or whitespace.

    Uppercase letters are outside the allowed set, so the input is expected
    to be lowercased already.
    """
    disallowed = re.compile(r"[^a-z0-9'\s]")
    return disallowed.sub('', string)

In [8]:
# Apply `normalize` first so accented letters are reduced to their ASCII base
# letter; without it the character filter would delete them outright
# (e.g. "café" would become "caf" instead of "cafe").
prepped.body = prepped.body.apply(normalize).apply(remove_special_characters)

In [9]:
# Inspect the frame after special characters were removed from the body text.
prepped

Unnamed: 0,body,title
0,the rumors are true the time has arrived codeu...,Codeup’s Data Science Career Accelerator is He...
1,by dimitri antoniou and maggie giust\ndata sci...,Data Science Myths - Codeup
2,by dimitri antoniou\na week ago codeup launche...,Data Science VS Data Analytics: What’s The Dif...
3,10 tips to crush it at the sa tech job fair\ns...,10 Tips to Crush It at the SA Tech Job Fair - ...
4,competitor bootcamps are closing is the model ...,Competitor Bootcamps Are Closing. Is the Model...


## Big Cleaning Function

In [10]:
# Start again from the raw data (minus 'Unnamed: 0') so the combined cleaner runs on untouched text.
prepped = original.copy().drop(columns='Unnamed: 0')

In [11]:
def basic_clean(s):
    """Return ``s`` lowercased, ASCII-only and free of special characters.

    Steps, in order:
      1. NFKD-decompose and drop everything that cannot be encoded as ASCII
         (accents are removed, base letters are kept).
      2. Lowercase.
      3. Delete every character other than a-z, 0-9, apostrophe and whitespace.
      4. Strip leading and trailing whitespace.

    Lowercasing runs *after* the Unicode step because compatibility
    decomposition can emit uppercase ASCII (U+2122 becomes "TM"), which the
    character filter would otherwise delete. Stripping runs last because
    removing characters can expose new leading or trailing whitespace.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        The cleaned text.
    """
    s = unicodedata.normalize('NFKD', s)\
        .encode('ascii', 'ignore')\
        .decode('utf-8', 'ignore')
    s = s.lower()
    s = re.sub(r"[^a-z0-9'\s]", '', s)
    return s.strip()

In [12]:
# Run the combined cleaning function over every article body.
prepped['body'] = prepped['body'].apply(basic_clean)

In [13]:
# Inspect the frame after basic_clean was applied to the body text.
prepped

Unnamed: 0,body,title
0,the rumors are true the time has arrived codeu...,Codeup’s Data Science Career Accelerator is He...
1,by dimitri antoniou and maggie giust\ndata sci...,Data Science Myths - Codeup
2,by dimitri antoniou\na week ago codeup launche...,Data Science VS Data Analytics: What’s The Dif...
3,10 tips to crush it at the sa tech job fair\ns...,10 Tips to Crush It at the SA Tech Job Fair - ...
4,competitor bootcamps are closing is the model ...,Competitor Bootcamps Are Closing. Is the Model...


# Tokenize Function

In [14]:
def tokenize(s):
    """Tokenize ``s`` with NLTK's ToktokTokenizer.

    ``return_str=True`` is passed, so the tokenizer hands back a single
    string rather than a list of tokens.
    """
    toktok = nltk.tokenize.ToktokTokenizer()
    tokenized = toktok.tokenize(s, return_str=True)
    return tokenized

In [15]:
# Preview the tokenized bodies; the result is displayed only, not stored.
prepped['body'].apply(tokenize)

0    the rumors are true the time has arrived codeu...
1    by dimitri antoniou and maggie giust\ndata sci...
2    by dimitri antoniou\na week ago codeup launche...
3    10 tips to crush it at the sa tech job fair\ns...
4    competitor bootcamps are closing is the model ...
Name: body, dtype: object

# Stemming and Lemmatization

In [24]:
def stem(s):
    """Return ``s`` with each whitespace-separated word replaced by its Porter stem.

    The stemmer is reached through ``nltk.stem``, the same package path that
    ``lemmatize`` uses, rather than the ``nltk.porter`` alias, which only
    exists because NLTK's package init star-imports ``nltk.stem``.

    Parameters
    ----------
    s : str
        Text to stem; it is split on whitespace.

    Returns
    -------
    str
        The stems joined by single spaces.
    """
    ps = nltk.stem.PorterStemmer()
    return ' '.join(ps.stem(word) for word in s.split())

In [25]:
def lemmatize(s):
    """Return ``s`` with each whitespace-separated word run through WordNetLemmatizer.

    The lemmas are joined back together with single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, s.split()))

# Remove Stopwords

In [38]:
def remove_stopwords(s, extra_words=None, exclude_words=None):
    """Return ``s`` with English stopwords removed.

    Parameters
    ----------
    s : str
        Text to filter; it is passed through ``tokenize`` and then split on
        whitespace.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words to keep even though they appear in NLTK's English stopword list.

    Returns
    -------
    str
        The remaining words joined by single spaces.

    Notes
    -----
    ``extra_words`` is applied after ``exclude_words``, so a word given in
    both ends up being removed.
    """
    # None sentinels replace the mutable ``[]`` defaults; calls that omit the
    # arguments behave exactly as before.
    if extra_words is None:
        extra_words = ()
    if exclude_words is None:
        exclude_words = ()

    stopword_set = (set(stopwords.words('english')) - set(exclude_words)) | set(extra_words)

    words = tokenize(s).split()
    return ' '.join(w for w in words if w not in stopword_set)

# Define a function named prep_articles that takes in a df and returns the prepared df:

In [39]:
# Fresh copy of the raw data, without 'Unnamed: 0', to feed the full preparation function.
prepped = original.copy().drop(columns='Unnamed: 0')

In [40]:
def prep_articles(df):
    """Return a new frame holding four versions of each article's ``body`` text.

    The ``body`` column is replaced by:
      - ``original``   : the untouched body text
      - ``stemmed``    : ``basic_clean`` followed by ``stem``
      - ``lemmatized`` : ``basic_clean`` followed by ``lemmatize``
      - ``clean``      : ``basic_clean`` followed by ``remove_stopwords``

    All other columns are carried over unchanged and keep their position
    ahead of the new columns.

    The input frame is NOT modified, so the calling cell can be re-run
    safely. ``basic_clean`` is applied once and the result is shared by the
    three derived columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named ``body``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``body`` and with the four columns above appended.
    """
    cleaned = df.body.apply(basic_clean)
    prepared = df.assign(
        original=df.body,
        stemmed=cleaned.apply(stem),
        lemmatized=cleaned.apply(lemmatize),
        clean=cleaned.apply(remove_stopwords),
    )
    return prepared.drop(columns=['body'])

In [41]:
# Build the prepared frame and display it.
prep_articles(prepped)

Unnamed: 0,title,original,stemmed,lemmatized,clean
0,Codeup’s Data Science Career Accelerator is He...,\nThe rumors are true! The time has arrived. C...,the rumor are true the time ha arriv codeup ha...,the rumor are true the time ha arrived codeup ...,rumors true time arrived codeup officially ope...
1,Data Science Myths - Codeup,\nBy Dimitri Antoniou and Maggie Giust\nData S...,by dimitri antoni and maggi giust data scienc ...,by dimitri antoniou and maggie giust data scie...,dimitri antoniou maggie giust data science big...
2,Data Science VS Data Analytics: What’s The Dif...,"\nBy Dimitri Antoniou\nA week ago, Codeup laun...",by dimitri antoni a week ago codeup launch our...,by dimitri antoniou a week ago codeup launched...,dimitri antoniou week ago codeup launched imme...
3,10 Tips to Crush It at the SA Tech Job Fair - ...,\n10 Tips to Crush It at the SA Tech Job Fair\...,10 tip to crush it at the sa tech job fair sa ...,10 tip to crush it at the sa tech job fair sa ...,10 tips crush sa tech job fair sa tech job fai...
4,Competitor Bootcamps Are Closing. Is the Model...,\nCompetitor Bootcamps Are Closing. Is the Mod...,competitor bootcamp are close is the model in ...,competitor bootcamps are closing is the model ...,competitor bootcamps closing model danger prog...
