In [3]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

from acquire import get_blog_articles, get_all_urls

In [4]:
def basic_clean(original):
    analyzed_corpus = original.lower()
    

In [5]:
orginal = acquire.get_blog_articles(urls=get_all_urls(), cache=True)

In [27]:
orginal.content.sum()



In [None]:
original.type

In [25]:
def pre_prep(unclean_str):
    clean_str = unclean_str.lower()
    clean_str = re.sub(r'[\s]+', ' ', clean_str)
    clean_str = clean_str.strip()
    return clean_str

def remove_non_ascii(unclean_str):  
    clean_str =     unicodedata.normalize('NFKD', unclean_str)\
                        .encode('ascii', 'ignore')\
                        .decode('utf-8', 'ignore')
    return clean_str

def remove_special_characters(unclean_str):
    clean_str = re.sub(r"[^a-z'\s]", '', unclean_str)
    return clean_str

def tokenize(unclean_str):
    tokenizer = ToktokTokenizer()
    clean_str = tokenizer.tokenize(unclean_str, return_str=True)
    return clean_str

def lemmatize(unclean_str):
    wn1 = nltk.stem.WordNetLemmatizer()
    clean_str = ' '.join([wn1.lemmatize(word) for word in unclean_str.split()])
    return clean_str

def stem(unclean_str):
    ps = nltk.porter.PorterStemmer()
    clean_str = ' '.join([ps.stem(word) for word in unclean_str.split()])
    return clean_str

def remove_stopwords(unclean_str, extra_words = [], exclude_words = []):
    sw_list = stopwords.words('english')
    for add_word in extra_words:
        sw_list.append(add_word)
    for rm_word in exclude_words:
        sw_list.remove(rm_word)
    unclean_str = tokenize(unclean_str).split()
    clean_str = ' '.join([word for word in unclean_str if word not in sw_list])
    return clean_str

def basic_clean(df, stem_or_lem = 'lemmatize'):
    for col in df:
        df[col] = df[col].apply(pre_prep)
        df[col] = df[col].apply(remove_non_ascii) 
        df[col] = df[col].apply(remove_special_characters) 
        df[col] = df[col].apply(tokenize)
        if stem_or_lem == 'lemmatize':
            df[col] = df[col].apply(lemmatize)
        elif stem_or_lem == 'stem':
            df[col] = df[col].apply(stem)
        df[col] = df[col].apply(remove_stopwords)
    return df

In [26]:
stem_or_lem = 'lemmatize'
df = basic_clean(orginal)

LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - '/Users/chasethompson/nltk_data'
    - '/usr/local/anaconda3/nltk_data'
    - '/usr/local/anaconda3/share/nltk_data'
    - '/usr/local/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
from requests import get
from bs4 import BeautifulSoup
import numpy as np
import os

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

import re
import pandas as pd
from tabulate import tabulate

# ~~~~~ Prepare The Data ~~~~~ #

#--------------------------#
#   Prepare NLP Data Data  #
#--------------------------#

def basic_clean(df, col):
    '''
    This function takes in a df and a string for a column and
    returns the df with a new column named 'basic_clean' with the
    passed column text normalized.
    '''
    df['basic_clean'] = df[col].str.lower()\
                    .replace(r'[^\w\s]', '', regex=True)\
                    .str.normalize('NFKC')\
                    .str.encode('ascii', 'ignore')\
                    .str.decode('utf-8', 'ignore')
    return df

def tokenize(df, col):
    '''
    This function takes in a df and a string for a column and
    returns a df with a new column named 'clean_tokes' with the
    passed column text tokenized and in a list.
    '''
    tokenizer = nltk.tokenize.ToktokTokenizer()
    df['clean_tokes'] = df[col].apply(tokenizer.tokenize)
    return df

def stem(df, col):
    '''
    This function takes in a df and a string for a column name and
    returns a df with a new column named 'stemmed'.
    '''
    # Create porter stemmer
    ps = nltk.porter.PorterStemmer()
    
    # Stem each token from our clean_tokes Series of lists
    stems = df[col].apply(lambda row: [ps.stem(word) for word in row])
    
    # Join our cleaned, stemmed lists of words back into sentences
    df['stemmed'] = stems.str.join(' ')
    
    return df

def lemmatize(df, col):
    '''
    This function takes in a df and a string for column name and
    returns the original df with a new column called 'lemmatized'.
    '''
    # Create the lemmatizer
    wnl = nltk.stem.WordNetLemmatizer()
    
    # Lemmatize each token from our clean_tokes Series of lists
    lemmas = df[col].apply(lambda row: [wnl.lemmatize(word) for word in row])
    
    # Join the cleaned and lemmatized tokens back into sentences
    df['lemmatized'] = lemmas.str.join(' ')
    return df

def remove_stopwords(df, col):
    '''
    This function takes in a df and a string for column name and 
    returns the df with a new column named 'clean' with stopwords removed.
    '''
    # Create stopword_list
    stopword_list = stopwords.words('english')
    
    # Split words in column
    words = df[col].str.split()
    
    # Check each word in each row of the column against stopword_list and return only those that are not in list
    filtered_words = words.apply(lambda row: [word for word in row if word not in stopword_list])
    
    # Create new column of words that have stopwords removed
    df['clean_' + col] = filtered_words.str.join(' ')
    
    return df

def prep_article_data(df):
    '''
    This function takes in the news articles df and
    returns the df with original columns plus cleaned
    and lemmatized content without stopwords.
    '''
    # Do basic clean on article content
    df = basic_clean(df, 'content')
    
    # Tokenize clean article content
    df = tokenize(df, 'basic_clean')
    
    # Stem cleaned and tokenized article content
    df = stem(df, 'clean_tokes')
    
    # Remove stopwords from Lemmatized article content
    df = remove_stopwords(df, 'stemmed')
    
    # Lemmatize cleaned and tokenized article content
    df = lemmatize(df, 'clean_tokes')
    
    # Remove stopwords from Lemmatized article content
    df = remove_stopwords(df, 'lemmatized')
    
    return df[['topic', 'title', 'author', 'content', 'clean_stemmed', 'clean_lemmatized']]