# Data Preparation

In [14]:
import pandas as pd
import numpy as np
import nltk
import unicodedata
import re
from nltk.corpus import stopwords
from requests import get
from bs4 import BeautifulSoup
import os

## 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

Lowercase everything
Normalize unicode characters
Replace anything that is not a letter, number, whitespace or a single quote.

In [6]:
def basic_clean(text):
    '''This function takes in a string and makes it lowercase, normalizes unicode characters, and
        replaces anything that is not a letter, number, whitespace or a single quote.'''
    
    #lowercase
    text = text.lower()
    
    #normalize unicode characters
    text = unicodedata.normalize('NKFD', text).encode('ascii', 'ignore').decode('utf-8')
    
    #replace anything not a letter, number, whitespace, or single quote
    text = re.sub(r'[^a-z0-9\s]','', text)
    
    return text

## 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [7]:
def tokenize(text):
    '''This function takes in a string and returns it tokenized.'''
    
    #create the tokenizer
    tokenize = nltk.tokenize.ToktokTokenizer()
    
    #tokenize the text
    text = tokenize.tokenize(text)
    
    return text

## 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [8]:
def stem(text):
    '''This function takes in a string and returns it after applying the Porter Stemmer.'''
    
    #create the stemmer
    ps = nltk.porter.PorterStemmer()
    
    #apply the stemmer to all words in the string
    text = [ps.stem(word) for word in text]
    
    return text

## 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [9]:
def lemmatize(text):
    '''This function takes in a string and returns it after applying WordNet Lemmatizer.'''
    
    #create the lemmatizer
    wnl = ntlk.stem.WordNetLemmatizer()
    
    #apply the lemmatizer
    text = [wnl.lemmatize(word) for word in text]
    
    return text

## 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [11]:
def remove_stopwords(text, extra_words=None, exclude_words=None):
    '''This function takes in a string and returns the text after removing all stopwords using nltk stopwords list.
    There are two optional arguments: extra_words add any additional stop words to the list,
                                      exclude_words remove words from the stop list so they will not be removed. '''
    
    stopwords = stopwords.words('english')
    
    if extra_words != None:
        
        stopwords = stopwords.append(extra_words)
        
    if exclude_words != None:
        
        stopwords = stopwords.remove(exclude_words)
        
    text = [word for word in text if word not in stopwords]
    
    return text
    

## 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [15]:
def get_blog_articles():
    '''This function gets five individual urls and
    returns a list of dictionaries containing the title and content of each.'''
    
    #create an empty list to append to
    blog_dict_list = []
    
    #define the urls
    url_one = 'https://codeup.edu/featured/apida-heritage-month/'
    url_two = 'https://codeup.edu/featured/women-in-tech-rachel-robbins-mayhill/'
    url_three = 'https://codeup.edu/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/'
    url_four =  'https://codeup.edu/events/women-in-tech-madeleine/'
    url_five = 'https://codeup.edu/codeup-news/panelist-spotlight-4/'
    
    #define the headers
    headers = {'User-Agent': 'Codeup Data Science'}
    
    #define the responses
    response_one = get(url_one, headers = headers)
    response_two = get(url_two, headers = headers)
    response_three = get(url_three, headers = headers)
    response_four = get(url_four, headers = headers)
    response_five = get(url_five, headers = headers)
    
    #define the soups
    soup_one = BeautifulSoup(response_one.content, 'html.parser')
    soup_two = BeautifulSoup(response_two.content, 'html.parser')
    soup_three = BeautifulSoup(response_three.content, 'html.parser')
    soup_four = BeautifulSoup(response_four.content, 'html.parser')
    soup_five = BeautifulSoup(response_five.content, 'html.parser')
    
    #define the articles
    article_one = soup_one.find('div', class_='entry-content')
    article_two = soup_two.find('div', class_='entry-content')
    article_three = soup_three.find('div', class_='entry-content')
    article_four = soup_four.find('div', class_='entry-content')
    article_five = soup_five.find('div', class_='entry-content')
    
    #make the dicts
    blog_one = {'title': soup_one.title.string,
            'content': article_one.text,}
    blog_two = {'title': soup_two.title.string,
            'content': article_two.text,}
    blog_three = {'title': soup_three.title.string,
            'content': article_three.text,}
    blog_four = {'title': soup_four.title.string,
            'content': article_four.text,}
    blog_five = {'title': soup_five.title.string,
            'content': article_five.text,}
    
    #append to list
    blog_dict_list.append(blog_one)
    blog_dict_list.append(blog_two)
    blog_dict_list.append(blog_three)
    blog_dict_list.append(blog_four)
    blog_dict_list.append(blog_five)
    
    return blog_dict_list

In [16]:
codeup_df = pd.DataFrame(get_blog_articles())

In [18]:
codeup_df.columns = codeup_df.rename('content', 'original')

TypeError: DataFrame.rename() takes from 1 to 2 positional arguments but 3 were given