In [155]:
import pandas as pd
import numpy as np
from requests import get
from bs4 import BeautifulSoup
import os
import requests
import re

### See end of document for exercises

### Our goal is to make predictions using text

In [38]:
original = "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [39]:
original

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

## 1. lowercase everything

In [40]:
original = original.lower()

## 2. remove accented characters and non-ASCII characters

In [41]:
import unicodedata
#wants a methodology for normalization

original = unicodedata.normalize('NFKD', original).encode('ascii', 'ignore').decode('utf-8')
#ASCII part gets rid of non-ASCII characters (changes)
#b" could be a marker denoting an encoded string

## 3. Remove any special characters that were not picked up

In [42]:
import re

In [48]:
original = re.sub('[^a-z0-9\'\s]', '', original)
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

## 4. Tokenize

### In entity recognition you label each token

In [49]:
import nltk

In [50]:
tokenize = nltk.tokenize.ToktokTokenizer()
tokenize

<nltk.tokenize.toktok.ToktokTokenizer at 0x1561d8a00>

In [55]:
#Tokenizing separates
tokenize =tokenize.tokenize(original, return_str=True)
tokenize

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [54]:
#Tokenizing return string false
#If you set it to false, it turns it into a list
tokenize.tokenize(original, return_str=False)

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'were',
 'influential',
 'hungarian',
 'mathematicians',
 'who',
 'contributed',
 'a',
 'lot',
 'to',
 'the',
 'field',
 'erdos',
 "'",
 's',
 'name',
 'contains',
 'the',
 'hungarian',
 'letter',
 "'",
 'o',
 "'",
 "'",
 'o',
 "'",
 'with',
 'double',
 'acute',
 'accent',
 'but',
 'is',
 'often',
 'incorrectly',
 'written',
 'as',
 'erdos',
 'or',
 'erdos',
 'either',
 'by',
 'mistake',
 'or',
 'out',
 'of',
 'typographical',
 'necessity']

## 5. Stemming OR Lemmatizing
### We are trying to create less unniqueness in the language
### Generally accepted to not do both

In [57]:
ps = nltk.porter.PorterStemmer()
ps

<PorterStemmer>

In [59]:
ps.stem('calling'), ps.stem('calls'), ps.stem('called'), ps.stem('call')

('call', 'call', 'call', 'call')

In [60]:
#using a word that cannot be easily stemmed
ps.stem('house')

'hous'

#### Stemming (in older texts) letters were used more profusely, you paid per letter for old printing press

In [63]:
ps.stem(original)

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necess"

In [61]:
ps.stem('contributed')

'contribut'

In [62]:
stems = [ps.stem(word) for word in original.split()]
' '.join(stems)

"paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdos' name contain the hungarian letter 'o' 'o' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

## Lemmatize

In [70]:
#Run the first time
#nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/crislucin/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/crislucin/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/crislucin/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/crislucin/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/crislucin/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to
[nltk_data]    |     /Users/crislucin/

[nltk_data]    | Downloading package paradigms to
[nltk_data]    |     /Users/crislucin/nltk_data...
[nltk_data]    |   Unzipping corpora/paradigms.zip.
[nltk_data]    | Downloading package pe08 to
[nltk_data]    |     /Users/crislucin/nltk_data...
[nltk_data]    |   Unzipping corpora/pe08.zip.
[nltk_data]    | Downloading package perluniprops to
[nltk_data]    |     /Users/crislucin/nltk_data...
[nltk_data]    |   Unzipping misc/perluniprops.zip.
[nltk_data]    | Downloading package pil to
[nltk_data]    |     /Users/crislucin/nltk_data...
[nltk_data]    |   Unzipping corpora/pil.zip.
[nltk_data]    | Downloading package pl196x to
[nltk_data]    |     /Users/crislucin/nltk_data...
[nltk_data]    |   Unzipping corpora/pl196x.zip.
[nltk_data]    | Downloading package porter_test to
[nltk_data]    |     /Users/crislucin/nltk_data...
[nltk_data]    |   Unzipping stemmers/porter_test.zip.
[nltk_data]    | Downloading package ppattach to
[nltk_data]    |     /Users/crislucin/nltk_data...
[n

True

In [64]:
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [65]:
nltk.stem.WordNetLemmatizer()

<WordNetLemmatizer>

In [66]:
wnl = nltk.stem.WordNetLemmatizer()
wnl

<WordNetLemmatizer>

In [72]:
wnl.lemmatize('calling'), wnl.lemmatize('calls'), wnl.lemmatize('called'), wnl.lemmatize('call')

('calling', 'call', 'called', 'call')

In [73]:
wnl.lemmatize('house'), wnl.lemmatize('housing')

('house', 'housing')

In [74]:
ps.stem('mouse'), ps.stem('mice')

('mous', 'mice')

### Lemmatizer understands that mice is the plural of mouse

In [75]:
wnl.lemmatize('mouse'), wnl.lemmatize('mice')

('mouse', 'mouse')

In [76]:
lemmas = [wnl.lemmatize(word) for word in original.split()]
' '.join(lemmas)

"paul erdos and george polya were influential hungarian mathematician who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

## 6. Remove Stopwords

In [77]:
from nltk.corpus import stopwords

In [78]:
# do this once 
# nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/crislucin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [79]:
stopwords_english = stopwords.words('english')
stopwords_english[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [80]:
# 179 in english (stopwords)
len(stopwords_english)

179

In [81]:
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [86]:
#To add to this list
stopwords_english.append('o')

In [87]:
len(stopwords_english)

181

In [88]:
#Return only words in original that arent in the stopwords list
[word for word in original.split() if word not in stopwords_english]

['paul',
 'erdos',
 'george',
 'polya',
 'influential',
 'hungarian',
 'mathematicians',
 'contributed',
 'lot',
 'field',
 "erdos's",
 'name',
 'contains',
 'hungarian',
 'letter',
 "'o'",
 "'o'",
 'double',
 'acute',
 'accent',
 'often',
 'incorrectly',
 'written',
 'erdos',
 'erdos',
 'either',
 'mistake',
 'typographical',
 'necessity']

# Exercises

### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

#### Lowercase everything
#### Normalize unicode characters
#### Replace anything that is not a letter, number, whitespace or a single quote.

In [196]:
def basic_clean(string):
    '''This function applies basic text cleaning to a given
    string input, and returns the string after the methods have
    been applied'''
    #Make string lowercase
    string = string.lower()
    #Normalize unicode characters
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8')
    #Replace everything not a letter, number, whitespace or a single quote
    string = re.sub('[^a-z0-9\'\s]', '', string)
    
    return string

In [106]:
string = "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [107]:
string_cleaned = basic_clean(string)

In [108]:
string_cleaned

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [102]:
#Initiate the object
tokenize = nltk.tokenize.ToktokTokenizer()

In [103]:
#Tokenizing the string
tokenize =tokenize.tokenize(string, return_str=True)
tokenize

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [111]:
def tokenize(string, return_str=True):
    #Initiate the object
    tokenize = nltk.tokenize.ToktokTokenizer()
    #Tokenize the string, default of return string true
    string_tokenized =tokenize.tokenize(string, return_str)
    
    return string_tokenized

In [112]:
string_tokenized = tokenize(string_cleaned, return_str=True)
string_tokenized

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [113]:
#Create the stem object
ps = nltk.porter.PorterStemmer()
ps

<PorterStemmer>

In [119]:
#Use the object to make stems
stems = [ps.stem(word) for word in string_cleaned.split()]
stems

['paul',
 'erdo',
 'and',
 'georg',
 'polya',
 'were',
 'influenti',
 'hungarian',
 'mathematician',
 'who',
 'contribut',
 'a',
 'lot',
 'to',
 'the',
 'field',
 "erdos'",
 'name',
 'contain',
 'the',
 'hungarian',
 'letter',
 "'o'",
 "'o'",
 'with',
 'doubl',
 'acut',
 'accent',
 'but',
 'is',
 'often',
 'incorrectli',
 'written',
 'as',
 'erdo',
 'or',
 'erdo',
 'either',
 'by',
 'mistak',
 'or',
 'out',
 'of',
 'typograph',
 'necess']

In [120]:
#Rejoin stems to make the string "whole"
string_stemmed = ' '.join(stems)

In [121]:
print(string_stemmed)

paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdos' name contain the hungarian letter 'o' 'o' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess


In [122]:
def stem(string):
    '''This function accepts a string and returns a string
    which has been stemmed'''
    #Create the stem object
    ps = nltk.porter.PorterStemmer()
    #Stem the string input
    stems = [ps.stem(word) for word in string.split()]
    #Rejoin the stems to reform string
    string_stemmed = ' '.join(stems)
    #Return stemmed string
    return string_stemmed

In [123]:
string_stemmed = stem(string_cleaned)

In [124]:
string_stemmed

"paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdos' name contain the hungarian letter 'o' 'o' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

### 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [125]:
#Initialize the object
wnl = nltk.stem.WordNetLemmatizer()

In [126]:
wnl

<WordNetLemmatizer>

In [128]:
#Use the object to lemmatize the words
lemmas = [wnl.lemmatize(word) for word in string_cleaned.split()]
lemmas

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'were',
 'influential',
 'hungarian',
 'mathematician',
 'who',
 'contributed',
 'a',
 'lot',
 'to',
 'the',
 'field',
 "erdos's",
 'name',
 'contains',
 'the',
 'hungarian',
 'letter',
 "'o'",
 "'o'",
 'with',
 'double',
 'acute',
 'accent',
 'but',
 'is',
 'often',
 'incorrectly',
 'written',
 'a',
 'erdos',
 'or',
 'erdos',
 'either',
 'by',
 'mistake',
 'or',
 'out',
 'of',
 'typographical',
 'necessity']

In [129]:
#Rejoin the lemmatized words in the string to reform the string
string_lemmatized = ' '.join(lemmas)
string_lemmatized

"paul erdos and george polya were influential hungarian mathematician who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

In [130]:
def lemmatized(string):
    '''This function accepts a string input and applies lemmatization
    to the string, returning the lemmatized string'''
    #Initialize the object
    wnl = nltk.stem.WordNetLemmatizer()
    #Use the object to lemmatize the words
    lemmas = [wnl.lemmatize(word) for word in string_cleaned.split()]
    #Rejoin the lemmatized words in the string to reform the string
    string_lemmatized = ' '.join(lemmas)
    
    return string_lemmatized

In [131]:
string_lemmatized = lemmatized(string_cleaned)
string_lemmatized

"paul erdos and george polya were influential hungarian mathematician who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords

In [132]:
#Create a list of stopwords in specified language
stopwords_english = stopwords.words('english')

In [134]:
#Turn string into a list
words = string.split()

In [None]:
filtered_words = [w for w in words if w not in stopword_list]

In [None]:
string_without_stopwords = ' '.join(filtered_words)

In [142]:
def remove_stopwords(string, language='english'):
    #define the language used, creating a list
    stopword_language = stopwords.words(language)
    #Turn string into a list
    words = string.split()
    #Apply filter to string
    filtered_words = [w for w in words if w not in stopword_language]
    #Place string back together
    string_without_stopwords = ' '.join(filtered_words)
    
    return string_without_stopwords

In [143]:
string_without_stopwords = remove_stopwords(string_cleaned, language='english')

In [144]:
string_without_stopwords

"paul erdos george polya influential hungarian mathematicians contributed lot field erdos's name contains hungarian letter 'o' 'o' double acute accent often incorrectly written erdos erdos either mistake typographical necessity"

### 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df

In [146]:
url = 'https://inshorts.com/en/read'
response = get(url)
soup = BeautifulSoup(response.content, 'html.parser')

In [147]:
# catorgories = ['business', 'sports', 'technology', 'entertainment']
categories = [li.text.lower() for li in soup.select('li')][1:]
categories[0] = 'national'

inshorts = []

for category in categories:
    
    url = 'https://inshorts.com/en/read' + '/' + category
    response = get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    titles = [span.text for span in soup.find_all('span', itemprop='headline')]
    contents = [div.text for div in soup.find_all('div', itemprop='articleBody')]
    
    for i in range(len(titles)):
        
        article = {
            'title': titles[i],
            'content': contents[i],
            'category': category,
        }
        
        inshorts.append(article)

In [149]:
news_df = pd.DataFrame(inshorts)
news_df

Unnamed: 0,title,content,category
0,"Moscow-Goa flight gets bomb threat, makes emer...",A Goa-bound flight from Russia's Moscow made a...,national
1,"Joshimath divided into 3 zones, govt says most...","Uttarakhand's Joshimath, where a majority of b...",national
2,Which states have reported COVID-19 variant XB...,"One new case of COVID-19 variant XBB.1.5, whic...",national
3,I decided to wear t-shirt till I shiver after ...,Congress leader Rahul Gandhi on Monday told re...,national
4,Vistara's Bhubaneswar-bound flight returns to ...,Vistara's flight UK 781 operating from Delhi t...,national
...,...,...,...
292,Electric two-wheeler sales cross 6-lakh mark i...,Electric two-wheeler (E2W) sales in 2022 hit a...,automobile
293,Tesla shares fall further after firm misses 20...,"Tesla's shares, which dipped roughly 65% last ...",automobile
294,"Tesla cuts Model 3, Model Y prices in China fo...",Tesla on Friday reduced prices for its Model 3...,automobile
295,Tesla owners in China demand refund after sudd...,Around 200 recent buyers of Tesla's Model Y an...,automobile


### 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [158]:
codeup_url = 'https://codeup.com/blog/'

In [162]:
headers = {'User-Agent': 'Codeup Data Science'}
response = requests.get(url, headers=headers)

In [165]:
response

<Response [200]>

In [166]:
def get_blog_urls(codeup_url, header = {'User-Agent': 'Codeup Data Science'}):
    soup = BeautifulSoup(requests.get(codeup_url, headers=header).content, 'html.parser')
    return [link['href'] for link in soup.select('a.more-link')]

In [168]:
get_blog_urls(codeup_url, header = {'User-Agent': 'Codeup Data Science'})

['https://codeup.com/data-science/become-a-data-scientist/',
 'https://codeup.com/employers/hiring-tech-talent/',
 'https://codeup.com/cloud-administration/cap-funding-options/',
 'https://codeup.com/dallas-info/it-professionals-dallas/',
 'https://codeup.com/codeup-news/codeup-voted-1-technical-school-in-dfw/',
 'https://codeup.com/tips-for-prospective-students/financing/codeups-scholarships/']

In [203]:
def get_blog_articles(url):
    urls = get_blog_urls(url, header = {'User-Agent': 'Codeup Data Science'})
    headers = {'User-Agent': 'Codeup Data Science'}
    output = []
    for blog in urls:
        article = BeautifulSoup(requests.get(blog, headers=headers).content)
        blog_output = {'title': article.select_one('h1.entry-title').text, 'content': article.select_one('div.entry-content').text.strip()}
        output.append(blog_output)
        
    return pd.DataFrame(output)

In [247]:
codeup_df = get_blog_articles(codeup_url)

In [248]:
codeup_df

Unnamed: 0,title,content
0,Become a Data Scientist in 6 Months!,Are you feeling unfulfilled in your work but w...
1,Hiring Tech Talent Around the Holidays,Are you a hiring manager having trouble fillin...
2,Cloud Administration Program New Funding Options,Finding resources to fund your educational goa...
3,Why Dallas is a Great Location for IT Professi...,"When breaking into a new career, it is importa..."
4,Codeup is ranked #1 Best in DFW 2022,We are excited to announce that Codeup ranked ...
5,Codeup’s Scholarship Offerings,In honor of November being National Scholarshi...


### 8. For each dataframe, produce the following columns:

* title to hold the title
* original to hold the original article/post content
* clean to hold the normalized and tokenized original with the stopwords removed.
* stemmed to hold the stemmed version of the cleaned data.
* lemmatized to hold the lemmatized version of the cleaned data.

#### Codeup DF

In [178]:
codeup_df = codeup_df.rename(columns={'content' : 'original'})

In [179]:
codeup_df.original[0]

'Are you feeling unfulfilled in your work but want to avoid returning to the traditional educational route? Codeup can help! Starting over as a professional is daunting and not always ideal. Codeup can help you go from a career you are bored with, to a job that excites you in just 6 months!\nHere’s how…\nData Science Program\nDuring our 20-week program, you will have the opportunity to take your career to new heights with data science being one of the most needed jobs in tech.\nYou’ll gather data, then clean it, explore it for trends, and apply machine learning models to make predictions.\nUpon completing this program, you will know how to turn insights into actionable recommendations. You’ll be a huge asset to any company, having all the technical skills to become a data scientist with projects upon projects of experience under your belt.\nCodeup\nA common reason individuals opt not to change their careers is fear it is too late. Codeup has crafted a program that will guide you throug

In [235]:
codeup_df = clean_blog_content(codeup_df)

In [236]:
codeup_df

Unnamed: 0,title,original,clean
0,Become a Data Scientist in 6 Months!,Are you feeling unfulfilled in your work but w...,Are you feeling unfulfilled in your work but w...
1,Hiring Tech Talent Around the Holidays,Are you a hiring manager having trouble fillin...,Are you a hiring manager having trouble fillin...
2,Cloud Administration Program New Funding Options,Finding resources to fund your educational goa...,Finding resources to fund your educational goa...
3,Why Dallas is a Great Location for IT Professi...,"When breaking into a new career, it is importa...","When breaking into a new career, it is importa..."
4,Codeup is ranked #1 Best in DFW 2022,We are excited to announce that Codeup ranked ...,We are excited to announce that Codeup ranked ...
5,Codeup’s Scholarship Offerings,In honor of November being National Scholarshi...,In honor of November being National Scholarshi...


In [175]:
codeup_df.columns

Index(['title', 'content'], dtype='object')

In [195]:
article

'in honor of november being national scholarship month wed like to highlight codeups current internal scholarship offerings\nthese scholarships are open to anyone who fits the criteria codeup values a diverse workspace and we encourage everyone regardless of background to pursue the career they desire weve crafted the following scholarships to assist in funding a career change with codeup\n\nwomen in tech scholarship\nthis scholarship is available to all women we define women as anyone who identifies themselves as a woman including transgender genderqueer and nonbinary women\nannie easley scholarship\nthis scholarship is available to students who identify as black or africanamerican\nminorities in tech scholarship\nthis scholarship is open to anyone who identifies as part of a minority group\npride scholarship\nthis scholarship is open to anyone who identifies as part of the lgbtqia community\nveteran scholarship\nthis scholarship is open to military veterans and military dependents\nf

In [None]:
df['clean'] = df.original.copy()

In [220]:
test_df = codeup_df.rename(columns={'content' : 'original'})
#codeu_df['clean'] = codeup_df['original'].copy()

In [249]:
test_df = codeup_df.copy()

In [254]:
codeup_df

Unnamed: 0,title,content
0,Become a Data Scientist in 6 Months!,Are you feeling unfulfilled in your work but w...
1,Hiring Tech Talent Around the Holidays,Are you a hiring manager having trouble fillin...
2,Cloud Administration Program New Funding Options,Finding resources to fund your educational goa...
3,Why Dallas is a Great Location for IT Professi...,"When breaking into a new career, it is importa..."
4,Codeup is ranked #1 Best in DFW 2022,We are excited to announce that Codeup ranked ...
5,Codeup’s Scholarship Offerings,In honor of November being National Scholarshi...


In [269]:
codeup_df = get_blog_articles(codeup_url)

In [273]:
codeup_df

Unnamed: 0,title,content
0,Become a Data Scientist in 6 Months!,Are you feeling unfulfilled in your work but w...
1,Hiring Tech Talent Around the Holidays,Are you a hiring manager having trouble fillin...
2,Cloud Administration Program New Funding Options,Finding resources to fund your educational goa...
3,Why Dallas is a Great Location for IT Professi...,"When breaking into a new career, it is importa..."
4,Codeup is ranked #1 Best in DFW 2022,We are excited to announce that Codeup ranked ...
5,Codeup’s Scholarship Offerings,In honor of November being National Scholarshi...


In [274]:
#Rename column from content to original
codeup_df = codeup_df.rename(columns={'content' : 'original'})

In [277]:
article_list = codeup_df.original.to_list()

In [279]:
print(article_list)

['Are you feeling unfulfilled in your work but want to avoid returning to the traditional educational route? Codeup can help! Starting over as a professional is daunting and not always ideal. Codeup can help you go from a career you are bored with, to a job that excites you in just 6 months!\nHere’s how…\nData Science Program\nDuring our 20-week program, you will have the opportunity to take your career to new heights with data science being one of the most needed jobs in tech.\nYou’ll gather data, then clean it, explore it for trends, and apply machine learning models to make predictions.\nUpon completing this program, you will know how to turn insights into actionable recommendations. You’ll be a huge asset to any company, having all the technical skills to become a data scientist with projects upon projects of experience under your belt.\nCodeup\nA common reason individuals opt not to change their careers is fear it is too late. Codeup has crafted a program that will guide you throu

In [283]:
codeup_df['clean'] = [basic_clean(i) for i in codeup_df.original]

In [284]:
codeup_df

Unnamed: 0,title,original,clean
0,Become a Data Scientist in 6 Months!,Are you feeling unfulfilled in your work but w...,are you feeling unfulfilled in your work but w...
1,Hiring Tech Talent Around the Holidays,Are you a hiring manager having trouble fillin...,are you a hiring manager having trouble fillin...
2,Cloud Administration Program New Funding Options,Finding resources to fund your educational goa...,finding resources to fund your educational goa...
3,Why Dallas is a Great Location for IT Professi...,"When breaking into a new career, it is importa...",when breaking into a new career it is importan...
4,Codeup is ranked #1 Best in DFW 2022,We are excited to announce that Codeup ranked ...,we are excited to announce that codeup ranked ...
5,Codeup’s Scholarship Offerings,In honor of November being National Scholarshi...,in honor of november being national scholarshi...


In [None]:
codeup_df['clean'] = tokenize(i) for i in codeup_df.clean

In [272]:
def clean_blog_content(df):
    #Rename original column to content
    df = df.rename(columns={'content' : 'original'})
    #Make a new column for clean data
    df['clean'] = df['original'].copy()
    #Use basic clean function to clean the text
    for string in df.clean:
        df['clean'] = basic_clean(string)
        df['clean'] = tokenize(string, return_str=True)
    return df

In [267]:
codeup_df = clean_blog_content(codeup_df)

In [268]:
codeup_df

Unnamed: 0,title,original,clean
0,Become a Data Scientist in 6 Months!,Are you feeling unfulfilled in your work but w...,Are you feeling unfulfilled in your work but w...
1,Hiring Tech Talent Around the Holidays,Are you a hiring manager having trouble fillin...,Are you a hiring manager having trouble fillin...
2,Cloud Administration Program New Funding Options,Finding resources to fund your educational goa...,Finding resources to fund your educational goa...
3,Why Dallas is a Great Location for IT Professi...,"When breaking into a new career, it is importa...","When breaking into a new career, it is importa..."
4,Codeup is ranked #1 Best in DFW 2022,We are excited to announce that Codeup ranked ...,We are excited to announce that Codeup ranked ...
5,Codeup’s Scholarship Offerings,In honor of November being National Scholarshi...,In honor of November being National Scholarshi...
