In [1]:
import os
import sys

import pandas as pd
import re
import requests

from requests import get
from bs4 import BeautifulSoup

import unicodedata
import json

import nltk
import spacy
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import acquire

In [2]:
original = acquire.get_codeup_article('https://codeup.com/codeups-data-science-career-accelerator-is-here/')['content']

In [3]:
def basic_clean(original):
    word = original.lower()
    word = unicodedata.normalize('NFKD', word)\
                                .encode('ascii', 'ignore')\
                                .decode('utf-8', 'ignore')
    word = re.sub(r"[^a-z0-9'\s]", '', word)
    word = word.replace('\n',' ')
    word = word.replace('\t',' ')
    return word

In [4]:
basic_clean(original)

' the rumors are true the time has arrived codeup has officially opened applications to our new data science career accelerator with only 25 seats available this immersive program is one of a kind in san antonio and will help you land a job in glassdoors 1 best job in america data science is a method of providing actionable intelligence from data the data revolution has hit san antonio resulting in an explosion in data scientist positions across companies like usaa accenture booz allen hamilton and heb weve even seen utsa invest 70 m for a cybersecurity center and school of data science we built a program to specifically meet the growing demands of this industry our program will be 18 weeks long fulltime handson and projectbased our curriculum development and instruction is led by senior data scientist maggie giust who has worked at heb capital group and rackspace along with input from dozens of practitioners and hiring partners students will work with real data sets realistic problems

In [5]:
def tokenize(original):
    tokenizer = nltk.tokenize.ToktokTokenizer()
    return tokenizer.tokenize(basic_clean(original))

In [6]:
tokenize(basic_clean(original))

['the',
 'rumors',
 'are',
 'true',
 'the',
 'time',
 'has',
 'arrived',
 'codeup',
 'has',
 'officially',
 'opened',
 'applications',
 'to',
 'our',
 'new',
 'data',
 'science',
 'career',
 'accelerator',
 'with',
 'only',
 '25',
 'seats',
 'available',
 'this',
 'immersive',
 'program',
 'is',
 'one',
 'of',
 'a',
 'kind',
 'in',
 'san',
 'antonio',
 'and',
 'will',
 'help',
 'you',
 'land',
 'a',
 'job',
 'in',
 'glassdoors',
 '1',
 'best',
 'job',
 'in',
 'america',
 'data',
 'science',
 'is',
 'a',
 'method',
 'of',
 'providing',
 'actionable',
 'intelligence',
 'from',
 'data',
 'the',
 'data',
 'revolution',
 'has',
 'hit',
 'san',
 'antonio',
 'resulting',
 'in',
 'an',
 'explosion',
 'in',
 'data',
 'scientist',
 'positions',
 'across',
 'companies',
 'like',
 'usaa',
 'accenture',
 'booz',
 'allen',
 'hamilton',
 'and',
 'heb',
 'weve',
 'even',
 'seen',
 'utsa',
 'invest',
 '70',
 'm',
 'for',
 'a',
 'cybersecurity',
 'center',
 'and',
 'school',
 'of',
 'data',
 'science',


In [7]:
def stem(original):
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in original.split()]
    original_stemmed = ' '.join(stems)
    return original_stemmed

In [8]:
stem(basic_clean(original))

'the rumor are true the time ha arriv codeup ha offici open applic to our new data scienc career acceler with onli 25 seat avail thi immers program is one of a kind in san antonio and will help you land a job in glassdoor 1 best job in america data scienc is a method of provid action intellig from data the data revolut ha hit san antonio result in an explos in data scientist posit across compani like usaa accentur booz allen hamilton and heb weve even seen utsa invest 70 m for a cybersecur center and school of data scienc we built a program to specif meet the grow demand of thi industri our program will be 18 week long fulltim handson and projectbas our curriculum develop and instruct is led by senior data scientist maggi giust who ha work at heb capit group and rackspac along with input from dozen of practition and hire partner student will work with real data set realist problem and the entir data scienc pipelin from collect to deploy they will receiv profession develop train in resu

In [9]:
def lemmatize(original):
    nlp = spacy.load('en', parse=True, tag=True, entity=True)
    doc = nlp(original) # process the text with spacy
    lemmas = [word.lemma_ for word in doc]
    original_lemmatized = ' '.join(lemmas)
    return original_lemmatized

In [10]:
lemmatize(basic_clean(original))

'  the rumor be true the time have arrive codeup have officially open application to -PRON- new datum science career accelerator with only 25 seat available this immersive program be one of a kind in san antonio and will help -PRON- land a job in glassdoor 1 good job in america datum science be a method of provide actionable intelligence from datum the datum revolution have hit san antonio result in an explosion in data scientist position across company like usaa accenture booz allen hamilton and heb -PRON- have even see utsa invest 70 m for a cybersecurity center and school of datum science -PRON- build a program to specifically meet the grow demand of this industry -PRON- program will be 18 week long fulltime handson and projectbase -PRON- curriculum development and instruction be lead by senior datum scientist maggie giust who have work at heb capital group and rackspace along with input from dozen of practitioner and hire partner student will work with real data set realistic probl

In [11]:
def remove_stopwords(original, extra_words=[], exclude_words=[]):
    tokenizer = ToktokTokenizer()

    stopword_list = stopwords.words('english')

    for word in extra_words:
        stopword_list.append(word)
    for word in exclude_words:
        stopword_list.remove(word)

    words = original.split()
    filtered_words = [w for w in words if w not in stopword_list]

    print('Removed {} stopwords'.format(len(words) - len(filtered_words)))
    print('---')

    original_nostop = ' '.join(filtered_words)

    return original_nostop

In [12]:
remove_stopwords(basic_clean(original))

Removed 122 stopwords
---


'rumors true time arrived codeup officially opened applications new data science career accelerator 25 seats available immersive program one kind san antonio help land job glassdoors 1 best job america data science method providing actionable intelligence data data revolution hit san antonio resulting explosion data scientist positions across companies like usaa accenture booz allen hamilton heb weve even seen utsa invest 70 cybersecurity center school data science built program specifically meet growing demands industry program 18 weeks long fulltime handson projectbased curriculum development instruction led senior data scientist maggie giust worked heb capital group rackspace along input dozens practitioners hiring partners students work real data sets realistic problems entire data science pipeline collection deployment receive professional development training resume writing interviewing continuing education prepare smooth transition workforce focus applied data science immediate 

In [13]:
original = acquire.get_article_text()
original

'\nThe rumors are true! The time has arrived. Codeup has officially opened applications to our new Data Science career accelerator, with only 25 seats available! This immersive program is one of a kind in San Antonio, and will help you land a job in\xa0Glassdoor’s #1 Best Job in America.\nData Science is a method of providing actionable intelligence from data.\xa0The data revolution has hit San Antonio,\xa0resulting in an explosion in Data Scientist positions\xa0across companies like USAA, Accenture, Booz Allen Hamilton, and HEB. We’ve even seen\xa0UTSA invest $70 M for a Cybersecurity Center and School of Data Science.\xa0We built a program to specifically meet the growing demands of this industry.\nOur program will be 18 weeks long, full-time, hands-on, and project-based. Our curriculum development and instruction is led by Senior Data Scientist, Maggie Giust, who has worked at HEB, Capital Group, and Rackspace, along with input from dozens of practitioners and hiring partners. Stude

In [14]:
def prep_article(article):
    
    article_stemmed = stem(basic_clean(article['content']))
    article_lemmatized = lemmatize(article_stemmed)
    article_without_stopwords = remove_stopwords(article_lemmatized)
    
    article['stemmed'] = article_stemmed
    article['lemmatized'] = article_lemmatized
    article['clean'] = article_without_stopwords
    
    return article

In [15]:
article = {'title': 'Film body urges Godrej to make Raj Kapoor museum at RK Studio site | Entertainment News | Inshorts', 'category': 'entertainment', 'content': '\nFilm body urges Godrej to make Raj Kapoor museum at RK Studio site\n'}

In [16]:
# {
#     'title': 'the original title'.
#     'original': original,
#     'stemmed': article_stemmed,
#     'lemmatized': article_lemmatized,
#     'clean': article_without_stopwords
# }

prep_article(article = {'title': 'Film body urges Godrej to make Raj Kapoor museum at RK Studio site | Entertainment News | Inshorts', 'category': 'entertainment', 'content': '\nFilm body urges Godrej to make Raj Kapoor museum at RK Studio site\n'})

Removed 2 stopwords
---


{'title': 'Film body urges Godrej to make Raj Kapoor museum at RK Studio site | Entertainment News | Inshorts',
 'category': 'entertainment',
 'content': '\nFilm body urges Godrej to make Raj Kapoor museum at RK Studio site\n',
 'stemmed': 'film bodi urg godrej to make raj kapoor museum at rk studio site',
 'lemmatized': 'film bodi urg godrej to make raj kapoor museum at rk studio site',
 'clean': 'film bodi urg godrej make raj kapoor museum rk studio site'}

In [17]:
def prepare_article_data(corpus):
    transformed  = []
    for article in corpus:
        transformed.append(prep_article(article))
    return transformed

In [18]:
corpus = acquire.get_news_texts('business',
                        'sports',
                        'technology',
                        'entertainment')

/en/news/volkswagen-unit-porsche-fined-₹4100-crore-over-diesel-scandal-1557251200422
/en/news/guj-trader-who-gifted-cars-to-staff-to-remove-illegal-road-he-built-1557214245553
/en/news/spacexs-billionaire-moon-tourist-says-has-no-money-selling-art-1557239146967
/en/news/britannia-broke-rules-by-not-reporting-wadias-arrest-ingovern-1557234445948
/en/news/resigned-pledged-shares-provided-₹250-cr-to-banks-naresh-goyal-1557250273105
/en/news/ai-sent-₹2cr-to-nigeria-instead-of-us-firm-airline-says-probe-on-1557243035697
/en/news/us-warns-india-against-tariffs-over-scrapping-of-trade-benefits-1557245457108
/en/news/infosys-makes-hyd-staff-pay-for-parking-activists-call-it-illegal-1557230142855
/en/news/we-cant-ensure-cheaper-oil-sales-to-india-after-iran-sanctions-us-1557222796840
/en/news/indigo-talking-to-airbus-to-buy-yettobereleased-a321-xlr-jets-1557237922203
/en/news/unsure-on-galaxy-fold-shipping-will-cancel-us-preorders-samsung-1557244991830
/en/news/apple-features-warren-buffett-in-

In [19]:
corpus

[{'title': 'GoT makers admit Starbucks cup in episode was a mistake | Entertainment News | Inshorts',
  'category': 'entertainment',
  'content': '\nGoT makers admit Starbucks cup in episode was a mistake\n'},
 {'title': 'I do not know much about Balakot airstrikes: Sunny Deol | Politics News | Inshorts',
  'category': 'entertainment',
  'content': '\nI do not know much about Balakot airstrikes: Sunny Deol\n'},
 {'title': "False rape charge can ruin a man's life: Pooja defends Karan Oberoi | Metoo News | Inshorts",
  'category': 'entertainment',
  'content': "\nFalse rape charge can ruin a man's life: Pooja defends Karan Oberoi\n"},
 {'title': "It'll be dignified: Mental Hai Kya makers on clash with Super 30 | Entertainment News | Inshorts",
  'category': 'entertainment',
  'content': "\nIt'll be dignified: Mental Hai Kya makers on clash with Super 30\n"},
 {'title': 'Hollywood actor wears half-tuxedo, half-dress at Met Gala 2019 | Entertainment News | Inshorts',
  'category': 'enterta

In [None]:
prepare_article_data(corpus)