<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Acquire-News-Articles" data-toc-modified-id="Acquire-News-Articles-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Acquire News Articles</a></span></li><li><span><a href="#Prepare-News-Articles" data-toc-modified-id="Prepare-News-Articles-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Prepare News Articles</a></span></li><li><span><a href="#Explore-Lemmatized-Text" data-toc-modified-id="Explore-Lemmatized-Text-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Explore Lemmatized Text</a></span><ul class="toc-item"><li><span><a href="#Create-a-Series-for--Corpus-of-Words-by-Topic-Label" data-toc-modified-id="Create-a-Series-for--Corpus-of-Words-by-Topic-Label-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Create a Series for  Corpus of Words by Topic Label</a></span></li><li><span><a href="#Create-a-Series-of-Word-Frequencies-for-Each-Topic-Label" data-toc-modified-id="Create-a-Series-of-Word-Frequencies-for-Each-Topic-Label-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Create a Series of Word Frequencies for Each Topic Label</a></span></li><li><span><a href="#Create-df-of-Word-Frequencies-for-Each-Subset-Above" data-toc-modified-id="Create-df-of-Word-Frequencies-for-Each-Subset-Above-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Create df of Word Frequencies for Each Subset Above</a></span></li></ul></li></ul></div>

In [13]:
import pandas as pd
import numpy as np

import os
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from acquire_walkthrough import get_news_articles
from prepare_walkthrough import prep_article_data

## Acquire News Articles

In [14]:
df = get_news_articles()

## Prepare News Articles

In [15]:
df = prep_article_data(df)

In [16]:
df.head()

Unnamed: 0,topic,title,author,content,clean_stemmed,clean_lemmatized
0,business,Firm whose stock surged 1000% in 2020 starts h...,Krishna Veera Vanamali,US biotech company Novavax said it has started...,us biotech compani novavax said ha start phase...,u biotech company novavax said ha started phas...
1,business,India's economic growth seen at 1.2% in Q4 FY2...,Dharna,India's economy is estimated to have grown at ...,india economi estim grown 12 quarter end march...,india economy estimated grown 12 quarter ended...
2,business,TVS Motor cuts employees' salaries by up to 20...,Dharna,TVS Motor Company has said it is cutting the s...,tv motor compani ha said cut salari employe si...,tv motor company ha said cutting salary employ...
3,business,"Lockdown extensions won't help, cases will con...",Anushka Dixit,Mahindra Group Chairman Anand Mahindra said th...,mahindra group chairman anand mahindra said lo...,mahindra group chairman anand mahindra said lo...
4,business,Uber India fires 600 employees reducing 25% of...,Dharna,"Uber is firing 600 employees in India, or 25% ...",uber fire 600 employe india 25 workforc countr...,uber firing 600 employee india 25 workforce co...


In [17]:
# Select label and text and rename accordingly
df_lem = df[['topic', 'clean_lemmatized']]

df_lem = df_lem.rename(columns={'topic': 'label', 'clean_lemmatized': 'text'})

# df_lem uses lemmatized text
df_lem.head(1)

Unnamed: 0,label,text
0,business,u biotech company novavax said ha started phas...


## Explore Lemmatized Text

In [18]:
# Our scraped data is very balanced

labels = pd.concat([df_lem.label.value_counts(),
                    df_lem.label.value_counts(normalize=True)], axis=1)
labels.columns = ['n', 'percent']
labels

Unnamed: 0,n,percent
business,25,0.252525
entertainment,25,0.252525
sports,25,0.252525
technology,24,0.242424


### Create a Series for  Corpus of Words by Topic Label

In [49]:
business_words = ' '.join(df_lem[df_lem.label == 'business'].text).split(' ')
business_words[:10]

['u',
 'biotech',
 'company',
 'novavax',
 'said',
 'ha',
 'started',
 'phase',
 '1',
 'clinical']

In [50]:
entertainment_words = ' '.join(df_lem[df_lem.label == 'entertainment'].text).split(' ')
entertainment_words[:10]

['south',
 'indian',
 'actor',
 'prithviraj',
 'sukumaran',
 'today',
 'shared',
 'picture',
 'physical',
 'transformation']

In [51]:
sports_words = ' '.join(df_lem[df_lem.label == 'sports'].text).split(' ')
sports_words[:10]

['exindia',
 'captain',
 'rahul',
 'dravid',
 'ha',
 'said',
 'idea',
 'creating',
 'biosecure',
 'environment']

In [52]:
technology_words = ' '.join(df_lem[df_lem.label == 'technology'].text).split(' ')
technology_words[:10]

['former',
 'nasa',
 'apple',
 'engineer',
 'mark',
 'rober',
 'built',
 'elaborate',
 'obstacle',
 'course']

In [53]:
all_words = ' '.join(df_lem.text).split(' ')
all_words[:10]

['u',
 'biotech',
 'company',
 'novavax',
 'said',
 'ha',
 'started',
 'phase',
 '1',
 'clinical']

### Create a Series of Word Frequencies for Each Topic Label

In [55]:
business_freq = pd.Series(business_words).value_counts()
business_freq

said          26
ha            19
india         16
company       15
million        9
              ..
member         1
completely     1
peru           1
tv             1
sbis           1
Length: 651, dtype: int64

In [56]:
entertainment_freq = pd.Series(entertainment_words).value_counts()
sports_freq = pd.Series(sports_words).value_counts()
technology_freq = pd.Series(technology_words).value_counts()
all_freq = pd.Series(all_words).value_counts()

### Create df of Word Frequencies for Each Subset Above

In [57]:
word_counts = (pd.concat([business_freq, entertainment_freq, sports_freq, technology_freq, all_freq], axis=1, sort=True)
              .set_axis(['all', 'business', 'entertainment', 'sports', 'technology'], axis=1, inplace=False)
              .fillna(0)
              .apply(lambda s: s.astype(int)))

word_counts.head()

Unnamed: 0,all,business,entertainment,sports,technology
1,3,0,0,0,3
10,3,0,1,1,5
100,1,0,0,1,2
1000,1,0,0,1,2
10000,0,0,1,0,1


In [58]:
word_counts.sort_values(by='all', ascending=False).head(10)

Unnamed: 0,all,business,entertainment,sports,technology
said,26,23,15,13,77
ha,19,16,20,14,69
india,16,1,4,4,25
company,15,1,0,8,24
u,9,3,0,4,16
share,9,1,1,9,20
million,9,0,0,10,19
ceo,7,0,2,5,14
may,6,0,1,5,12
sold,6,0,0,4,10


###