In [1]:
# Best example of this approach can be found here: https://www.kaggle.com/bbose71/bbc-news-classification

import os

import pandas as pd

# Folder that holds the exported CSV. It defaults to the location the notebook was
# originally run from; set the PENNLIVE_DATA_DIR environment variable to point the
# notebook at a different folder without editing this cell.
DATA_DIR = os.environ.get(
    'PENNLIVE_DATA_DIR',
    '/Users/dsinicro/Desktop/Reader Revenue/Data Viz Pennlive Phase 1',
)
csv_file = os.path.join(DATA_DIR, 'pennlive_dominant_topic_fromGV.csv')

# Load the articles and preview the first rows
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,website_url,first_publish_date,primary_section,content,clavis_topics,dominant_topic,dominant_topic_score,pageviews
0,/opinion/2019/01/robert-frost-wrote-this-maste...,2019-01-02 13:24:53.462000+00:00,Opinion,<b>By Steve Hendrix </b> Whose words these are...,[],Opinion,0.04,624.0
1,/opinion/2019/01/only-1-out-of-36-newly-electe...,2019-01-02 15:11:34.395000+00:00,Opinion,,[],Opinion,0.04,2.0
2,/opinion/2019/01/can-washington-end-its-shutdo...,2019-01-02 15:51:22.092000+00:00,Opinion,<b>By E. Fletcher McClellan</b> Divided govern...,"['Federal Courts and SCOTUS', 'Congressional E...",Opinion,0.04,390.0
3,/opinion/2019/01/what-does-the-new-year-hold-f...,2019-01-03 13:00:27.090000+00:00,Opinion,"<b>By John Dame</b> Typically, this is the sea...",['Strategy and Management'],Opinion,0.04,242.0
4,/opinion/2019/01/this-should-be-the-no-1-prior...,2019-01-03 13:39:16.663000+00:00,Opinion,<b>By Hugh Hewitt</b> Congressional Republican...,['Congress'],Opinion,0.04,535.0


In [2]:
# Keep only the columns needed here: the article text and its topic label
needed_columns = ['content', 'dominant_topic']
df2 = df[needed_columns]
df2.head()

Unnamed: 0,content,dominant_topic
0,<b>By Steve Hendrix </b> Whose words these are...,Opinion
1,,Opinion
2,<b>By E. Fletcher McClellan</b> Divided govern...,Opinion
3,"<b>By John Dame</b> Typically, this is the sea...",Opinion
4,<b>By Hugh Hewitt</b> Congressional Republican...,Opinion


In [3]:
# For now, drop every row that has a missing value in any column
df2 = df2.dropna(axis=0, how='any')

In [4]:
# Add an integer code for each topic so the labels are machine-readable.
# factorize() numbers the topics in order of first appearance (Opinion -> 0, Crime -> 1, ...).
# assign() returns a new frame rather than writing a column into df2 in place, which
# avoids pandas' SettingWithCopyWarning (df2 was derived from a selection of df).
df2 = df2.assign(category_id=df2['dominant_topic'].factorize()[0])
df2.head()

Unnamed: 0,content,dominant_topic,category_id
0,<b>By Steve Hendrix </b> Whose words these are...,Opinion,0
2,<b>By E. Fletcher McClellan</b> Divided govern...,Opinion,0
3,"<b>By John Dame</b> Typically, this is the sea...",Opinion,0
4,<b>By Hugh Hewitt</b> Congressional Republican...,Opinion,0
6,<b>By Bob Quarteroni</b> I’ve long been obsess...,Opinion,0


In [5]:
# Summary statistics; category_id is the only numeric column, so it is the only one summarised
df2.describe()

Unnamed: 0,category_id
count,21113.0
mean,10.099986
std,8.225374
min,0.0
25%,2.0
50%,8.0
75%,18.0
max,27.0


In [6]:
# Article count for every topic, most frequent first
# NOTE(review): the label 'Carrers' in the output looks like a misspelling of 'Careers' in the source data; confirm before relabelling
df2.dominant_topic.value_counts()

Crime                     2632
Penn State Football       1519
Sports                    1366
Criminal Justice          1341
Opinion                   1301
HS Sports                 1210
Politics                  1000
Traffic                    984
Weather                    966
NFL - Eagles               829
L&C - Entertainment        801
L&C - Events               767
NFL - Ravens               722
Education/Schools          623
NFL - Steelers             593
The Tylt                   583
L&C - Food & Dining        566
Business                   528
Healthcare                 481
L&C - Nature/Seasonal      468
Retail                     350
Advice                     313
Environment                301
Obituaries                 243
Restaurant Inspections     191
Lottery                    171
HS Prom                    155
Carrers                    109
Name: dominant_topic, dtype: int64

In [7]:
# Build "category_id_df": one row per distinct (topic, category_id) pair,
# ordered by category_id
category_id_df = (
    df2[['dominant_topic', 'category_id']]
    .drop_duplicates()
    .sort_values(by='category_id')
)

In [8]:
# Lookup tables derived from category_id_df:
#   category_to_id: topic name  -> integer id
#   id_to_category: integer id  -> topic name
category_to_id = dict(zip(category_id_df['dominant_topic'], category_id_df['category_id']))
id_to_category = dict(zip(category_id_df['category_id'], category_id_df['dominant_topic']))

In [9]:
# Display the topic name / category_id lookup frame
category_id_df

Unnamed: 0,dominant_topic,category_id
0,Opinion,0
9,Crime,1
40,Penn State Football,2
65,NFL - Ravens,3
70,HS Sports,4
83,Sports,5
91,The Tylt,6
121,Weather,7
126,NFL - Eagles,8
262,NFL - Steelers,9


In [10]:
# category_to_id and id_to_category are already built from category_id_df in an
# earlier cell, so they are not rebuilt here. Instead, check that the two lookup
# tables are exact inverses of each other and cover every topic present in df2.
assert len(category_to_id) == len(id_to_category) == df2['dominant_topic'].nunique()
assert all(id_to_category[topic_id] == topic for topic, topic_id in category_to_id.items())

In [11]:
# Number of news articles in each topic (groups are listed in topic-name order)
df2.groupby('dominant_topic')['category_id'].count()

dominant_topic
Advice                     313
Business                   528
Carrers                    109
Crime                     2632
Criminal Justice          1341
Education/Schools          623
Environment                301
HS Prom                    155
HS Sports                 1210
Healthcare                 481
L&C - Entertainment        801
L&C - Events               767
L&C - Food & Dining        566
L&C - Nature/Seasonal      468
Lottery                    171
NFL - Eagles               829
NFL - Ravens               722
NFL - Steelers             593
Obituaries                 243
Opinion                   1301
Penn State Football       1519
Politics                  1000
Restaurant Inspections     191
Retail                     350
Sports                    1366
The Tylt                   583
Traffic                    984
Weather                    966
Name: category_id, dtype: int64

In [12]:
import matplotlib

# Plot the distribution of news articles by category.
# The figure gets a title and axis labels so it can be read on its own; the trailing
# semicolon keeps the cell from echoing the return value of ax.set().
ax = df2.groupby('dominant_topic').category_id.count().plot.bar(ylim=0)
ax.set(title='Articles per dominant topic', xlabel='Dominant topic', ylabel='Number of articles');

<matplotlib.axes._subplots.AxesSubplot at 0x1175b3950>

In [13]:


from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams: sub-linear term frequency, L2-normalised rows,
# English stop words removed, and terms seen in fewer than 5 documents dropped.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

# fit_transform() yields a sparse matrix, which is then expanded to a dense array.
# NOTE(review): the dense array is 21113 x 185625 (see the features.shape output), which is
# very large in memory. chi2 and most scikit-learn estimators accept the sparse matrix
# directly; consider dropping .toarray() if nothing downstream requires a dense array.
tfidf_matrix = tfidf.fit_transform(df2['content'])
features = tfidf_matrix.toarray()

# Integer topic code for each article, aligned row-for-row with `features`
labels = df2['category_id']

In [14]:
# (number of articles, number of TF-IDF features)
features.shape

(21113, 185625)

In [15]:
# Recall the dictionary built earlier that maps each category name to its integer id
category_to_id.items()

dict_items([('Opinion', 0), ('Crime', 1), ('Penn State Football', 2), ('NFL - Ravens', 3), ('HS Sports', 4), ('Sports', 5), ('The Tylt', 6), ('Weather', 7), ('NFL - Eagles', 8), ('NFL - Steelers', 9), ('L&C - Food & Dining', 10), ('Environment', 11), ('Education/Schools', 12), ('L&C - Events', 13), ('Business', 14), ('L&C - Nature/Seasonal', 15), ('HS Prom', 16), ('Obituaries', 17), ('L&C - Entertainment', 18), ('Carrers', 19), ('Politics', 20), ('Criminal Justice', 21), ('Advice', 22), ('Traffic', 23), ('Healthcare', 24), ('Retail', 25), ('Lottery', 26), ('Restaurant Inspections', 27)])

In [16]:
# sorted() turns the dictionary items into a list of (name, id) pairs ordered by
# category name (plain string comparison, so upper-case letters sort before lower-case).
# Subsequent steps iterate over the categories in this order.
sorted(category_to_id.items())

[('Advice', 22),
 ('Business', 14),
 ('Carrers', 19),
 ('Crime', 1),
 ('Criminal Justice', 21),
 ('Education/Schools', 12),
 ('Environment', 11),
 ('HS Prom', 16),
 ('HS Sports', 4),
 ('Healthcare', 24),
 ('L&C - Entertainment', 18),
 ('L&C - Events', 13),
 ('L&C - Food & Dining', 10),
 ('L&C - Nature/Seasonal', 15),
 ('Lottery', 26),
 ('NFL - Eagles', 8),
 ('NFL - Ravens', 3),
 ('NFL - Steelers', 9),
 ('Obituaries', 17),
 ('Opinion', 0),
 ('Penn State Football', 2),
 ('Politics', 20),
 ('Restaurant Inspections', 27),
 ('Retail', 25),
 ('Sports', 5),
 ('The Tylt', 6),
 ('Traffic', 23),
 ('Weather', 7)]

In [17]:
# Use chi-square statistics to find which terms are most associated with each category:
# every TF-IDF feature is scored against a one-vs-rest label for the category, and the
# N highest-scoring unigrams and bigrams are printed.
from sklearn.feature_selection import chi2

import numpy as np

N = 30  # number of top unigrams / bigrams to print for each category

# The vocabulary is the same for every category, so build the name array once.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available
# and fall back to the old method on older installations.
if hasattr(tfidf, 'get_feature_names_out'):
  all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
  all_feature_names = np.asarray(tfidf.get_feature_names())

# Number of space-separated tokens in each feature (1 = unigram, 2 = bigram); also loop-invariant
token_counts = np.array([len(name.split(' ')) for name in all_feature_names])

# For each category, find the terms that are most highly correlated with it
for cat, category_id in sorted(category_to_id.items()):
  features_chi2 = chi2(features, labels == category_id)     # (chi2 statistics, p-values) for "this category vs. the rest"
  indices = np.argsort(features_chi2[0])                    # feature indices in increasing order of chi2 statistic
  feature_names = all_feature_names[indices]                # feature names in that same increasing order
  ranked_token_counts = token_counts[indices]
  unigrams = feature_names[ranked_token_counts == 1]        # single-word features, weakest to strongest
  bigrams = feature_names[ranked_token_counts == 2]         # two-word features, weakest to strongest
  print("# '{}':".format(cat))
  print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:]))) # the N unigrams with the highest chi2 statistic
  print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:]))) # the N bigrams with the highest chi2 statistic

# 'Advice':
  . Most correlated unigrams:
       . feel
       . sister
       . relationship
       . dad
       . feelings
       . divorced
       . friends
       . friend
       . rude
       . los
       . angeles
       . box
       . marriage
       . married
       . written
       . mother
       . founded
       . husband
       . van
       . abigail
       . ca
       . jeanne
       . pauline
       . buren
       . 90069
       . 69440
       . phillips
       . dearabby
       . abby
       . dear
  . Most correlated bigrams:
       . los angeles
       . abby recently
       . ohio dear
       . south dear
       . texas dear
       . florida dear
       . abby husband
       . _blank www
       . angeles ca
       . mother pauline
       . com box
       . abby href
       . phillips founded
       . phillips contact
       . pauline phillips
       . abby written
       . contact dear
       . founded mother
       . jeanne phillips
       . van buren
       . abigai

# 'HS Prom':
  . Most correlated unigrams:
       . angelonia
       . hosta
       . catmint
       . school
       . impatiens
       . geraniums
       . hatcheries
       . stocking
       . bloom
       . coreopsis
       . agastache
       . perennial
       . botanical
       . letort
       . brunnera
       . sedum
       . phlox
       . blooms
       . coneflower
       . euphorbia
       . snip
       . radisson
       . proms
       . held
       . weigel
       . salvia
       . photographers
       . eden
       . commencement
       . prom
  . Most correlated bigrams:
       . radisson hotel
       . county technical
       . held 17
       . george weigel
       . hershey grantville
       . held april
       . high school
       . photos html
       . held 11
       . forum harrisburg
       . photographers captured
       . view community
       . trout fishing
       . school 2019
       . school commencement
       . hotel camp
       . pennlive photographers
     

# 'Lottery':
  . Most correlated unigrams:
       . saturdays
       . rico
       . puerto
       . reset
       . pools
       . millions
       . payout
       . prize
       . 3x
       . 302
       . 292
       . 2x
       . 575
       . payment
       . numbers
       . megaplier
       . wednesdays
       . megamillions
       . tuesdays
       . islands
       . powerplay
       . sum
       . virgin
       . drawings
       . lump
       . mega
       . drawing
       . lottery
       . powerball
       . jackpot
  . Most correlated bigrams:
       . ticket gives
       . deadline purchase
       . saturdays deadline
       . gives 292
       . rico ticket
       . powerball held
       . champions drawings
       . joining hall
       . million chance
       . hall powerball
       . eastern wednesdays
       . 59 eastern
       . jackpot winner
       . millions jackpot
       . jackpot html
       . columbia virgin
       . drawings held
       . 292 million
       . millio

# 'Politics':
  . Most correlated unigrams:
       . pelosi
       . mueller
       . immigration
       . wolf
       . elections
       . campaign
       . political
       . politics
       . voters
       . congressional
       . committee
       . border
       . house
       . biden
       . democrat
       . congress
       . donald
       . lawmakers
       . rep
       . gop
       . president
       . republicans
       . sen
       . presidential
       . election
       . democrats
       . senate
       . republican
       . democratic
       . trump
  . Most correlated bigrams:
       . trump administration
       . mike pence
       . justice department
       . www legis
       . robert mueller
       . president mike
       . counsel robert
       . house senate
       . government shutdown
       . speaker nancy
       . judiciary committee
       . nancy pelosi
       . joe biden
       . state house
       . majority leader
       . tom wolf
       . democratic pres

In [19]:
### DS: Above was re-run on 2020-02-05 to find the lists of top 30 for each category