In [1]:
import json
import pandas as pd
from itertools import groupby

In [2]:
# Load the raw news JSON.
# The encoding is given explicitly so the file is decoded the same way on
# every platform and matches the UTF-8 encoding used when the notebook
# writes its output file.
with open('/data/news.json', encoding='utf-8') as f:
    news_json = json.load(f)

In [3]:
# Turn the per-article topic entries into a DataFrame and preview it
topics_records = news_json['topics_data']
news_df = pd.DataFrame(topics_records)
news_df.head()

Unnamed: 0,article_id,publication,party,cluster,umap_embed1,umap_embed2
0,0,Breitbart,right,Terrorism and War,7.684704,3.341915
1,1,Breitbart,right,Politics,12.875513,1.706081
2,2,Breitbart,right,Crime,6.929716,1.806223
3,3,Breitbart,right,Politics,12.819369,1.664258
4,4,Breitbart,right,Politics,13.563458,2.722158


In [4]:
# Count how many articles fall into each cluster (largest first)
cluster_counts = news_df['cluster'].value_counts()
cluster_counts

Politics                                 12238
Crime                                     4002
Terrorism and War                         1969
Global Warming                            1399
Pop Culture and Entertainment             1127
Food                                      1079
Nuclear                                    786
Business and Finance                       732
Music                                      650
Protesting and Activism                    571
Football                                   530
Baseball                                   457
Space                                      371
LGBTQ Discrimination                       350
Technology and Data Privacy                347
Real Estate                                339
Basketball                                 269
Olympics                                   261
Nazism and Syrian Refugee Crisis           252
Brexit                                     249
U.S. Illegal Immigration                   239
Medical Resea

In [5]:
# Undersample over-represented clusters (most notably politics) by capping
# every cluster at MAX_PER_CLUSTER articles.
#
# The frame is shuffled with a fixed seed and the first MAX_PER_CLUSTER rows
# of each cluster are kept. That is a uniform sample without replacement of
# min(len(cluster), MAX_PER_CLUSTER) rows per cluster, and it is reproducible
# across kernel restarts. It also avoids groupby.apply, whose handling of the
# grouping column is deprecated in recent pandas releases, so the 'cluster'
# column is always retained.
MAX_PER_CLUSTER = 2000
SEED = 1

capped_df = (
    news_df
    .sample(frac=1, random_state=SEED)
    .groupby('cluster', sort=False)
    .head(MAX_PER_CLUSTER)
)

# Sample 50% of the capped data (seeded, without replacement)
sample_df = capped_df.sample(frac=0.5, replace=False, random_state=SEED)

# Get value counts by cluster
sample_df['cluster'].value_counts()

Politics                                 1012
Crime                                    1002
Terrorism and War                         965
Global Warming                            669
Pop Culture and Entertainment             583
Food                                      556
Nuclear                                   393
Business and Finance                      362
Music                                     303
Protesting and Activism                   300
Football                                  261
Baseball                                  233
Space                                     186
Technology and Data Privacy               176
LGBTQ Discrimination                      171
Real Estate                               171
Basketball                                143
Nazism and Syrian Refugee Crisis          142
Brexit                                    126
Olympics                                  124
U.S. Illegal Immigration                  115
Widespread Disease                

In [6]:
# Rename the UMAP columns to x / y, renumber the rows from 0, and reuse the
# new row positions as article ids
sample_df = (
    sample_df
    .rename(columns={'umap_embed1': 'x', 'umap_embed2': 'y'})
    .reset_index(drop=True)
    .assign(article_id=lambda frame: frame.index)
)
sample_df.head()

Unnamed: 0,article_id,publication,party,cluster,x,y
0,0,Breitbart,right,Global Warming,7.723316,-0.354369
1,1,Washington Post,left,Politics,13.504069,3.28746
2,2,New York Post,right,Baseball,8.792459,8.44012
3,3,Washington Post,left,Terrorism and War,8.331515,3.743477
4,4,New York Times,left,Terrorism and War,7.777199,3.061737


In [7]:
# Flatten the sampled DataFrame into one dict per article
article_records = sample_df.to_dict(orient='records')


def keyfunc(a):
    """Return the cluster label used to order and group article records."""
    return a['cluster']


# itertools.groupby only merges adjacent items, so order by cluster first
article_records.sort(key=keyfunc)

# One entry per cluster: the cluster label plus the list of its article dicts
topics_json = [
    {'id': cluster, 'data': list(members)}
    for cluster, members in groupby(article_records, keyfunc)
]

In [8]:
# Re-key the TF-IDF entries by topic: {topic: top20_words}
tfidf_json = {
    entry['topic']: entry['top20_words']
    for entry in news_json['tfidf_data']
}

# Take a glimpse at the result
tfidf_json

{'Baseball': [{'word': 'mets', 'tfidf': 0.026877324097796057},
  {'word': 'yankees', 'tfidf': 0.025945868048677555},
  {'word': 'baseball', 'tfidf': 0.01850179271297168},
  {'word': 'game', 'tfidf': 0.01612774451237032},
  {'word': 'season', 'tfidf': 0.01409456787280304},
  {'word': 'cubs', 'tfidf': 0.013302350373005988},
  {'word': 'innings', 'tfidf': 0.011392720954097716},
  {'word': 'games', 'tfidf': 0.011112108499799447},
  {'word': 'league', 'tfidf': 0.010706047821673486},
  {'word': 'players', 'tfidf': 0.009440662924812465},
  {'word': 'inning', 'tfidf': 0.008836579249453597},
  {'word': 'sox', 'tfidf': 0.007990999780282724},
  {'word': 'team', 'tfidf': 0.007895120763281678},
  {'word': 'field', 'tfidf': 0.007625623259327921},
  {'word': 'runs', 'tfidf': 0.007525789799817961},
  {'word': 'girardi', 'tfidf': 0.0074050519992552135},
  {'word': 'pitcher', 'tfidf': 0.007291349675533735},
  {'word': 'series', 'tfidf': 0.007220660055920209},
  {'word': 'cespedes', 'tfidf': 0.0069255823

In [9]:
# Combine the undersampled topic data and the TF-IDF data into one payload
merged_dict = {
    'topics_data': topics_json,
    'tfidf_data': tfidf_json,
}

# Serialize the payload (non-ASCII characters kept as-is, 4-space indent)
# and write it out as UTF-8
serialized = json.dumps(merged_dict, ensure_ascii=False, indent=4)
with open('/data/news2.json', 'w', encoding='utf-8') as out_file:
    out_file.write(serialized)