In [1]:
import json
import pandas as pd
from itertools import groupby

### Load JSON

In [2]:
# Load the raw news JSON. The encoding is given explicitly so the read does
# not depend on the platform's default text encoding and is consistent with
# the UTF-8 encoding this notebook uses when it writes its JSON outputs.
with open('/data/news.json', encoding='utf-8') as f:
    news_json = json.load(f)

In [3]:
# Build a DataFrame from the 'topics_data' entry of the loaded JSON,
# then preview the first rows.
topic_records = news_json['topics_data']
news_df = pd.DataFrame.from_dict(topic_records, orient='columns')
news_df.head()

Unnamed: 0,article_id,publication,party,cluster,umap_embed1,umap_embed2
0,0,Breitbart,right,Terrorism and War,7.684704,3.341915
1,1,Breitbart,right,Politics,12.875513,1.706081
2,2,Breitbart,right,Crime,6.929716,1.806223
3,3,Breitbart,right,Politics,12.819369,1.664258
4,4,Breitbart,right,Politics,13.563458,2.722158


In [4]:
# Count the number of articles in each cluster
cluster_counts = news_df['cluster'].value_counts()
cluster_counts

Politics                                 12238
Crime                                     4002
Terrorism and War                         1969
Global Warming                            1399
Pop Culture and Entertainment             1127
Food                                      1079
Nuclear                                    786
Business and Finance                       732
Music                                      650
Protesting and Activism                    571
Football                                   530
Baseball                                   457
Space                                      371
LGBTQ Discrimination                       350
Technology and Data Privacy                347
Real Estate                                339
Basketball                                 269
Olympics                                   261
Nazism and Syrian Refugee Crisis           252
Brexit                                     249
U.S. Illegal Immigration                   239
Medical Resea

### Undersample and Stratify JSON

In [5]:
# Sampling parameters, named so they are easy to find and tune
MAX_PER_CLUSTER = 2000  # upper bound on articles kept from any one cluster
SAMPLE_FRAC = 0.5       # fraction of the capped rows kept afterwards
SEED = 1                # seed shared by both random draws

# Undersample the largest clusters: keep at most MAX_PER_CLUSTER articles per
# cluster (smaller clusters are kept whole). The draw is seeded; without a
# seed each run picked different rows, so the final sample was not
# reproducible even though the second draw below was already seeded.
capped_df = news_df.groupby('cluster', group_keys=False).apply(
    lambda group: group.sample(min(len(group), MAX_PER_CLUSTER), random_state=SEED)
)

# Sample 50% of the capped data. This is a single draw over the whole frame,
# so each cluster's share is only approximately preserved.
sample_df = capped_df.sample(frac=SAMPLE_FRAC, replace=False, random_state=SEED)

# Get value counts by cluster
sample_df['cluster'].value_counts()

Politics                                 1012
Crime                                    1002
Terrorism and War                         965
Global Warming                            669
Pop Culture and Entertainment             583
Food                                      556
Nuclear                                   393
Business and Finance                      362
Music                                     303
Protesting and Activism                   300
Football                                  261
Baseball                                  233
Space                                     186
Technology and Data Privacy               176
LGBTQ Discrimination                      171
Real Estate                               171
Basketball                                143
Nazism and Syrian Refugee Crisis          142
Brexit                                    126
Olympics                                  124
U.S. Illegal Immigration                  115
Widespread Disease                

### Reformat Sampled JSON

In [6]:
# Rename the UMAP columns to x / y, renumber the rows from 0 and make
# article_id mirror that new row number.
sample_df = (
    sample_df
    .rename(columns={'umap_embed1': 'x', 'umap_embed2': 'y'})
    .reset_index(drop=True)
    .assign(article_id=lambda frame: frame.index)
)
sample_df.head()

Unnamed: 0,article_id,publication,party,cluster,x,y
0,0,New York Times,left,Global Warming,7.031024,-0.877911
1,1,Breitbart,right,Politics,11.346996,3.659848
2,2,New York Post,right,Baseball,8.793735,8.431417
3,3,CNN,left,Terrorism and War,7.762341,3.169827
4,4,New York Post,right,Terrorism and War,9.355626,3.813552


### Reshape Topics JSON

In [7]:
# Flatten the sampled DataFrame into a list of per-article dicts
article_records = sample_df.to_dict('records')


def cluster_of(article):
    """Return the cluster label of one article record."""
    return article['cluster']


# groupby only merges adjacent items, so order the records by cluster first
article_records.sort(key=cluster_of)

# Build one {'id': <cluster>, 'data': [<articles>]} entry per cluster
topics_json = []
for cluster_name, members in groupby(article_records, cluster_of):
    topics_json.append({'id': cluster_name, 'data': list(members)})
topics_json

[{'id': 'Baseball',
  'data': [{'article_id': 2,
    'publication': 'New York Post',
    'party': 'right',
    'cluster': 'Baseball',
    'x': 8.7937345505,
    'y': 8.4314174652},
   {'article_id': 14,
    'publication': 'New York Post',
    'party': 'right',
    'cluster': 'Baseball',
    'x': 8.7959604263,
    'y': 8.4640350342},
   {'article_id': 72,
    'publication': 'New York Post',
    'party': 'right',
    'cluster': 'Baseball',
    'x': 8.7370643616,
    'y': 8.5095205307},
   {'article_id': 88,
    'publication': 'New York Post',
    'party': 'right',
    'cluster': 'Baseball',
    'x': 8.7503681183,
    'y': 8.4776792526},
   {'article_id': 90,
    'publication': 'New York Post',
    'party': 'right',
    'cluster': 'Baseball',
    'x': 8.7906074524,
    'y': 8.4367742538},
   {'article_id': 201,
    'publication': 'New York Post',
    'party': 'right',
    'cluster': 'Baseball',
    'x': 8.7583618164,
    'y': 8.4339818954},
   {'article_id': 219,
    'publication': 'New Y

### Reshape Publication JSON

In [8]:
# Keep only the columns needed per publication and flatten to a list of dicts
pub_columns = ['article_id', 'publication', 'x', 'y']
pub_records = sample_df[pub_columns].to_dict('records')


def publication_of(article):
    """Return the publication name of one article record."""
    return article['publication']


# groupby only merges adjacent items, so order the records by publication first
pub_records.sort(key=publication_of)

# Build one {'id': <publication>, 'data': [<articles>]} entry per publication
pub_json = []
for publication_name, members in groupby(pub_records, publication_of):
    pub_json.append({'id': publication_name, 'data': list(members)})
pub_json

[{'id': 'Breitbart',
  'data': [{'article_id': 1,
    'publication': 'Breitbart',
    'x': 11.3469963074,
    'y': 3.6598477364},
   {'article_id': 7,
    'publication': 'Breitbart',
    'x': 7.4818329811,
    'y': 3.1549882889},
   {'article_id': 15,
    'publication': 'Breitbart',
    'x': 12.6223669052,
    'y': 2.787409544},
   {'article_id': 16,
    'publication': 'Breitbart',
    'x': 10.1589374542,
    'y': 1.1489657164},
   {'article_id': 22,
    'publication': 'Breitbart',
    'x': 9.9096593857,
    'y': 2.7672934532},
   {'article_id': 24,
    'publication': 'Breitbart',
    'x': 8.5065689087,
    'y': 2.9607679844},
   {'article_id': 29,
    'publication': 'Breitbart',
    'x': 9.2552776337,
    'y': 6.2500424385},
   {'article_id': 32,
    'publication': 'Breitbart',
    'x': 13.0029525757,
    'y': 0.379919529},
   {'article_id': 37,
    'publication': 'Breitbart',
    'x': 10.9313592911,
    'y': 2.1313040257},
   {'article_id': 43,
    'publication': 'Breitbart',
    'x'

### Reshape Party JSON

In [9]:
# Keep only the columns needed per party and flatten to a list of dicts
party_columns = ['article_id', 'party', 'x', 'y']
party_records = sample_df[party_columns].to_dict('records')


def party_of(article):
    """Return the party label of one article record."""
    return article['party']


# groupby only merges adjacent items, so order the records by party first
party_records.sort(key=party_of)

# Build one {'id': <party>, 'data': [<articles>]} entry per party
party_json = []
for party_name, members in groupby(party_records, party_of):
    party_json.append({'id': party_name, 'data': list(members)})
party_json

[{'id': 'left',
  'data': [{'article_id': 0,
    'party': 'left',
    'x': 7.0310235023,
    'y': -0.8779107928},
   {'article_id': 3, 'party': 'left', 'x': 7.7623405457, 'y': 3.1698265076},
   {'article_id': 5, 'party': 'left', 'x': 7.6893901825, 'y': 1.9707227945},
   {'article_id': 6, 'party': 'left', 'x': 7.4476103783, 'y': 2.8139321804},
   {'article_id': 8, 'party': 'left', 'x': 8.4558458328, 'y': -0.8291632533},
   {'article_id': 9, 'party': 'left', 'x': 12.0682144165, 'y': 1.0822399855},
   {'article_id': 11, 'party': 'left', 'x': 9.8811445236, 'y': 1.0774816275},
   {'article_id': 12, 'party': 'left', 'x': 11.1292352676, 'y': 0.670894444},
   {'article_id': 13, 'party': 'left', 'x': 8.3871383667, 'y': 4.0751957893},
   {'article_id': 17, 'party': 'left', 'x': 5.9681010246, 'y': -1.2124515772},
   {'article_id': 18, 'party': 'left', 'x': 8.770406723, 'y': -0.9668926001},
   {'article_id': 19, 'party': 'left', 'x': 8.0043430328, 'y': 3.6288704872},
   {'article_id': 20, 'party':

### Reshape TF-IDF JSON

In [10]:
# Re-key the TF-IDF entries by topic: each topic maps to its 'top20_words' value
tfidf_json = {
    entry['topic']: entry['top20_words']
    for entry in news_json['tfidf_data']
}

# Take a glimpse at JSON!
tfidf_json

{'Baseball': [{'word': 'mets', 'tfidf': 0.026877324097796057},
  {'word': 'yankees', 'tfidf': 0.025945868048677555},
  {'word': 'baseball', 'tfidf': 0.01850179271297168},
  {'word': 'game', 'tfidf': 0.01612774451237032},
  {'word': 'season', 'tfidf': 0.01409456787280304},
  {'word': 'cubs', 'tfidf': 0.013302350373005988},
  {'word': 'innings', 'tfidf': 0.011392720954097716},
  {'word': 'games', 'tfidf': 0.011112108499799447},
  {'word': 'league', 'tfidf': 0.010706047821673486},
  {'word': 'players', 'tfidf': 0.009440662924812465},
  {'word': 'inning', 'tfidf': 0.008836579249453597},
  {'word': 'sox', 'tfidf': 0.007990999780282724},
  {'word': 'team', 'tfidf': 0.007895120763281678},
  {'word': 'field', 'tfidf': 0.007625623259327921},
  {'word': 'runs', 'tfidf': 0.007525789799817961},
  {'word': 'girardi', 'tfidf': 0.0074050519992552135},
  {'word': 'pitcher', 'tfidf': 0.007291349675533735},
  {'word': 'series', 'tfidf': 0.007220660055920209},
  {'word': 'cespedes', 'tfidf': 0.0069255823

### Save Web-Formatted JSON

In [11]:
# Output path -> structure to serialise, written in this order
json_outputs = [
    ('/data/topics.json', topics_json),  # articles grouped by cluster
    ('/data/pub.json', pub_json),        # articles grouped by publication
    ('/data/party.json', party_json),    # articles grouped by party
    ('/data/tfidf.json', tfidf_json),    # TF-IDF words keyed by topic
]

# Write every structure as indented UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII characters as-is instead of escaping them.
for out_path, payload in json_outputs:
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)