In [15]:
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
import numpy as np
import pandas as pd
import json
import sys

In [16]:
sys.path.insert(1, '../cached_embeds/')

## Load Data

In [17]:
PATH_TO_TOPIC_EBD = 'huffpost_oracle_embed_idf.json'
PATH_TO_WIKI_EBD = 'ebd_cache.json'

In [18]:
# classes = [
#     'mideast', 'space', 'sale', 'politics', 'graphics',
#     'cryptography', 'windows', 'microsoft', 'guns',
#     'religion', 'autos', 'medicine', 'mac', 'electronics',
#     'hockey', 'atheism', 'motorcycles', 'pc', 'baseball', 'christian'
# ]

classes = [
    'politics', 'wellness', 'entertainment', 'travel', 'beauty', 'parenting', 'healthy', 'queer',
    'food', 'business', 'comedy', 'sports', 'black', 'home', 'parents', 'the worldpost',
    'weddings', 'women', 'impact', 'divorce', 'crime', 'media', 'weird', 'green', 'worldpost', 'religion',
    'style', 'science', 'worldnews', 'taste', 'tech', 'money', 'arts', 'fifty', 'goodnews', 'arts & culture',
    'environment', 'college', 'latino', 'culture & arts', 'education'
]

In [19]:
# Get topic or oracle embeddings
with open('../cached_embeds/{}'.format(PATH_TO_TOPIC_EBD)) as json_file:
    topics = json.load(json_file)

In [20]:
# Get embedding cache
with open('../{}'.format(PATH_TO_WIKI_EBD)) as json_file:
    # {(str): list of float}
    mappings = json.load(json_file)

In [37]:
# Cache topic embeddings if necessary
dest = '../cached_embeds/huffpost_topic_embed_.json'
cache = dict(zip(range(len(classes)), [mappings[topic] for topic in classes]))
with open(dest, 'w') as fp:
    json.dump(cache, fp)

In [21]:
# Reformat into two lists containing word and corresponding embedding
ordered_words = list(mappings.keys())
ordered_embs = [mappings[word] for word in ordered_words]

In [22]:
# Normalize
ordered_embs = normalize(ordered_embs, norm='l2', axis=1, copy=True, return_norm=False)

In [23]:
ordered_top_embs = [topics[str(int(key))] for key in range(len(classes))]
ordered_top_embs = normalize(ordered_top_embs, norm='l2', axis=1, copy=True, return_norm=False)

## Nearest Neighbors

In [26]:
def synonyms(classes, global_embs, class_embs):
    """
    Returns closest synonyms for classes.
    
    Args:
        classes: list of str of query words
        
    Returns:
        dict: {query word (str): [synonyms (strs)]}
    """
    X = np.array(global_embs)
    nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(X)
    _, indices = nbrs.kneighbors(class_embs)
    
    synonyms = {}
    for i, ind in enumerate(indices):
        synonyms[classes[i]] = [ordered_words[j] for j in ind]
        
    return synonyms

In [27]:
syn_dict = synonyms(classes, ordered_embs, ordered_top_embs)
pd.DataFrame(syn_dict)

KeyboardInterrupt: 

In [29]:
syn_dict_1 = {}
for key in syn_dict:
    syn_dict_1[classes[key]] = syn_dict[key]
    
syn_dict_1

{'politics': ['⁃', 'that', '提舉司', 'bloffwitch', '民盟'],
 'wellness': ['that', 'even', 'really', 'actually', 'but'],
 'entertainment': ['bones+longing',
  '༺།།ༀ་ཨཱཿ་ཧཱུྃ།།འཚེར།།།།སར་ཝ་མང་ག་ལམ།།༻',
  'girlfriend/fiancee',
  'bloffwitch',
  'errtime'],
 'travel': ['⁃', '∈ℝ', 'but', '◩', 'that'],
 'beauty': ['༺།།ༀ་ཨཱཿ་ཧཱུྃ།།འཚེར།།།།སར་ཝ་མང་ག་ལམ།།༻',
  'simplyfying',
  '⁃',
  'simplying',
  'walk…'],
 'parenting': ['really',
  'actually',
  'that',
  'even',
  '༺།།ༀ་ཨཱཿ་ཧཱུྃ།།འཚེར།།།།སར་ཝ་མང་ག་ལམ།།༻'],
 'healthy': ['that', 'but', 'even', 'people—especially', 'actually'],
 'queer': ['⁃',
  '平和を願い真の国益を考え靖国神社参拝を支持する若手国会議員の会',
  '༺།།ༀ་ཨཱཿ་ཧཱུྃ།།འཚེར།།།།སར་ཝ་མང་ག་ལམ།།༻',
  'that',
  '∈ℝ'],
 'food': ['morenoodles',
  'buttered',
  'appetizing',
  'appetising',
  'steakburgers'],
 'business': ['⁃', 'that', '∈ℝ', 'even', 'but'],
 'comedy': ['༺།།ༀ་ཨཱཿ་ཧཱུྃ།།འཚེར།།།།སར་ཝ་མང་ག་ལམ།།༻',
  'bloffwitch',
  'bones+longing',
  'blitzgiving',
  'swezzle'],
 'sports': ['⁃',
  '民盟',
  '༺།།ༀ་ཨཱཿ་ཧཱུྃ།།འཚེར།།།།སར

In [24]:
print(list(syn_dict.keys()))

['politics', 'wellness', 'entertainment', 'travel', 'beauty', 'parenting', 'healthy', 'queer', 'food', 'business', 'comedy', 'sports', 'black', 'home', 'parents', 'worldpost', 'weddings', 'women', 'impact', 'divorce', 'crime', 'media', 'weird', 'green', 'religion', 'style', 'science', 'worldnews', 'taste', 'tech', 'money', 'arts', 'fifty', 'goodnews', 'culture', 'environment', 'college', 'latino', 'education']


## Caching Human-Understandable Synonyms

Where "synonym" refers to similar words thematically.

In [86]:
# Good synonyms: include all keywords from og 20news title
sample_synonyms = [
    ['mideast', 'saudi', 'opec', 'arab', 'politics'],
    ['space', 'astronomy', 'cosmology', 'science', 'planet'],
    ['sale', 'resale', 'purchase', 'buy', 'market'],
    ['politics', 'government', 'policy', 'talk', 'campaign'],
    ['graphics', 'computer', 'image', 'png', 'visuals'],
    ['cryptography', 'cybersecurity', 'primes', 'cryptanalysis', 'science'],
    ['windows', 'microsoft', 'ten', 'computer', 'version'],
    ['microsoft', 'os', 'computer', 'windows', 'software'],
    ['guns', 'talk', 'politics', 'firearm', 'weapon'],
    ['religion', 'talk', 'belief', 'god', 'philosophy'],
    ['autos', 'recreation', 'car', 'vehicle', 'drive'],
    ['medicine', 'science', 'doctor', 'pharmaceutical', 'disease'],
    ['mac', 'computer', 'system', 'hardware', 'apple'],
    ['electronics', 'science', 'digital', 'devices', 'technology'],
    ['hockey', 'sport', 'recreation', 'skating', 'ice'],
    ['atheism', 'alternative', 'agnosticism', 'apatheism', 'areligious'],
    ['motorcycles', 'recreation', 'motorbike', 'scooter', 'moped'],
    ['pc', 'ibm', 'hardware', 'system', 'computer'],
    ['baseball', 'sport', 'rec', 'mlb', 'softball'],
    ['christian', 'religion', 'society', 'catholic', 'protestant']
]

In [53]:
syns_hpost = [['politics', 'policy', 'government', 'diplomacy', 'legislature'], 
              ['wellness', 'health', 'wellbeing', 'fitness', 'relaxation'], 
              ['entertainment', 'media', 'celebrities', 'leisure', 'hollywood'], 
              ['travel', 'voyage', 'touring', 'globetrotting', 'trip'], 
              ['beauty', 'makeup', 'style', 'glamour', 'skincare'], 
              ['parenting', 'kids', 'family', 'discipline', 'childrearing'], 
              ['healthy', 'living', 'fitness', 'diet', 'nutrition'], 
              ['queer', 'lgbtq', 'gay', 'trans', 'sexuality'], 
              ['food', 'cooking', 'recipe', 'nutrition', 'kitchen'], 
              ['business', 'corporation', 'employment', 'entrepreneur', 'shop'], 
              ['comedy', 'humor', 'entertainment', 'funny', 'satire'], 
              ['sports', 'teams', 'athlete', 'fitness', 'competition'], 
              ['black', 'africanamerican', 'afroamerican', 'minority', 'race'], 
              ['home', 'house', 'family', 'residence', 'household'], 
              ['parents', 'parenting', 'elderly', 'mother', 'father'], 
              ['worldpost', 'international', 'news', 'world', 'huffingtonpost'], 
              ['weddings', 'marriage', 'newlywed', 'ceremony', 'bride'], 
              ['women', 'female', 'girl', 'feminism', 'lady'], 
              ['impact', 'effect', 'influence', 'movements', 'outcome'], 
              ['divorce', 'annulment', 'separation', 'breakup', 'adultery'], 
              ['crime', 'violence', 'felony', 'misdemeanour', 'offense'], 
              ['media', 'entertainment', 'reporter', 'journalism', 'news'], 
              ['weird', 'odd', 'strange', 'unexpected', 'abnormal'], 
              ['green', 'environment', 'nature', 'animals', 'habitat'], 
              ['worldpost', 'international', 'news', 'world', 'huffingtonpost'], 
              ['religion', 'christianity', 'islam', 'judaism', 'beliefs'], 
              ['style', 'fashion', 'skincare', 'clothes', 'haircare'], 
              ['science', 'experiments', 'discovery', 'technology', 'research'], 
              ['worldnews', 'world', 'news', 'international', 'countries'], 
              ['taste', 'recipes', 'cooking', 'cookware', 'kitchen'], 
              ['tech', 'technology', 'software', 'siliconvalley', 'internet'], 
              ['money', 'investing', 'finance', 'wealth', 'spending'], 
              ['arts', 'painting', 'dance', 'drawing', 'music'], 
              ['fifty', 'elderly', 'older', 'seniors', 'boomer'], 
              ['goodnews', 'positive', 'optimistic', 'feelgood', 'happy'], 
              ['culture', 'subculture', 'dance', 'society', 'music'], # arts and culture
              ['environment', 'sustainability', 'conservation', 'ecology', 'climate'], 
              ['college', 'tuition', 'university', 'degree', 'undergraduate'], 
              ['latino', 'hispanic', 'spanish', 'minority', 'race'], 
              ['culture', 'subculture', 'dance', 'society', 'music'], 
              ['education', 'scholarship', 'teaching', 'school', 'learning']
             ]
len(syns_hpost)

41

In [55]:
unique_classes = [syns[0] for syns in syns_hpost]
assert unique_classes == classes

syn_dict2 = dict(zip(classes, syns_hpost))
pd.DataFrame(syn_dict2)

Unnamed: 0,politics,wellness,entertainment,travel,beauty,parenting,healthy,queer,food,business,...,tech,money,arts,fifty,goodnews,culture,environment,college,latino,education
0,politics,wellness,entertainment,travel,beauty,parenting,healthy,queer,food,business,...,tech,money,arts,fifty,goodnews,culture,environment,college,latino,education
1,policy,health,media,voyage,makeup,kids,living,lgbtq,cooking,corporation,...,technology,investing,culture,elderly,positive,culture,sustainability,tuition,hispanic,scholarship
2,government,wellbeing,celebrities,touring,style,family,fitness,gay,recipe,employment,...,software,finance,dance,older,optimistic,dance,conservation,university,spanish,teaching
3,diplomacy,fitness,leisure,globetrotting,glamour,discipline,diet,trans,nutrition,entrepreneur,...,siliconvalley,wealth,society,seniors,feelgood,society,ecology,degree,minority,school
4,legislature,relaxation,hollywood,trip,skincare,childrearing,nutrition,sexuality,kitchen,shop,...,internet,spending,music,boomer,happy,music,climate,undergraduate,race,learning


In [58]:
syn_dict2

{'politics': ['politics', 'policy', 'government', 'diplomacy', 'legislature'],
 'wellness': ['wellness', 'health', 'wellbeing', 'fitness', 'relaxation'],
 'entertainment': ['entertainment',
  'media',
  'celebrities',
  'leisure',
  'hollywood'],
 'travel': ['travel', 'voyage', 'touring', 'globetrotting', 'trip'],
 'beauty': ['beauty', 'makeup', 'style', 'glamour', 'skincare'],
 'parenting': ['parenting', 'kids', 'family', 'discipline', 'childrearing'],
 'healthy': ['healthy', 'living', 'fitness', 'diet', 'nutrition'],
 'queer': ['queer', 'lgbtq', 'gay', 'trans', 'sexuality'],
 'food': ['food', 'cooking', 'recipe', 'nutrition', 'kitchen'],
 'business': ['business', 'corporation', 'employment', 'entrepreneur', 'shop'],
 'comedy': ['comedy', 'humor', 'entertainment', 'funny', 'satire'],
 'sports': ['sports', 'teams', 'athlete', 'fitness', 'competition'],
 'black': ['black', 'africanamerican', 'afroamerican', 'minority', 'race'],
 'home': ['home', 'house', 'family', 'residence', 'househol

In [56]:
def cache_syn_dict(classes, dict_to_cache, dest):
    """
    Caches dict of synonyms as {topic index: list of list of embeddings} to dest (str).
    """
    d = {}
    for i, topic in enumerate(classes):
        syns = dict_to_cache[topic]
        d[i] = [mappings[syn] for syn in syns]
        
    with open(dest, 'w') as json_file:
        json.dump(d, json_file)
    
    print('done caching')

In [57]:
cache_syn_dict(classes, syn_dict2, '../cached_embeds/huffpost_topic_embed_synonyms2_.json') 

done caching
