In [1]:
%matplotlib inline

In [2]:
import matplotlib.pyplot as plt

In [3]:
import pickle

In [4]:
import seaborn as sns

In [5]:
from sklearn.decomposition import PCA

In [6]:
from sklearn.manifold import TSNE
import numpy as np
import gensim

# Path to the trained word-embedding model file that is loaded below.
wem_newpath = "../Charter-school-identities/data/wem_model_train250_nostem_unlapped_300d.txt"

# Read the embeddings into a gensim KeyedVectors object.  binary=False is the
# loader's default; it is spelled out here only to make the file format explicit.
model = gensim.models.KeyedVectors.load_word2vec_format(
    fname=wem_newpath,
    binary=False,
)

## PCA followed by TSNE visualization of HDBSCAN Clustered WordVecs

The following normalizes the 300k+ word vectors (300 dimensions each) to unit length and keeps the ~40K words that HDBSCAN assigned to a cluster. PCA then reduces those vectors from 300 to 50 dimensions, and the result is embedded in 2-D with t-SNE. t-SNE needs input of at most ~50 dimensions and a limited number of points to run in a reasonable amount of time.

In [7]:
# Stack the vector of every vocabulary entry into one matrix (one row per word).
word_vecs = model[model.vocab]

# Euclidean length of each row.  (Despite the name, these are L2 norms, not sums.)
row_sums = np.linalg.norm(word_vecs, axis=1)

# Divide every row by its own length; reshaping the norms into a column lets
# NumPy broadcast the division across each row.
unit_vecs = word_vecs / row_sums.reshape(-1, 1)

In [8]:
# Load the previously saved cluster labels for the full model.
# Presumably one HDBSCAN label per vocabulary word, in vocabulary order -- TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code; only unpickle files
# that were produced by this project.
with open('fullmodel_labels', mode='rb') as label_file:
    full_labels = pickle.load(label_file)

In [9]:
# Indices of the vocabulary entries whose label is not -1 (i.e. the words that
# were assigned to some cluster), then the corresponding words themselves.
clustered_idx = np.flatnonzero(full_labels != -1)
words = np.array(list(model.vocab))[clustered_idx]

In [10]:
# Unit-length vectors of the words whose label is not -1 (the clustered words).
clustered_idx = np.flatnonzero(full_labels != -1)
label_vecs = unit_vecs[clustered_idx]

In [11]:
# Reduce the clustered word vectors to 50 principal components.
# random_state is fixed so that the projection is reproducible even when
# scikit-learn selects its randomized SVD solver for a matrix of this size.
pca = PCA(n_components=50, random_state=0)
pca.fit(label_vecs)
transformed = pca.transform(label_vecs)

In [None]:
# Embed the clustered words in 2-D with t-SNE.
# The input is `transformed` (the 50-dimensional PCA output), as described in
# the section text; the previous version passed the 300-dimensional
# `label_vecs`, which left the PCA step unused.
# random_state is fixed so that re-running the cell gives the same layout.
projection = TSNE(perplexity=50.0, random_state=0).fit_transform(transformed)

# Cache the result on disk (np.save appends the ".npy" extension).
# The original run on the 300-d vectors took around 30 min.
np.save('TSNE_approx', projection)

## Results

In [None]:
# Reload the cached t-SNE coordinates written by np.save('TSNE_approx', ...),
# so the slow embedding cell does not have to be re-run.
projection = np.load('TSNE_approx.npy')

In [None]:
# Number of rows in the loaded embedding (one row per embedded word vector).
len(projection)

In [None]:
import random

# How many embedded points to draw for the scatter plot.
N_PLOT_SAMPLES = 1000

# Pick a random subset of row indices into `projection`.
# A dedicated, seeded generator keeps the sample (and therefore the figure)
# identical across runs without touching the global `random` state.  The
# sample size is capped at the number of available rows so that a small
# embedding does not raise ValueError.
sample_ind = random.Random(0).sample(
    range(len(projection)),
    min(N_PLOT_SAMPLES, len(projection)),
)

In [None]:
# Sanity check: how many indices were sampled.
len(sample_ind)

In [None]:
# Scatter plot of the sampled t-SNE points, coloured by cluster label.
# (Original author's note: not really informative.)
#
# Only non-negative labels are kept, which matches the rows of `projection`
# (it was built from the words whose label is not -1).
clustered_labels = np.asarray(full_labels)
clustered_labels = clustered_labels[clustered_labels >= 0]

# One palette entry per cluster id.  The previous version used
# len(set(labels)), but `labels` is not defined at this point in a fresh run,
# and where it is defined later it is a word -> label dict, so that expression
# counted vocabulary words instead of clusters.
n_clusters = int(clustered_labels.max()) + 1
color_palette = sns.color_palette(n_colors=n_clusters)

# Colour of each sampled point, looked up by its cluster id: shape (n_samples, 3).
cluster_colors = np.asarray(color_palette)[clustered_labels[sample_ind]]

sampled_points = projection[sample_ind]
plt.figure(figsize=(16, 16))
# A single vectorised scatter call replaces one call per point, and passing an
# (n, 3) colour array avoids the ambiguity of a bare RGB triple in `c`.
plt.scatter(sampled_points[:, 0], sampled_points[:, 1],
            s=50, linewidth=0, c=cluster_colors, alpha=.5)
# To label the points, annotate each one with its word, e.g.:
#     for w, (x, y) in zip(np.array(words)[sample_ind], sampled_points):
#         plt.annotate(w, (x, y))
plt.show()

In [8]:
# NOTE(review): absolute, machine-specific path; consider making it relative
# to the repository so the notebook runs elsewhere.
dict_path = '/home/jovyan/work/Charter-school-identities/dicts/'
dict_names = ['inquiry', 'discipline']


def _load_dictionary(name, n_core=30):
    """Read one dictionary file and look its entries up in the embedding model.

    Parameters
    ----------
    name : str
        Dictionary name; the file read is ``dict_path + name + '.txt'``,
        one entry per line.
    n_core : int
        Entries on the first ``n_core`` lines of the file are additionally
        collected as "core" entries (if they are in the model).

    Returns
    -------
    tuple
        ``(entries, vecs, core_vecs, core_words, found_words)`` where
        ``entries`` is every line of the file, ``found_words``/``vecs`` are the
        entries present in the model and their vectors, and
        ``core_words``/``core_vecs`` are the subset that came from the first
        ``n_core`` lines.
    """
    with open(dict_path + name + '.txt') as f:
        entries = f.read().splitlines()

    vecs, found_words, core_vecs, core_words = [], [], [], []
    for line_no, entry in enumerate(entries):
        try:
            vec = model[entry]
        except KeyError:
            # Entry is not in the model's vocabulary: skip it.  Only KeyError is
            # caught so that unrelated failures are no longer silently hidden.
            continue
        vecs.append(vec)
        found_words.append(entry)
        # "Core" membership depends on the position in the file, not on how
        # many entries were found so far.
        if line_no < n_core:
            core_vecs.append(vec)
            core_words.append(entry)
    return entries, vecs, core_vecs, core_words, found_words


# load dicts/wordvecs
# Each list below holds one element per dictionary, in dict_names order.
# The per-dictionary temporaries live inside the helper so they no longer
# overwrite the notebook-level `word_vecs` embedding matrix.
dict_list = []
word_vecs_list = []
core_list = []
core_word_list = []
word_list = []
for name in dict_names:
    entries, vecs, core_vecs, core_words, found_words = _load_dictionary(name)
    dict_list.append(entries)
    word_vecs_list.append(vecs)
    core_list.append(core_vecs)
    core_word_list.append(core_words)
    word_list.append(found_words)

## Dictionary Word Clusters

The following code checks which clusters (if any) the dictionary words were assigned to. The discipline dictionary contained 5 words that were clustered, while the inquiry dictionary contained none.

In [9]:
# Word -> cluster label lookup.
# Pairs vocabulary words with full_labels positionally, so it assumes
# full_labels is stored in vocabulary iteration order -- TODO confirm.
labels = {vocab_word: label for vocab_word, label in zip(model.vocab, full_labels)}

In [10]:
# word_list follows dict_names order: index 0 is 'inquiry', index 1 is 'discipline'.
inquiry_words, discipline_words = word_list[0], word_list[1]

# Cluster label of every dictionary word that was found in the model.
inq_clusters = [labels[entry] for entry in inquiry_words]
dis_clusters = [labels[entry] for entry in discipline_words]

In [59]:
# Distinct cluster labels that occur among the discipline-dictionary words.
set(dis_clusters)

{-1, 178, 266, 342, 389}

In [11]:
# Positions in dis_clusters (and hence in word_list[1]) whose label is 266.
np.where(np.array(dis_clusters) == 266)

(array([194, 308]),)

In [12]:
# Positions in dis_clusters (and hence in word_list[1]) whose label is 342.
np.where(np.array(dis_clusters) == 342)

(array([161]),)

In [13]:
# Positions in dis_clusters (and hence in word_list[1]) whose label is 389.
np.where(np.array(dis_clusters) == 389)

(array([353]),)

In [27]:
# The discipline-dictionary words whose cluster label is 266.
np.array(word_list[1])[np.where(np.array(dis_clusters) == 266)]

array(['large_stains', 'strapless_tops'],
      dtype='<U37')

In [25]:
# Entry 266 of word_clusters: clothing / dress-code terms; contains the
# discipline words large_stains and strapless_tops.
# NOTE(review): word_clusters is only built in a later cell of this notebook,
# so this cell fails on a fresh top-to-bottom run.
word_clusters[266]

array(['navy_black', 'solid_in_color—red', 'pants_capris', 'solid_brown',
       'skirts_or_jumpers', 'sleeveless_shirts', 'blue_socks',
       'halter_tops', 'jumpers_and_shorts', 'y5-8', 'tan_brown',
       'oxford_cloth', 'socks_tights', 'pants_navy', 'bdu',
       'including_soles', 'skirts_shorts', 'knit_shirts', 'shoestrings',
       'tights_or_socks', 'mini-skirts_minidresses', 'capri_slacks',
       'plaid_skirts', 'plain_collared', 'backless_shoes', 'white_maroon',
       'polo-style', 'v-neck_sweater', 'white_or_burgundy', 'white_beige',
       'turtle_neck', 'dress_skirt', 'collars_or_polos',
       'leggings_are_not_permitted', 'sweater_vests', 'pim_hooded',
       'long-sleeve', 'nylons', 'button-down', 'black_slacks',
       'socks_solid', 'white_tights', 'ankle_socks', 'teal_black',
       'camisoles', 'twill_pants', 'earrings_and_other_body', 'tan_khaki',
       'round_neck', 'button-up', 'skorts_shorts', 'tan_navy',
       'mid-thigh_or_longer', 'button-up_collared', '

In [24]:
# Entry 342 of word_clusters: mostly phone numbers and contact strings;
# contains the discipline word 'hotel_pans'.
word_clusters[342]

array(['fax_202-690-7442', 'call_303-282-6437', '469-274-4876',
       '623-478-2344_ext', 'calling_833-216-safe',
       'becky.collier_thevanguardschool.com', '504-373-6269',
       'call_300-1506', 'defamatory_material', 'call_464-0771',
       'hotel_pans', 'appears_dazed', '1-888-722-1223_or_email',
       '671-5470', 'carton_of_low-fat', '505-373-0053', '612-314-7603',
       '612-314-7604'],
      dtype='<U431')

In [23]:
# Entry 389 of word_clusters; contains the discipline word 'utilizable'.
word_clusters[389]

array(['post_navigation', 'constellationschools.com', 'priority=_name=',
       ..., 'sportives', 'culturel', 'périscolaires'],
      dtype='<U431')

In [60]:
# Entry 178 of word_clusters: the cluster containing the discipline entry
# 'smoke_a_cigarette'; the other members look like item/section codes.
word_clusters[178]

array(['hs_a.61/ms', 'smoke_a_cigarette', 'property_were_you_harassed',
       'd.6', 'a.43-47//ms', 'a.48-52', 'a.63-71/ms', 'a.52-56', '65/ms',
       'a.53/ms', 'a.54/ms', 'a.64/ms', 'a.73-75/ms', 'a.58-60', 'a.55/ms',
       'a.107/ms', '98/ms', '99/ms', 'a.35-36/ms', 'a.34-36', 'a.61-62/ms',
       'a.50-51', 'a.72/ms', 'a.103-105/ms', 'a.85-87', 'a.100-102/ms',
       'a.82-84', 'a.106,108/ms', 'a.109-110/ms', 'a.91-92',
       'a.112,111/ms', 'a.113-118/ms', 'a.95-100', 'a.122/ms', 'a.120/ms',
       'a.113-117/ms', 'a.95-99', 'a.113/ms', 'a.123/ms', 'a.127-131//ms',
       'a.109-113', 'd.3'],
      dtype='<U431')

## Examining Some Word Clusters From HDBSCAN by Hand

In [8]:
# Group the vocabulary words by cluster label: one array of words per label.
#
# The vocabulary array is built once instead of once per cluster.
vocab_words = np.array(list(model.vocab))
label_array = np.asarray(full_labels)

# Fix the order explicitly instead of relying on set iteration order:
# non-negative cluster ids first, ascending, then any negative (noise) label.
# Later cells index this list by cluster id (e.g. word_clusters[266]), which
# is only valid if position k holds cluster k -- true here as long as the ids
# run 0..max without gaps.
cluster_ids = np.unique(label_array)
cluster_ids = np.concatenate([cluster_ids[cluster_ids >= 0],
                              cluster_ids[cluster_ids < 0]])

word_clusters = [vocab_words[np.where(label_array == cid)[0]] for cid in cluster_ids]

In [10]:
# Housing / living situations (motels, shelters, campgrounds, ...)?
word_clusters[4]

array(['living_in_motels', 'emergency_or_transitional', 'hotels_trailer',
       'parks_or_camping', 'adequate_accommodations',
       'shelters_are_abandoned', 'hospitals_or_are_awaiting',
       'accommodations_are_living', 'adequate_accommodation',
       'abandoned_in_a_hospital', 'parks_or_campgrounds',
       'shelters_or_are_abandoned', 'camping_grounds', 'motels_trailer'],
      dtype='<U431')

In [15]:
# Tokens of the form x<number>_x<number>; these look like scraped numeric
# artifacts rather than real phrases.
word_clusters[5]

array(['x1440_x2160', 'x2880_x3600', 'x4320_x5040', 'x5760_x6480',
       'x7200_x7920', 'x8640_qr', 'x0_x720', 'x9360_x10080',
       'x10800_x11520', 'x12240_x12960', 'x13680_x14400', 'x15120_x15840'],
      dtype='<U431')

In [16]:
# Biology: body systems and bodily functions.
word_clusters[6]

array(['brain_respiratory', 'functions_of_the_immune', 'digestive_bowel',
       'bladder_neurological', 'circulatory_endocrine', 'function_bowel',
       'function_neurological', 'function_circulatory', 'growth_bladder',
       'function_respiratory', 'function_endocrine', 'function_digestive',
       'reproductive_function'],
      dtype='<U431')

In [19]:
# Personal names (first_last).
word_clusters[9]

array(['matt_jenkins', 'cj_fox', 'stephanie_olson', 'ridgedale_athletics',
       'ana_lopez', 'kari_rivers', 'lilith_werner', 'coumba_gueye',
       'kristie_sweeney', 'andrea_fisher', 'sebastien_babolat',
       'celine_dissel', 'baldomero_alvarez', 'fan_yang', 'zheng_wang',
       'perla_rivera', 'colleen_mcintee', 'adeline_leborgne',
       'kevin_carlino', 'abi_osthimer', 'debbie_harrington',
       'jeannie_brooks', 'haoli_collings', 'tzu_shan_huang',
       'alisa_kaczorowski', 'helen_blue-redner', 'wanda_aponte',
       'rebecca_schneider', 'christa_braun', 'gregoire_kande',
       'lorena_veresh', 'foz', 'yachin_lee', 'alyssa_dillard',
       'sabrina_poirier', 'tyler_berthelsen', 'nan_nan', 'christellem',
       'naxin_fan', 'karla_tarkington', 'april_wolcott'],
      dtype='<U431')

In [20]:
# More personal names, mixed with a few site-navigation phrases
# (e.g. 'older_post', 'next_entries').
word_clusters[10]

array(['older_post', 'next_entries', 'newer_entries',
       'apply_on_applitrack', 'tail-waggin_tutors', 'kelly_mcnerney',
       'erin_ellinger', 'cherie_mcelroy', 'shaundria_wise', 'tanner_lynn',
       'brad_fincher', 'lindsey_elders', 'lauriel_bristol',
       'levi_molenhour', 'brittny_meredith', 'terrell_borum', 'amber_hurd',
       'dave_funk', 'connor_kirk', 'kadence_niziurski',
       'melissa_strohmeyer', 'timothy_merritt', 'sumeet_sidhu',
       'nathan_maul', 'russell_cohoon', 'audrey_wiedemann',
       'rebekah_lischwe', 'travis_hollowood', 'erica_leong', 'steve_doll',
       'maegan_ruiz', 'chris_henkel', 'kat_mowczko'],
      dtype='<U431')

In [21]:
# Legal jargon (liability / warranty disclaimer language).
word_clusters[11]

array(['limitation_of_liability', 'jurisdictions_do_not_allow',
       'exclusion_of_certain', 'limitation_or_exclusion',
       'warranties_these_exclusions', 'accordingly_some_of_the_foregoing',
       'allow_the_limitation', 'exclusion_of_incidental',
       'consequential_or_other_types',
       'damages_so_some_of_the_above_limitations', 'incidental_damages',
       'liability_for_consequential', 'consequential_or_incidental',
       'limitations_on_implied', 'warranties_or_limitations',
       'exclusion_or_limitation', 'warranties_or_the_exclusion',
       'damages_so_the_limitations', 'states/jurisdictions_do_not_allow',
       'exclusion_of_liability', 'damages_the_above_limitation'],
      dtype='<U431')

In [22]:
# Non-English fragments (they appear to be Danish), some of them mis-encoded;
# noise that should probably be cleaned out of the corpus.
word_clusters[12]

array(['finde_den', 'plan_er', 'stand_til', 'rigtige_kostume', 'med_en',
       'masse_kvalitet', 'og_opfylde', 'kundernes_behov', 'tøj_muligheder',
       'vi_sætter', 'en_ære', 'nødvendig_og', 'bør_være',
       'tã¸j_muligheder', 'vi_sã¦tter', 'en_ã¦re', 'nã¸dvendig_og',
       'bã¸r_vã¦re'],
      dtype='<U431')

In [23]:
# Drug addiction and mental-health treatment phrases.
word_clusters[13]

array(['opiate_addiction', 'center_in_belton', 'benzo_addiction',
       'ptsd_treatment', 'oxycodone_addiction', 'percocet_addiction',
       'ocd_treatment', 'self-harm_treatment', 'dementia_treatment',
       'painkiller_addiction'],
      dtype='<U431')

In [26]:
# Every remaining entry from index 14 onward (this produces a very long output).
word_clusters[14:]

[array(['til_vores', 'dette_produkt', 'blev_tilføjet', 'katalog_søndag',
        'katalog_torsdag', 'katalog_lørdag', 'katalog_onsdag',
        'katalog_tirsdag', 'katalog_mandag', 'katalog_fredag'],
       dtype='<U431'),
 array(['ee_ben', 'oasdi_medicare', 'oasdi_reg', 'strs_certificated',
        'benefits_cert', 'media_techno', 'cert_instructional',
        'cert_other_general', 'classified_plant', 'supervision_and_adm',
        'adm3502'],
       dtype='<U431'),
 array(['deductible_the_member', 'out-of-network_inpatient',
        'outpatient_copay', 'diagnostic_outpatient',
        'delivery_and_postpartum', 'deductible_diagnostic',
        'procedures_in-network', 'out-of-network_diagnostic',
        'complex_imaging', 'inpatient_maternity', 'freestanding_facility',
        'inpatient_after_deductible', 'x-ray_covered',
        'deductible_outpatient', 'inpatient_covered', 'transplants_covered'],
       dtype='<U431'),
 array(['including_asperger', 'deficiency_syndrome', 'major_d