In [1]:
import pandas as pd
import util_data
import os
import numpy as np

In [2]:
train_path = os.path.join(os.path.curdir, 'processed', 'split', 'ClusterLevelCombined_5yrIMR_MatEd_train.csv')
article_embeddings_dir = os.path.join(os.curdir, 'raw', 'wikipedia', 'doc2vec_embeddings')
cluster_article_rank_dist_path = os.path.join(os.curdir, 'processed',
                                                       'ClusterNearestArticles_Rank_Dist.csv')
task = 'MatEd'
country_train = ['Ghana']

In [3]:
dataset = util_data.DHS_Wiki_Dataset(DHS_csv_file=train_path,
                        emb_root_dir=article_embeddings_dir, cluster_rank_csv_path=cluster_article_rank_dist_path,
                        emb_dim=300, n_articles=5, include_dists=False,
                        country_subset=country_train, task=task,
                        transforms=None)

In [55]:
embs = []
for filename in os.listdir(article_embeddings_dir):
    article_idx = filename.split('.')[0]
    emb = np.load(os.path.join(article_embeddings_dir, filename))
    embs.append(emb)
emb_mat = np.matrix(embs)
emb_mat

matrix([[ 0.42156854, -0.07489026,  0.12658575, ...,  0.06791696,
         -0.16471495, -0.03271519],
        [ 0.46025574, -0.48072433, -0.3189665 , ...,  0.17713244,
          0.01003557,  0.5328643 ],
        [-0.00242988,  0.35542473, -0.20284058, ..., -0.36913994,
         -0.21822122,  0.03310431],
        ...,
        [ 0.45647347,  0.18423879,  0.03892373, ...,  0.31584007,
          0.22475618,  0.04668438],
        [ 0.14622222, -0.14869782, -0.22864702, ...,  0.24685656,
          0.06191548,  0.54626304],
        [ 0.1584616 ,  0.1950602 , -0.10463457, ...,  0.43515274,
          0.18251257,  0.2221279 ]], dtype=float32)

In [56]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(emb_mat)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [57]:
pca.explained_variance_ratio_

array([0.15167244, 0.06533204], dtype=float32)

In [64]:
xs, ys, scores = [], [], []
coords = set()
for cluster in dataset:
    article_embs, label = cluster['x'], cluster['y']
    article_embs = article_embs.reshape((-1, 300)).mean(axis=0, keepdims=True)
    pc1, pc2 = pca.transform(article_embs)[0]
    coords.add((pc1,pc2))
    xs.append(pc1)
    ys.append(pc2)
    ed_score = 0
    for i in range(4):
        ed_score += i * label[i]
    scores.append(ed_score)


In [59]:
import plotly.graph_objects as go

In [92]:
#### print(len(xs), len(ys), len(scores))
fig = go.Figure(data=go.Scatter(
    x = xs,
    y = ys,
    mode='markers',
    marker=dict(
        size=16,
        color=scores, #set color equal to a variable
        colorscale='Viridis', # one of plotly colorscales
        showscale=True,
        colorbar_title = 'Maternal Education Score',
        cmin = 0,
        cmax = 3
    )
))

fig.update_layout(title='Test',
                plot_bgcolor='rgba(0,0,0,0)',
                xaxis=dict(title='First Principal Component'),
                yaxis=dict(title='Second Principal Component'))
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Gray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Gray')
fig.show()

In [93]:
len(coords)

24

In [94]:
dataset2 = util_data.DHS_Wiki_Dataset(DHS_csv_file=train_path,
                        emb_root_dir=article_embeddings_dir, cluster_rank_csv_path=cluster_article_rank_dist_path,
                        emb_dim=300, n_articles=5, include_dists=False,
                        country_subset=country_train, task='IMR',
                        transforms=None)

In [97]:
dataset2[0]['y'][0]

0.0

In [7]:
from sklearn.decomposition import PCA
def run_pca(emb_mat, n_components=2):
    pca = PCA(n_components=n_components)
    pca.fit(emb_mat)
    return pca

In [8]:
embs = []
scores = []
for cluster in dataset:
    article_embs, label = cluster['x'], cluster['y']
    embs.append(article_embs)
    if task == 'MatEd':
        ed_score = 0
        for i in range(4):
            ed_score += i * label[i]
        scores.append(ed_score)
    else:
        scores.append(label[0])
emb_mat = np.matrix(embs)
pca = run_pca(emb_mat)
pcs = pca.transform(emb_mat)
pcs

array([[-2.0211778e+00,  4.8067746e+00],
       [-1.9590318e+00,  4.9468350e+00],
       [-4.8526354e+00, -5.5764169e-03],
       [ 5.9910774e+00,  2.7303405e+00],
       [ 5.9910774e+00,  2.7303405e+00],
       [-2.5488336e+00,  4.2220263e+00],
       [ 2.7645797e-01, -2.0460765e+00],
       [ 2.7710705e+00, -4.6702051e+00],
       [ 2.7710705e+00, -4.6702051e+00],
       [-4.7222009e+00,  9.9534094e-03],
       [ 2.7645797e-01, -2.0460765e+00],
       [ 2.7645797e-01, -2.0460765e+00],
       [ 2.7710705e+00, -4.6702051e+00],
       [ 2.1439815e-01, -1.9513736e+00],
       [-2.0211778e+00,  4.8067746e+00],
       [ 2.7645797e-01, -2.0460765e+00],
       [ 5.9910774e+00,  2.7303405e+00],
       [ 5.9910774e+00,  2.7303405e+00],
       [-4.9191093e+00,  1.8244088e-03],
       [ 5.9910774e+00,  2.7303405e+00],
       [ 5.9910774e+00,  2.7303405e+00],
       [-4.8952050e+00,  1.6690886e-01],
       [ 5.9910774e+00,  2.7303405e+00],
       [ 2.7710705e+00, -4.6702051e+00],
       [ 7.59740