### Imports

In [138]:
import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas()

import torch
import torch.nn as nn
import torch.optim as optim

  from pandas import Panel


### Model Definition

In [38]:
class DocModel(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2):
        super(DocModel, self).__init__()
        self.input_size = input_size
        self.hidden_size1 = hidden_size1
        self.hidden_size2 = hidden_size2
        
        self.docvecpipeline = nn.Sequential(
            nn.Linear(self.input_size, self.hidden_size1),
            nn.ReLU(True),
            nn.Linear(self.hidden_size1, self.hidden_size2),
            nn.ReLU(True)
        )
        
        self.regressor = nn.Sequential(
            nn.Linear(self.hidden_size2, 1)
        )
        
    def forward(self, invecs, return_hidden=False):
        hidden = self.docvecpipeline(invecs)
        output = self.regressor(hidden)
        if return_hidden:
            return output, hidden
        else:
            return output

### Define Data to Use and Data Prep Pipeline

In [14]:
country = 'rwanda'
country_abrev = 'RWA'
year = 2010

In [24]:
country_wiki = pd.read_csv(f'articles/{country}_Wiki.csv')
dhs_clusts = dhs_clusts = pd.read_csv('data/dhs_clusters.csv')
country_clusts = country_clusts = dhs_clusts[(dhs_clusts['country']== country) & (dhs_clusts['year']== year)]
# Convert string to np array because it was stored stupidly. Will fix later
country_wiki['embedding'] = country_wiki['embedding'].apply(lambda x: np.fromstring(x[1:-1], 
                                                                                    sep=' '))

In [128]:
def compute_distance(c1, c2):
    '''
    Compute approx distance between two coords given in (lat, long)
    format. 
    '''
    # approximate radius of earth in km
    R = 6373.0
    lat1 = np.radians(c1[0])
    lon1 = np.radians(c1[1])
    lat2 = np.radians(c2[0])
    lon2 = np.radians(c2[1])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c
    return distance

def test_compute_dist():
    '''
    Unit test for compute_distance function.
    
    Uses Berkeley School of Infromation and Moscone Center South in San Fransisco
    as a test case. Distance from Google maps. 
    '''
    moscone_south = (37.783957939867015, -122.40107973374062) # lat, long
    i_school = (37.871363468005065, -122.25852213941603) 
    correct_dist = 15.6 # in km
    computed_dist = compute_distance(moscone_south, i_school)
    np.testing.assert_approx_equal(computed_dist, correct_dist, significant=2)

def get_dists_to_articles(country_clust, country_wiki):
    '''
    For a given DHS cluster, get the distance to each of the wiki 
    articles and return it as a numpy array.
    '''
    dists = []
    for i in range(len(country_wiki)):
        dist = compute_distance((country_clust['lat'], country_clust['lon']),
                                (country_wiki['latitude'].iloc[i], country_wiki['longitude'].iloc[i]))
        dists.append(dist)
    return np.array(dists)

def get_closest_n(dists_to_articles, n=10):
    '''
    Get the n closest articles to a given cluster.
    Returns the indices ofthe articles and the approx distances in km.
    '''
    top_inds = np.argpartition(dists_to_articles, n)[:n]
    return top_inds, dists_to_articles[top_inds]

def get_input_tensor(country_clust,  country_wiki):
    '''
    For a given DHS cluster, get the input tensor that 
    we will feed to the model.
    '''
    embedds = []
    for i, closest_idx in enumerate(country_clust['closest_article_idxs']):
        embedds.append(country_wiki['embedding'].iloc[closest_idx])
    dists = torch.tensor(country_clust['closest_article_dists']).float()
    embedds = torch.flatten(torch.tensor(np.array(embedds))).float()
    return embedds, dists

In [95]:
country_clusts['dists_to_articles'] = \
country_clusts.progress_apply(lambda x : get_dists_to_articles(x, country_wiki), axis=1)

100%|██████████| 492/492 [00:03<00:00, 157.68it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  country_clusts['dists_to_articles'] = country_clusts.progress_apply(lambda x : get_dists_to_articles(x, country_wiki), axis=1)


In [115]:
country_clusts[['closest_article_idxs', 'closest_article_dists']] = \
country_clusts.progress_apply(lambda x: get_closest_n(np.array(x['dists_to_articles'])), 
                              axis=1, result_type='expand')

100%|██████████| 492/492 [00:00<00:00, 15842.26it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [129]:
embedds, dists = get_input_tensor(country_clusts.iloc[0], country_wiki)

### Define model hyperparams

In [None]:
embed_size = 768
num_close_arts = 10


DocModelNet = DocModel(embed_size * num_close_arts, 512, 256)

lr = 0.001
criterion = nn.MSELoss()

optimizer = optim.Adam(DocModelNet.parameters(), lr=lr, betas=(0.5, 0.999))

num_epochs = 10

### Train

In [148]:
# avg loss per epoch
epoch_losses = []

# Training Loop
print("Starting Training Loop...")
# For each epoch
for epoch in range(num_epochs):
    # list of individ cluster losses for this epoch
    iter_losses = []
    # for each training point
    for i in range(len(country_clusts)):
        # clear gradient fro prev instance
        test_doc_model.zero_grad()
        # Get inputs
        embedd, dists = get_input_tensor(country_clusts.iloc[i], country_wiki)
        # get target wealth value 
        target = torch.tensor(country_clusts['wealthpooled'].iloc[i]).float()
        # get model output
        output = DocModelNet(embedd)
        # Calculate loss based on this output and the loss funct
        err = criterion(output, target)
        # Calculate gradients
        err.backward()
        # Update network params using the optimizer
        optimizer.step()
        # Save Loss for plotting and analysis
        iter_losses.append(err.item())

    epoch_losses.append(np.mean(iter_losses))
    print(f"{epoch=}\t{epoch_losses[epoch]=}")

Starting Training Loop...
epoch=0	epoch_losses[epoch]=3.3236665603452695
epoch=1	epoch_losses[epoch]=1.7000747878438802
epoch=2	epoch_losses[epoch]=0.6187574475565366
epoch=3	epoch_losses[epoch]=0.4425276971557496
epoch=4	epoch_losses[epoch]=0.48080637091821493
epoch=5	epoch_losses[epoch]=0.5059960214872095
epoch=6	epoch_losses[epoch]=0.2931360388643613
epoch=7	epoch_losses[epoch]=0.531150337622665
epoch=8	epoch_losses[epoch]=0.34341576027721343
epoch=9	epoch_losses[epoch]=0.3819233818119504
