### Imports

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas()

import torch
import torch.nn as nn
import torch.optim as optim

  from pandas import Panel


### Model Definition

In [21]:
class DocModel(nn.Module):
    def __init__(self, input_size, hidden_sizes):
        super(DocModel, self).__init__()
        self.input_size = input_size
        self.hidden_sizes = hidden_sizes
        
        modlist = []
        for i in range(len(hidden_sizes)):
            if i == 0:
                modlist.append(nn.Sequential(
                    nn.Linear(self.input_size, self.hidden_sizes[i]),
                    nn.LeakyReLU()))
            else:
                modlist.append(nn.Sequential(
                    nn.Linear(self.hidden_sizes[i-1], self.hidden_sizes[i]),
                    nn.LeakyReLU()))
                               
        self.docvecpipeline = nn.ModuleList(modlist)
        
        self.regressor = nn.Sequential(
            nn.Linear(self.hidden_sizes[-1], 1)
        )
        
    def forward(self, invecs):
        hidden = invecs
        for i, layer in enumerate(self.docvecpipeline):
            hidden = layer(hidden)
        output = self.regressor(hidden)
        return output

### Define Data to Use and Data Prep Pipeline

In [3]:
country = 'rwanda'
country_abrev = 'RWA'
year = 2010

In [4]:
country_wiki = pd.read_csv(f'articles/{country}_Wiki.csv')
dhs_clusts = dhs_clusts = pd.read_csv('data/dhs_clusters.csv')
country_clusts = country_clusts = dhs_clusts[(dhs_clusts['country']== country) & (dhs_clusts['year']== year)]
# Convert string to np array because it was stored stupidly. Will fix later
country_wiki['embedding'] = country_wiki['embedding'].apply(lambda x: np.fromstring(x[1:-1], 
                                                                                    sep=' '))

In [9]:
def compute_distance(c1, c2):
    '''
    Compute approx distance between two coords given in (lat, long)
    format. 
    '''
    # approximate radius of earth in km
    R = 6373.0
    lat1 = np.radians(c1[0])
    lon1 = np.radians(c1[1])
    lat2 = np.radians(c2[0])
    lon2 = np.radians(c2[1])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c
    return distance

def test_compute_dist():
    '''
    Unit test for compute_distance function.
    
    Uses Berkeley School of Information and Moscone Center South in San Fransisco
    as a test case. Distance from Google maps. 
    '''
    moscone_south = (37.783957939867015, -122.40107973374062) # lat, long
    i_school = (37.871363468005065, -122.25852213941603) 
    correct_dist = 15.6 # in km
    computed_dist = compute_distance(moscone_south, i_school)
    np.testing.assert_approx_equal(computed_dist, correct_dist, significant=2)

def get_dists_to_articles(country_clust, country_wiki):
    '''
    For a given DHS cluster, get the distance to each of the wiki 
    articles and return it as a numpy array.
    '''
    dists = []
    for i in range(len(country_wiki)):
        dist = compute_distance((country_clust['lat'], country_clust['lon']),
                                (country_wiki['latitude'].iloc[i], country_wiki['longitude'].iloc[i]))
        dists.append(dist)
    return np.array(dists)

def get_closest_n(dists_to_articles, n=10):
    '''
    Get the n closest articles to a given cluster.
    Returns the indices ofthe articles and the approx distances in km.
    '''
    top_inds = np.argsort(dists_to_articles)[:n]
    #np.argpartition(dists_to_articles, n)[:n]
    return top_inds, dists_to_articles[top_inds]

def get_input_tensor(country_clust,  country_wiki):
    '''
    For a given DHS cluster, get the input tensor that 
    we will feed to the model.
    '''
    embedds = []
    for i, closest_idx in enumerate(country_clust['closest_article_idxs']):
        embedds.append(country_wiki['embedding'].iloc[closest_idx])
    dists = torch.tensor(country_clust['closest_article_dists']).float()
    embedds = torch.flatten(torch.tensor(np.array(embedds))).float()
    return embedds, dists

In [6]:
country_clusts['dists_to_articles'] = \
country_clusts.progress_apply(lambda x : get_dists_to_articles(x, country_wiki), axis=1)

100%|██████████| 492/492 [00:03<00:00, 147.21it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  country_clusts['dists_to_articles'] = \


In [10]:
country_clusts[['closest_article_idxs', 'closest_article_dists']] = \
country_clusts.progress_apply(lambda x: get_closest_n(np.array(x['dists_to_articles'])), 
                              axis=1, result_type='expand')

100%|██████████| 492/492 [00:00<00:00, 9121.76it/s]


In [11]:
country_clusts[['input_embedds', 'input_dists']] = \
country_clusts.progress_apply(lambda clust: get_input_tensor(clust, country_wiki), 
                     axis=1, result_type='expand')

100%|██████████| 492/492 [00:00<00:00, 1875.67it/s]


### Define model hyperparams

In [23]:
embed_size = 768
num_close_arts = 10

#DocModelNet = DocModel(10, [5, 2])

DocModelNet = DocModel(embed_size * num_close_arts, [512, 512, 256, 32])

lr = 0.0001
criterion = nn.MSELoss()

optimizer = optim.Adam(DocModelNet.parameters(), lr=lr)

num_epochs = 10

device = ("cuda" if torch.cuda.is_available() else "cpu")

DocModelNet.to(device)

DocModel(
  (docvecpipeline): ModuleList(
    (0): Sequential(
      (0): Linear(in_features=7680, out_features=512, bias=True)
      (1): LeakyReLU(negative_slope=0.01)
    )
    (1): Sequential(
      (0): Linear(in_features=512, out_features=512, bias=True)
      (1): LeakyReLU(negative_slope=0.01)
    )
    (2): Sequential(
      (0): Linear(in_features=512, out_features=256, bias=True)
      (1): LeakyReLU(negative_slope=0.01)
    )
    (3): Sequential(
      (0): Linear(in_features=256, out_features=32, bias=True)
      (1): LeakyReLU(negative_slope=0.01)
    )
  )
  (regressor): Sequential(
    (0): Linear(in_features=32, out_features=1, bias=True)
  )
)

### Train

In [24]:
# avg loss per epoch
epoch_losses = []

# Training Loop
print("Starting Training Loop...")
# For each epoch
for epoch in range(num_epochs):
    # list of individ cluster losses for this epoch
    iter_losses = []
    # for each training point
    for i in range(len(country_clusts)):
        # clear gradient fro prev instance
        DocModelNet.zero_grad()
        # Get inputs
        #embedd = country_clusts['input_dists'].iloc[i].to(device)
        embedd = country_clusts['input_embedds'].iloc[i].to(device)
        # get target wealth value 
        target = torch.tensor(country_clusts['wealthpooled'].iloc[i]).float().to(device)
        # get model output
        output = DocModelNet(embedd)
        # Calculate loss based on this output and the loss funct
        err = criterion(output[0], target)
        # Calculate gradients
        err.backward()
        # Update network params using the optimizer
        optimizer.step()
        # Save Loss for plotting and analysis
        iter_losses.append(err.item())

    epoch_losses.append(np.mean(iter_losses))
    print(f"epoch {epoch}    avg loss {epoch_losses[epoch]}\n")

Starting Training Loop...
epoch 0    avg loss 0.2547353473017306

epoch 1    avg loss 0.20810543663274036

epoch 2    avg loss 0.17284417288137788

epoch 3    avg loss 0.14929311266750217

epoch 4    avg loss 0.13553155002154804

epoch 5    avg loss 0.15152846720673865

epoch 6    avg loss 0.09661955052957016

epoch 7    avg loss 0.09213685304240332

epoch 8    avg loss 0.07858333206451555

epoch 9    avg loss 0.07665008246217685



In [25]:
def get_pred_wealth(clust, model, country_wiki):
    #embedd = clust['input_dists'].to(device)
    embedd = clust['input_embedds'].to(device)
    
    #print(embedd)
    output = model(embedd)
    return output.item()

In [26]:
country_clusts['pred_wealth'] = \
country_clusts.apply(lambda clust : get_pred_wealth(clust, DocModelNet, country_wiki), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  country_clusts['pred_wealth'] = \


In [29]:
country_clusts['pred_wealth'].describe()

count    492.000000
mean      -0.465055
std        0.344402
min       -0.767587
25%       -0.630059
50%       -0.578560
75%       -0.497021
max        0.753728
Name: pred_wealth, dtype: float64

In [28]:
country_clusts['wealthpooled'].describe()

count    492.000000
mean      -0.481522
std        0.497298
min       -1.041967
25%       -0.765138
50%       -0.647184
75%       -0.463005
max        1.894964
Name: wealthpooled, dtype: float64