In [79]:
import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas()

import torch
import torch.nn as nn
import torch.optim as optim

In [38]:
class DocModel(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2):
        super(DocModel, self).__init__()
        self.input_size = input_size
        self.hidden_size1 = hidden_size1
        self.hidden_size2 = hidden_size2
        
        self.docvecpipeline = nn.Sequential(
            nn.Linear(self.input_size, self.hidden_size1),
            nn.ReLU(True),
            nn.Linear(self.hidden_size1, self.hidden_size2),
            nn.ReLU(True)
        )
        
        self.regressor = nn.Sequential(
            nn.Linear(self.hidden_size2, 1)
        )
        
    def forward(self, invecs, return_hidden=False):
        hidden = self.docvecpipeline(invecs)
        output = self.regressor(hidden)
        if return_hidden:
            return output, hidden
        else:
            return output

In [45]:
DocModelNet = DocModel(768, 512, 256)

lr = 0.001
criterion = nn.MSELoss()

optim = optim.Adam(DocModelNet.parameters(), lr=lr, betas=(0.5, 0.999))

num_epochs = 10

In [14]:
country = 'rwanda'
country_abrev = 'RWA'
year = 2010

In [24]:
country_wiki = pd.read_csv(f'articles/{country}_Wiki.csv')
dhs_clusts = dhs_clusts = pd.read_csv('data/dhs_clusters.csv')
country_clusts = country_clusts = dhs_clusts[(dhs_clusts['country']== country) & (dhs_clusts['year']== year)]
# Convert string to np array because it was stored stupidly. Will fix later
country_wiki['embedding'] = country_wiki['embedding'].apply(lambda x: np.fromstring(x[1:-1], 
                                                                                    sep=' '))

In [128]:
def compute_distance(c1, c2):
    '''
    Compute approx distance between two coords given in (lat, long)
    format. 
    '''
    # approximate radius of earth in km
    R = 6373.0
    lat1 = np.radians(c1[0])
    lon1 = np.radians(c1[1])
    lat2 = np.radians(c2[0])
    lon2 = np.radians(c2[1])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c
    return distance

def test_compute_dist():
    '''
    Unit test for compute_distance function.
    
    Uses Berkeley School of Infromation and Moscone Center South in San Fransisco
    as a test case. Distance from Google maps. 
    '''
    moscone_south = (37.783957939867015, -122.40107973374062) # lat, long
    i_school = (37.871363468005065, -122.25852213941603) 
    correct_dist = 15.6 # in km
    computed_dist = compute_distance(moscone_south, i_school)
    np.testing.assert_approx_equal(computed_dist, correct_dist, significant=2)

def get_dists_to_articles(country_clust, country_wiki):
    '''
    For a given DHS cluster, get the distance to each of the wiki 
    articles and return it as a numpy array.
    '''
    dists = []
    for i in range(len(country_wiki)):
        dist = compute_distance((country_clust['lat'], country_clust['lon']),
                                (country_wiki['latitude'].iloc[i], country_wiki['longitude'].iloc[i]))
        dists.append(dist)
    return np.array(dists)

def get_closest_n(dists_to_articles, n=10):
    '''
    Get the n closest articles to a given cluster.
    Returns the indices ofthe articles and the approx distances in km.
    '''
    top_inds = np.argpartition(dists_to_articles, n)[:n]
    return top_inds, dists_to_articles[top_inds]

def get_input_tensor(country_clust,  country_wiki):
    '''
    For a given DHS cluster, get the input tensor that 
    we will feed to the model.
    '''
    embedds = []
    for i, closest_idx in enumerate(country_clust['closest_article_idxs']):
        embedds.append(country_wiki['embedding'].iloc[closest_idx])
    dists = torch.tensor(country_clust['closest_article_dists']).float()
    embedds = torch.flatten(torch.tensor(np.array(embedds))).float()
    return embedds, dists

In [95]:
country_clusts['dists_to_articles'] = \
country_clusts.progress_apply(lambda x : get_dists_to_articles(x, country_wiki), axis=1)

100%|██████████| 492/492 [00:03<00:00, 157.68it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  country_clusts['dists_to_articles'] = country_clusts.progress_apply(lambda x : get_dists_to_articles(x, country_wiki), axis=1)


In [115]:
country_clusts[['closest_article_idxs', 'closest_article_dists']] = \
country_clusts.progress_apply(lambda x: get_closest_n(np.array(x['dists_to_articles'])), 
                              axis=1, result_type='expand')

100%|██████████| 492/492 [00:00<00:00, 15842.26it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [129]:
embedds, dists = get_input_tensor(country_clusts.iloc[0], country_wiki)

In [None]:
# TODO: 
# Use the closest n to create input vectors for each cluster represented as 
# concatenations of the n closest article vectors and perhaps also the distance values
# 
# Train model using those inputs and wealth scores as outputs. 


#get_closest_n(country_clusts['dists_to_articles'].iloc[0], 10)

In [40]:
test_doc_model(torch.tensor(country_wiki['embedding'].iloc[0]).float())

tensor([0.0725], grad_fn=<AddBackward0>)

In [None]:
# Training Loop

print("Starting Training Loop...")
# For each epoch
for epoch in range(num_epochs):
    # For each batch in the dataloader
    for i, data in enumerate(dataloader, 0):

        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with all-real batch
        netD.zero_grad()
        # Format batch
        real_cpu = data[0].to(device)
        cond = torch.Tensor(infos[i]).to(device)
        b_size = real_cpu.size(0)
        label = torch.full((b_size,), real_label, dtype=torch.float, device=device)
        # Forward pass real batch through D
        output = netD(real_cpu, cond.reshape((batch_size, ninfo))).view(-1)
        # Calculate loss on all-real batch
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
        errD_real.backward()
        D_x = output.mean().item()

        ## Train with all-fake batch
        # Generate batch of latent vectors
        noise = torch.randn(b_size, nz, 1, 1, device=device)
        # Generate batch of random conditional vectors
        idx = probs.multinomial(1)
        rand_cond = infos[idx]
        rand_cond = rand_cond.reshape(rand_cond.shape[1:]).to(device)
        #rand_title = titles[idx]
        #rand_title = rand_title.reshape(rand_title.shape[1:]).to(device)
        
        # Generate fake image batch with G
        fake = netG(noise, rand_cond.reshape((batch_size, ninfo))) #, 
                   #rand_title.reshape((batch_size, ntitlevect)))
        label.fill_(fake_label)
        # Classify all fake batch with D
        output = netD(fake.detach(), rand_cond.reshape((batch_size, ninfo))).view(-1)
        #  rand_title.reshape((batch_size, ntitlevect))
        # Calculate D's loss on the all-fake batch
        
        errD_fake = criterion(output, label)
        # Calculate the gradients for this batch
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        # Add the gradients from the all-real and all-fake batches
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of all-fake batch through D
        output = netD(fake, rand_cond.reshape((batch_size, ninfo))).view(-1)
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # Calculate gradients for G
        errG.backward()
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        # Output training stats
        if i % 50 == 0:
            print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
                  % (epoch, num_epochs, i, len(dataloader),
                     errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

        # Save Losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())

        # Check how the generator is doing by saving G's output on fixed_noise
        if (iters % 50 == 0) or ((epoch == num_epochs-1) and (i == len(dataloader)-1)):
            with torch.no_grad():
                fake = netG(fixed_noise, fixed_cond.reshape((batch_size, ninfo))).detach().cpu() #,
                           #fixed_titles.reshape((batch_size, ntitlevect))).detach().cpu()
            img_list.append(vutils.make_grid(fake, padding=2, normalize=True))

        iters += 1