In [40]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn


In [35]:
# Absolute path of the Wikipedia data directory. The relative path is resolved
# against the working directory at the time this cell runs; later cells look up
# the metadata CSV and the doc2vec embedding folder underneath it.
WIKI_PATH = os.path.abspath('../raw/wikipedia/')

In [8]:
# Load the article metadata table (columns: id, name, category, lat, lon)
# and preview its first five rows.
# The path is built with os.path.join, matching how the embedding directory
# is located further down, instead of hand-concatenating a '/' separator.
article_meta_data = pd.read_csv(os.path.join(WIKI_PATH, 'All_Image_Coordinates_2.csv'))
print(article_meta_data.head())


   id                                          name         category  \
0   0  A" Fort and Battery Hill Redoubt-Camp Early"         building   
1   1                          Lockkeeper's" House"            house   
2   2                                       &moshik       restaurant   
3   3                                      'A'akapa  populated place   
4   4                                     'Abadilah  populated place   

         lat         lon  
0  38.789444  -77.427778  
1  40.885278  -92.196944  
2  52.376622    4.905381  
3  -8.819444 -140.130556  
4  25.438333   56.191389  


In [37]:
# validate .npy files
# produce and save list of available, properly linked .npy files assuming IDs match up
def validate_npy_files(ids, path_to_npy):
    '''
    Return the subset of article IDs that have a matching '<id>.npy' file.

    ids: array-like of article IDs (e.g. the 'id' column as a numpy array);
         any shape is accepted and is flattened in order
    path_to_npy: (str) directory expected to contain one '<id>.npy' file per article

    Returns a pandas DataFrame with a single column 'Article IDs' holding the
    IDs whose file exists, in input order. An empty input yields an empty
    DataFrame with that column (np.nditer would raise on a zero-sized array).
    Only existence of the file is checked; its contents are not loaded.
    '''
    # tolist() turns numpy scalars into plain Python values, so str() produces
    # the same file stem the IDs have in the metadata table.
    candidate_ids = np.asarray(ids).ravel().tolist()

    found_ids = [
        article_id
        for article_id in candidate_ids
        if os.path.exists(os.path.join(path_to_npy, str(article_id) + '.npy'))
    ]

    return pd.DataFrame(found_ids, columns=['Article IDs'])

# Check every metadata ID against the doc2vec embedding folder, report how many
# embedding files were found, and save the list of usable IDs next to the data.
ids = article_meta_data.loc[:, 'id'].to_numpy()
validated_ids = validate_npy_files(ids, os.path.join(WIKI_PATH, 'doc2vec_embeddings'))
print('{} available .npy files'.format(len(validated_ids)))
# Same os.path.join construction as the embedding directory above.
# The DataFrame index is still written as the first CSV column (to_csv default).
validated_ids.to_csv(os.path.join(WIKI_PATH, 'available_npy_ids.csv'))


56721 available .npy files


In [44]:
# build model (just copy Sheehan, use softmax with cross-entropy or MSE for education level percentages)
class WikiEmbRegressor(nn.Module):
    '''
    Feed-forward regressor over Wikipedia article embeddings plus one distance
    value per article. The embeddings are either averaged or flattened, the
    distances are appended, and the result is passed through a 4-layer MLP.
    '''

    def __init__(self, emb_size=300, n_embs=10, ave_embs=True, concat=False, MEL_IMR=True):
        '''
        NN model for regression of maternal education level (MEL) and infant mortality rate (IMR)
        emb_size: (int) size of input Wikipedia article embeddings
        n_embs: (int) number of articles used in a single forward run
        ave_embs: (Bool) True: average the n_embs embeddings (and distances) into one;
                  False: flatten them into a single long vector. This flag alone
                  determines the network input width, because forward() branches on it.
        concat: (Boolean) stored for reference only; it no longer influences the input
                width, which previously could disagree with what forward() produced
        MEL_IMR: (Boolean) whether to predict MEL (True, 4 raw outputs) or IMR (False, 1 output)
        '''

        super(WikiEmbRegressor, self).__init__()
        self.emb_size = emb_size
        self.n_embs = n_embs
        self.ave_embs = ave_embs
        # NOTE(review): this seeds NumPy's global RNG only. The layers below are
        # created by torch.nn, so this presumably does not make the weight
        # initialisation reproducible -- confirm whether torch seeding is wanted.
        np.random.seed(1234)
        self.concat = concat
        self.MEL_IMR = MEL_IMR

        # Input width must mirror forward(): one extra feature per distance value.
        if self.ave_embs:
            # mean embedding (emb_size) + mean distance (1)
            self.input_shape = self.emb_size + 1
        else:
            # all embeddings flattened + one distance per embedding
            self.input_shape = self.emb_size * self.n_embs + self.n_embs

        # The MEL and IMR heads share the same trunk and differ only in output width.
        n_outputs = 4 if self.MEL_IMR else 1
        self.model = nn.Sequential(
            nn.Linear(self.input_shape, 512), nn.ReLU(),
            nn.Linear(512, 256), nn.ReLU(),
            nn.Linear(256, 32), nn.ReLU(),
            nn.Linear(32, n_outputs),
        )

        self.optimizer = torch.optim.Adam(self.model.parameters())

    def forward(self, embs, dists):
        '''
        Run the MLP on a batch.
        embs: Torch tensor, shape (batch_size, n_embs, emb_size)
        dists: Torch tensor, shape (batch_size, n_embs, 1)
        Returns a tensor of shape (batch_size, 4) when MEL_IMR is True, else (batch_size, 1).
        No softmax or other activation is applied to the outputs.
        '''

        if self.ave_embs:
            # collapse the article axis: (batch, emb_size) and (batch, 1)
            embs = embs.mean(dim=1)
            dists = dists.mean(dim=1)
        else:
            # keep every article: (batch, n_embs * emb_size) and (batch, n_embs)
            embs = embs.reshape((embs.shape[0], -1))
            dists = dists.reshape((dists.shape[0], -1))

        # torch.cat is available in every PyTorch release; torch.concat is a newer alias.
        inputs = torch.cat([embs, dists], dim=-1)

        return self.model(inputs)
        
#         model = Sequential()
#         if concat:
#             model.add(Dense(512, input_shape=(input_shape,), kernel_initializer='normal', activation='sigmoid'))
#         else:
#             model.add(Dense(512, input_shape=(300,), kernel_initializer='normal', activation='sigmoid'))
#         model.add(Dense(256, kernel_initializer='normal', activation='sigmoid'))
#         model.add(Dense(32, kernel_initializer='normal', activation='sigmoid'))
#         model.add(Dense(1, kernel_initializer='normal'))

#         model.compile(loss='mean_squared_error', optimizer='adam', metrics=[coeff_determination, 'mae'])
#         print(model.summary())

# get closest N articles in available article set to clusters, take mean embedding with mean distance to start
# build train/val/test data set, save data sets in .csv files, but don't reload, check if already loaded
# training loop, model saving, checkpointing and validation/performance
# GPU integration
# determine countries to start with: English speaking

# make model diagram
# outline slides
# redo baselines with article embeddings