# In [2]:
# import libraries
import os
import pandas as pd
import pathlib
import glob
import pickle

# seed passed as random_state when the merged data is resampled below
random_seed = 10

# project root, and the folder that is searched for the .dta input files
working_directory = r"/home/dr/phia data"
data_folder = r"/home/dr/phia data/data_folder"

# relative paths used later in the script are resolved against the project root
os.chdir(working_directory)

# function for finding and removing element in a list
def find_and_remove_element(element, listname):
    """Remove the first occurrence of ``element`` from ``listname`` in place.

    Args:
        element: The value to remove.
        listname: The list to modify.

    Returns:
        None. When ``element`` is not present the list is left unchanged and
        a message is printed instead of raising.
    """
    # Attempt the removal directly rather than building a set for a membership
    # test first: one pass instead of two, and it also works for lists that
    # hold unhashable items.
    try:
        listname.remove(element)
    except ValueError:
        print('element not in the list')
        
# import the HIV test results (adultbio files) for all countries from the data folder
def _read_stata_dropping_bad_columns(path):
    """Read a Stata file, dropping columns that make ``pd.read_stata`` fail.

    The file is read with its full column list; whenever pandas raises a
    ``ValueError``, the column named in the error message is removed from
    the list and the read is retried.

    Args:
        path: Path of the ``.dta`` file, as a string.

    Returns:
        The DataFrame produced by the first successful read.

    Raises:
        ValueError: If the error message does not name a column that is
            still in the list, since retrying could then never succeed.
    """
    # First pass without categorical conversion, only to learn the column names.
    col_names = pd.read_stata(path, convert_categoricals=False).columns.tolist()
    while True:
        try:
            # Keep the successful read instead of reading the file once more.
            return pd.read_stata(path, columns=col_names)
        except ValueError as e:
            # The offending column is taken from the fifth word of the message.
            # NOTE(review): this relies on pandas wording the error as
            # "Value labels for column <name> ..." - confirm for the
            # installed pandas version.
            words = str(e).split()
            var = words[4] if len(words) > 4 else None
            if var not in col_names:
                # Nothing can be removed, so the same error would repeat
                # forever; surface it instead of looping.
                raise
            col_names.remove(var)


def _load_country_files(pattern):
    """Load every file under ``data_folder`` matching ``pattern``.

    Args:
        pattern: Glob pattern relative to ``data_folder``.

    Returns:
        A dict keyed by the first two characters of each file name, holding
        that file's DataFrame with an added ``country`` column set to the
        same two characters.
    """
    frames = {}
    for file in pathlib.Path(data_folder).glob(pattern):
        country = file.name[0:2]
        frame = _read_stata_dropping_bad_columns(os.fspath(file))
        frame['country'] = country
        frames[country] = frame
    return frames


# importing HIV test results for all countries from the data folder
BIO_data_ = _load_country_files('**/*adultbio*.dta')

# importing individual data for all countries from the data folder
IND_data_ = _load_country_files('**/*adultind*.dta')
 
# merge each country's individual records with its HIV test results on personid and country
# columns that identify one person within one country in both datasets
merge_keys = ['personid', 'country']

INDBIO = {}
for country, bio_frame in BIO_data_.items():
    # index the test results by the keys so they can be matched via right_index
    bio_by_person = bio_frame.set_index(merge_keys)
    # inner join: keep only individuals that also have a test-result row
    INDBIO[country] = IND_data_[country].merge(
        bio_by_person,
        how='inner',
        left_on=merge_keys,
        right_index=True,
    )
        
        

# Resample every country's merged frame with replacement, drawing as many rows
# as the frame has (frac=1) and weighting each row by its 'btwt0' value.
# The same seed is used for every country.
INDBIO_resampled = {
    country: INDBIO[country].sample(
        frac=1,
        weights='btwt0',
        replace=True,
        random_state=random_seed,
    )
    for country in BIO_data_
}
           
        
# concat countries datasets; join='inner' keeps only the columns present in
# every country's frame, and the row index is renumbered from zero
INDBIO_concat = pd.concat(INDBIO_resampled.values(), join='inner', ignore_index=True)

# Build the output path from its parts instead of hard-coding a backslash:
# "\I" is an invalid escape sequence, and on POSIX systems a backslash is an
# ordinary file-name character, so the file would not land inside the
# Transformed_data folder.
output_path = pathlib.Path("Transformed_data") / "INDBIO_step1.1.pkl"
# make sure the target folder exists before writing into it
output_path.parent.mkdir(parents=True, exist_ok=True)

# the context manager closes the file even if pickling fails
with open(output_path, 'wb') as f:
    pickle.dump(INDBIO_concat, f)
