# 2b. Satellite Featurizations Part 2 -- Feature Aggregation (except for India)
Run this notebook after the sampled tile files produced by the previous notebook have been featurized via siml.berkeley.edu.

In [48]:
import os
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import time
warnings.filterwarnings('ignore')
from importlib import reload

In [49]:
# Root folders shared by every region configuration below:
#   FEATURES_FOLDER -- prefix of the aggregated-feature output files
#   TILES_FOLDER    -- prefix of the featurized tiles and sample files
FEATURES_FOLDER = '/data/mosaiks/replication/features/'
TILES_FOLDER = '/data/mosaiks/sampled_tiles/'

In [50]:
# RUN THIS CELL FOR US PUMAS
# Region identifier column(s) the features are grouped by.
SHAPEFILE_IDS = ['Id']
FEATURES_INFOLDER = f'{TILES_FOLDER}us/featurized/'
# NOTE: the US sample file is read from the replication folder rather than
# from TILES_FOLDER + 'us/sampled_tiles.csv'.
SAMPLE_FNAME = '/data/mosaiks/replication/sampled_tiles/us/sampled_tiles.csv'
OUT_FNAME = f'{FEATURES_FOLDER}mosaiks_features_by_puma_us.csv'

In [25]:
# RUN THIS CELL FOR MEXICO MUNICIPALITIES
# Region identifier column(s) the features are grouped by.
SHAPEFILE_IDS = ['municipality']
FEATURES_INFOLDER = f'{TILES_FOLDER}mexico/featurized/'
SAMPLE_FNAME = f'{TILES_FOLDER}mexico/sampled_tiles.csv'
OUT_FNAME = f'{FEATURES_FOLDER}mosaiks_features_by_municipality_mexico.csv'

In [77]:
# RUN THIS CELL FOR INDIA
# TODO(Esther): add the India configuration (this notebook currently covers every region except India)

In [45]:
# RUN THIS CELL FOR DHS CLUSTERS
# Change dhs_country to switch survey country; every path below derives from it.
dhs_country = 'peru'
# Region identifier column(s) the features are grouped by.
SHAPEFILE_IDS = ['cluster']
FEATURES_INFOLDER = f'{TILES_FOLDER}dhs/{dhs_country}/featurized/'
SAMPLE_FNAME = f'{TILES_FOLDER}dhs/{dhs_country}/sampled_tiles.csv'
OUT_FNAME = f'{FEATURES_FOLDER}dhs/mosaiks_features_by_cluster_{dhs_country}.csv'

In [46]:
# Read raw mosaiks features: every .csv file in FEATURES_INFOLDER is loaded
# and stacked into a single table.
# The listing is sorted because os.listdir returns files in arbitrary order
# and drop_duplicates below keeps the first occurrence of each coordinate;
# sorting makes the surviving row reproducible across runs and machines.
feature_files = sorted(
    fname for fname in os.listdir(FEATURES_INFOLDER) if fname.endswith('.csv')
)
if not feature_files:
    # pd.concat would raise an opaque "No objects to concatenate" ValueError;
    # raise the same exception type with a message that names the folder.
    raise ValueError('No .csv feature files found in ' + FEATURES_INFOLDER)
mosaiks = pd.concat(
    [pd.read_csv(os.path.join(FEATURES_INFOLDER, fname)) for fname in feature_files]
).drop_duplicates(subset=['Lat', 'Lon'])

# Rename the coordinate columns by name and number every remaining column
# Feature0, Feature1, ... in its existing order. Renaming by name (instead of
# assigning labels by position) avoids silently mislabelling data if 'Lat' and
# 'Lon' are not the first two columns of the featurized files.
coord_names = {'Lat': 'Latitude', 'Lon': 'Longitude'}
raw_feature_cols = [c for c in mosaiks.columns if c not in coord_names]
new_names = dict(coord_names)
new_names.update({c: 'Feature' + str(i) for i, c in enumerate(raw_feature_cols)})
mosaiks = mosaiks.rename(columns=new_names)

# Merge mosaiks features to original sample file
# NOTE(review): the inner join matches on exact floating-point equality of
# Latitude/Longitude, so tiles whose coordinates differ in any digit between
# the two files are dropped without warning -- confirm the row count.
sample = pd.read_csv(SAMPLE_FNAME)
mosaiks = mosaiks.merge(sample, how='inner', on=['Latitude', 'Longitude'])

In [47]:
# Take weighted average of MOSAIKs features in each region:
#   sum(feature * weight) / sum(weight), grouped by SHAPEFILE_IDS.
feature_cols = [c for c in mosaiks.columns if 'Feature' in c]

# Work on a copy so `mosaiks` itself is left untouched. Multiplying the
# feature columns of `mosaiks` in place would weight them a second time
# whenever this cell is re-run without re-running the cell that builds it.
weighted = mosaiks.copy()
# One vectorized row-wise multiplication instead of a Python loop per column.
weighted[feature_cols] = weighted[feature_cols].multiply(weighted['weight'], axis=0)

# Summing every column per region yields both the weighted feature sums and
# the total weight needed as the denominator.
average_features = weighted.groupby(SHAPEFILE_IDS, as_index=False).sum()
# NOTE: a region whose weights sum to zero ends up with NaN/inf features.
average_features[feature_cols] = average_features[feature_cols].div(
    average_features['weight'], axis=0
)
# The summed weight and coordinates have no meaning per region; drop them.
average_features = average_features.drop(['weight', 'Latitude', 'Longitude'], axis=1)

# Create the output folder if needed (e.g. the 'dhs/' subfolder) so that the
# write does not fail after the aggregation has already been computed.
out_dir = os.path.dirname(OUT_FNAME)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
average_features.to_csv(OUT_FNAME, index=False)