# Create final satellite sample
This notebook uses Monica's output and Girish's inspection of empty segments to generate the final sample to share with DigitalGlobe. To generate the final sample, I:
1. Drop segments in Assam or Odisha
2. For segments which appear empty, either drop them or replace them with the full landscan box, following Girish's recommendation

In [41]:
import pandas as pd
import numpy as np
import geopandas as gpd
import os
# Project root: every relative path in this notebook resolves against it.
# Set the SATELLITE_SAMPLING_DIR environment variable to run on another machine;
# when it is unset, the original location is used.
os.chdir(os.environ.get("SATELLITE_SAMPLING_DIR", r"C:\Users\dougj\Documents\Satellite sampling"))

In [42]:
# Attribute columns carried in the shapefile that this notebook never uses
unused_cols = [
    'Name', 'descriptio', 'timestamp', 'begin', 'end', 'altitudeMo',
    'extrude', 'tessellate', 'visibility', 'drawOrder', 'icon',
]
# Load the selected segments and strip the unused columns in one step
segs = gpd.read_file(r"Final sample\140_segments\selected_segments.shp").drop(columns=unused_cols)

In [43]:
# Attach the box-level attributes (including state) from the original selection table.
# No `on=` is given, so pandas joins on every column name the two tables share.
selected_ls_boxes = pd.read_csv(r"Final sample\Selected landscan boxes, sub-boxes.csv")
segs = segs.merge(
    selected_ls_boxes,
    how='left',
)

In [44]:
# Keep only segments outside Assam and Odisha, then show the per-state counts
excluded_states = ['assam', 'odisha']
segs = segs.loc[~segs['state'].isin(excluded_states)]
segs['state'].value_counts()

bihar        20
rajasthan    20
MP           20
jharkhand    20
UP           20
Name: state, dtype: int64

In [45]:
# Load Girish's inspection results and rename the columns to the short names
# used for the segment data
girish = pd.read_csv(r"Manual\Inspection of empty segments.csv").rename(
    columns={
        'Segment ID': 'unique_id',
        'Empty Segment': 'empty_seg',
        'Segment Number': 'num_segmen',
        'If empty segment, can we survey the entire Landscan Box': 'survey_box',
    }
)

# Collapse the qualified answers into plain Yes/No:
#   'Yes, one structure'           -> 'No'  (3 segments with one structure; counted as non-empty)
#   'Yes, powergrid/thermal plant' -> 'Yes' (one segment with just a power plant; stays flagged as empty)
# NOTE(review): an earlier comment described the power-plant case as "non-empty",
# but the value assigned is 'Yes' (empty) -- confirm which was intended.
girish['empty_seg'] = girish['empty_seg'].replace({
    'Yes, one structure': 'No',
    'Yes, powergrid/thermal plant': 'Yes',
})
girish.empty_seg.value_counts()

No     68
Yes    32
Name: empty_seg, dtype: int64

# Replace empty segments with entire boxes based on Girish's recommendation

In [46]:
# Boxes Girish says can be surveyed in full: one row per unique_id
replace_boxes = girish.loc[girish.survey_box == "Yes", ['unique_id']].drop_duplicates()

# Look up the full-box geometry for those ids in the landscan box shapefile and
# park it under a separate column name so it does not clash with the segment geometry
original_boxes = gpd.read_file(r"Final sample\Selected landscan boxes, sub-boxes shape\Selected landscan boxes, sub-boxes shape.shp")
replacements = replace_boxes.merge(original_boxes, how="left").rename(columns={'geometry': 'geom_new'})

# Bring the box geometry onto the segment rows; rows without a replacement get a null geom_new
segs = segs.merge(replacements, on='unique_id', how='left')
has_box = segs['geom_new'].notna()

# Where a box is available, it takes over as the row's geometry
segs.loc[has_box, 'geometry'] = segs.geom_new

# Every segment of a replaced box now carries the same geometry: give them all
# segment number 1 and keep a single row per (unique_id, num_segmen)
segs.loc[has_box, 'num_segmen'] = 1
segs = segs.drop_duplicates(subset=['unique_id', 'num_segmen'])

# Flag the rows whose segment was swapped for a box (for later inspection),
# then discard the helper geometry column
segs['replace_w_box'] = 0
segs.loc[segs['geom_new'].notna(), 'replace_w_box'] = 1
segs = segs.drop(columns='geom_new')

In [47]:
# Tag the segments Girish recommends dropping (survey_box == "No").
# .assign() builds a new frame rather than adding a column to a .loc slice, which
# avoids pandas' SettingWithCopyWarning; drop_duplicates() ensures a repeated
# (unique_id, num_segmen) row in the inspection file cannot multiply rows of segs
# in the left merge below.
segs_to_drop = (
    girish.loc[girish.survey_box == "No", ['unique_id', 'num_segmen']]
    .drop_duplicates()
    .assign(to_drop=1)
)
# Left merge: segments that are not tagged end up with a null to_drop
segs = segs.merge(segs_to_drop, on=['unique_id', 'num_segmen'], how='left')

# before dropping the rows, save the shapefile so that I can inspect in QGIS
# and look at the original population estimates for the box they were in
segs.to_file(r"Processed\Segments w Girish recs\temp_segs.shp")
segs[segs.to_drop == 1][['new_pop', 'num_subboxes']]

Unnamed: 0,new_pop,num_subboxes
9,1501.0,4
13,360.0,1
22,95.0,1
23,26.0,1
29,146.0,1
31,500.75,4
32,1619.0,1
37,1101.0,1
39,675.0,1
41,314.0,1


In [48]:
# Remove the segments tagged for dropping (untagged rows have a null to_drop
# and are kept), then write the result as the final segment dataset
keep_rows = segs['to_drop'] != 1
segs = segs[keep_rows]
segs.to_file(r"Final sample\Final selected segments\final_segments.shp")

In [56]:
# Build and save a dataset holding just the landscan box / sub-box of each final segment.
# Box-level attributes come from the final segments; geometry comes from the box shapefile.
box_attribute_cols = ['unique_id', 'population', 'state', 'new_pop', 'num_subboxes', 'subbox']
box_ids = segs[box_attribute_cols]

# The inner merge keeps only boxes that still have a segment in the final sample;
# a box with several segments matches several times, so keep one row per unique_id
entire_boxes = (
    original_boxes
    .merge(box_ids, on='unique_id', how='inner')
    .drop_duplicates('unique_id')
)
entire_boxes.to_file(r"Final sample\Box or subbox for final segments\boxes_final.shp")