In [123]:
import os

import fiona
import geopandas as gpd
import pandas as pd

from lib.center_geo import center_rect

In [124]:
def poly_to_lat_long_and_center(poly):
    """Return the result of ``center_rect`` for a polygon's exterior ring.

    The exterior coordinates are re-ordered from (x, y) to (lat, long)
    pairs, the final vertex is dropped, and the remaining pairs are handed
    to ``center_rect``.

    Args:
        poly: Object exposing ``exterior.coords.xy`` as a pair of
            x- and y-coordinate sequences (e.g. a shapely polygon).

    Returns:
        Whatever ``center_rect`` returns for the list of (lat, long) pairs
        (presumably a (lat, long) center tuple -- confirm against
        ``lib.center_geo``).
    """
    ring_xy = poly.exterior.coords.xy
    # Index 1 holds the y values (used as lat), index 0 the x values (long).
    corners = [(lat, lng) for lat, lng in zip(ring_xy[1], ring_xy[0])]
    # Drop the last vertex (presumably the closing point that repeats the first).
    return center_rect(corners[:-1])

In [125]:
def poly_to_lat_long(poly):
    """Return the polygon's exterior ring as a list of (lat, long) tuples.

    Args:
        poly: Object exposing ``exterior.coords.xy`` as a pair of
            x- and y-coordinate sequences (e.g. a shapely polygon).

    Returns:
        list of (y, x) tuples, one per exterior vertex, in ring order.
    """
    ring_xy = poly.exterior.coords.xy
    # Index 1 holds the y values (used as lat), index 0 the x values (long).
    return [(lat, lng) for lat, lng in zip(ring_xy[1], ring_xy[0])]

In [126]:
# Directory holding the GeoPackage inputs and the name of the NDR summary file.
gpkg_data_dir = "./data/gpkg_data/"
ndr_file_name = "ipbes_ndr_summary_oct_24_md5_da95dbdccf6eaec2b4e1b05437d97b39.gpkg"

# List the layers available inside the NDR GeoPackage.
fiona.listlayers(f"{gpkg_data_dir}{ndr_file_name}")

['grid_1_degree', 'correct_countries_on_grid_with_fid', 'Final_WQR_forviz']

In [127]:
# Read the 1-degree grid layer and keep only the cell id and its polygon.
grid_layer = gpd.read_file(gpkg_data_dir + ndr_file_name, layer='grid_1_degree')
geometry = (
    grid_layer[['GRIDCODE', 'geometry']]
    # fid is GRIDCODE - 1
    .assign(GRIDCODE=lambda frame: frame['GRIDCODE'] - 1)
    .rename(columns={'GRIDCODE': 'fid'})
)
geometry.head()

Unnamed: 0,fid,geometry
0,0,"POLYGON ((-179 89, -180 89, -180 90, -179 90, ..."
1,1,"POLYGON ((-178 89, -179 89, -179 90, -178 90, ..."
2,2,"POLYGON ((-177 89, -178 89, -178 90, -177 90, ..."
3,3,"POLYGON ((-176 89, -177 89, -177 90, -176 90, ..."
4,4,"POLYGON ((-175 89, -176 89, -176 90, -175 90, ..."


Compute the center point of each grid rectangle:

In [128]:
# Compute the center of every grid cell as a plain Python value.
centers = geometry['geometry'].apply(poly_to_lat_long_and_center)

# Store the centers in an ordinary pandas DataFrame instead of writing them back
# into the GeoDataFrame's active geometry column: that column is meant to hold
# geometry objects, and the values produced here are not geometries.
# The resulting frame has the same columns as before, in the same order.
geometry = pd.DataFrame(geometry.drop(columns='geometry')).assign(geometry=centers)
geometry.head()

Unnamed: 0,fid,geometry
0,0,"(89.50001903460208, -179.5)"
1,1,"(89.50001903460208, -178.50000000000003)"
2,2,"(89.50001903460208, -177.5)"
3,3,"(89.50001903460208, -176.5)"
4,4,"(89.50001903460208, -175.49999999999997)"


In [129]:
# Expand each value of the 'geometry' column into two columns, keeping the
# original index so the rows stay aligned with `geometry`.
center_coords = pd.DataFrame(geometry['geometry'].tolist(), index=geometry.index)
geometry[['lat', 'lng']] = center_coords

# The combined column is no longer needed once lat/lng exist.
geometry = geometry.drop(columns=['geometry'])
geometry.head()

Unnamed: 0,fid,lat,lng
0,0,89.500019,-179.5
1,1,89.500019,-178.5
2,2,89.500019,-177.5
3,3,89.500019,-176.5
4,4,89.500019,-175.5


In [130]:
# Columns of the 'Final_WQR_forviz' layer that are dropped right after loading.
ndr_dropped_columns = ['geometry', 'PN_c', 'PN_1', 'PN_3', 'PN_5', 'PNpop_c_norm']

ndr_table = gpd.read_file(gpkg_data_dir + ndr_file_name, layer='Final_WQR_forviz')
ndr_table = ndr_table.drop(columns=ndr_dropped_columns)
ndr_table.head()

Unnamed: 0,fid,NC_c,NC_1,NC_3,NC_5,UN_c,UN_1,UN_3,UN_5,pop_c,pop_1,pop_3,pop_5
0,9925,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0,,,
1,9926,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0,,,
2,9927,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0,,,
3,9928,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0,,,
4,9929,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0,,,


In [131]:
# Attach the lat/lng columns by matching on fid. This is an inner join (the
# pandas default), so rows whose fid has no match in `geometry` are dropped.
ndr_table = ndr_table.merge(right=geometry, how='inner', on='fid')
print(ndr_table.shape)
ndr_table.head()

(13215, 15)


Unnamed: 0,fid,NC_c,NC_1,NC_3,NC_5,UN_c,UN_1,UN_3,UN_5,pop_c,pop_1,pop_3,pop_5,lat,lng
0,9925,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0,,,,62.500893,25.5
1,9926,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0,,,,62.500893,26.5
2,9927,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0,,,,62.500893,27.5
3,9928,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0,,,,62.500893,28.5
4,9929,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0,,,,62.500893,29.5


Then repeat the same load-and-merge steps for the cv and poll tables:

In [132]:
cv_file_name = "CV_outputs_1degree_2019_02_11_md5_c708cc320ea288938ef7d8a8db4e9c35.gpkg"

# Columns of the 'Final_CRR_forviz' layer that are dropped right after loading.
cv_dropped_columns = ['geometry', 'PN_c', 'PN_1', 'PN_3', 'PN_5', 'PNpop_c_norm']

cv_table = gpd.read_file(gpkg_data_dir + cv_file_name, layer='Final_CRR_forviz')
cv_table = cv_table.drop(columns=cv_dropped_columns)

# Inner join on fid (the pandas default) to attach the lat/lng columns.
cv_table = cv_table.merge(right=geometry, how='inner', on='fid')
print(cv_table.shape)
cv_table.head()

(3485, 15)


Unnamed: 0,fid,NC_c,NC_1,NC_3,NC_5,UN_c,UN_1,UN_3,UN_5,pop_c,pop_1,pop_3,pop_5,lat,lng
0,9715,-1.0,-1.0,-1.0,-1.0,2.587505,2.904376,3.107431,3.260052,2.137938,2.137938,2.137938,2.137938,63.500871,175.5
1,9716,-1.0,-1.0,-1.0,-1.0,2.885258,3.238593,3.465013,3.635197,11.744734,11.744734,11.744734,11.744734,63.500871,176.5
2,9717,-1.0,-1.0,-1.0,-1.0,2.976339,3.340827,3.574395,3.89204,39.133438,39.133438,39.133438,39.133438,63.500871,177.5
3,9718,-1.0,-1.0,-1.0,-1.0,2.725824,3.059633,3.273542,3.564452,30.200581,30.200581,30.200581,30.200581,63.500871,178.5
4,9719,-1.0,-1.0,-1.0,-1.0,2.948429,3.3095,3.540878,3.855545,2.611755,2.611755,2.611755,2.611755,63.500871,179.5


In [133]:
poll_file_name = "ipbes_pollination_summary_hg_2018-12-06_17%3A36_-0800_340bfd15b50c.gpkg"

# Columns of the 'Final_POLL_forviz' layer that are dropped right after loading.
poll_dropped_columns = ['geometry', 'PN_c', 'PN_1', 'PN_3', 'PN_5', 'PNpop_c_norm']

poll_table = gpd.read_file(gpkg_data_dir + poll_file_name, layer='Final_POLL_forviz')
poll_table = poll_table.drop(columns=poll_dropped_columns)

# Inner join on fid (the pandas default) to attach the lat/lng columns.
poll_table = poll_table.merge(right=geometry, how='inner', on='fid')
print(poll_table.shape)
poll_table.head()

(8960, 15)


Unnamed: 0,fid,NC_c,NC_1,NC_3,NC_5,UN_c,UN_1,UN_3,UN_5,pop_c,pop_1,pop_3,pop_5,lat,lng
0,8845,0.333333,0.331424,0.33224,0.306977,0.0,0.002181,0.001248,0.030107,125575,,,,65.500823,25.5
1,9204,0.859045,0.855574,0.855591,0.80346,479.207931,491.686198,491.623908,649.949682,69314,78508.0,61138.0,91201.0,64.500848,24.5
2,9205,0.522512,0.49796,0.500777,0.467731,23750.55267,24415.330552,24589.665243,25675.645555,0,,,,64.500848,25.5
3,9206,0.896613,0.770584,0.772471,0.688556,630.529173,1248.400221,1264.336585,1646.128832,0,0.0,0.0,0.0,64.500848,26.5
4,9207,1.0,0.994709,0.994955,0.994255,0.0,0.002589,0.002469,0.002812,45897,,,,64.500848,27.5


# Preprocessing

In [134]:
# Replace every missing value with 0 in all three tables (applies to all columns).
cv_table = cv_table.fillna(value=0)
ndr_table = ndr_table.fillna(value=0)
poll_table = poll_table.fillna(value=0)

In [135]:
# Columns inspected by the row filters in this cell and the next one.
cols = ['pop_c', 'pop_1', 'pop_3', 'pop_5']

# True for rows where at least one of the `cols` values is non-zero.
cv_any_nonzero = cv_table[cols].ne(0).any(axis=1)

print("Number of rows before: " + str(len(cv_table)))
cv_table = cv_table.loc[cv_any_nonzero]
print("Number of rows after: " + str(len(cv_table)))

Number of rows before: 3485
Number of rows after: 1878


In [136]:
# Keep only the ndr rows where at least one of the `cols` values is non-zero.
ndr_any_nonzero = ndr_table[cols].ne(0).any(axis=1)
print("Number of rows in ndr_table before: " + str(len(ndr_table)))
ndr_table = ndr_table.loc[ndr_any_nonzero]
print("Number of rows in ndr_table after: " + str(len(ndr_table)))

print("")

# Same filter for the poll table.
poll_any_nonzero = poll_table[cols].ne(0).any(axis=1)
print("Number of rows in poll_table before: " + str(len(poll_table)))
poll_table = poll_table.loc[poll_any_nonzero]
print("Number of rows in poll_table after: " + str(len(poll_table)))

Number of rows in ndr_table before: 13215
Number of rows in ndr_table after: 11855

Number of rows in poll_table before: 8960
Number of rows in poll_table after: 6292


In [137]:
# Remove the fid join key from the three tables.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second execution no longer raises KeyError because 'fid'
# has already been removed.
cv_table = cv_table.drop(columns=['fid'], errors='ignore')
ndr_table = ndr_table.drop(columns=['fid'], errors='ignore')
poll_table = poll_table.drop(columns=['fid'], errors='ignore')

In [138]:
# Renumber the rows 0..n-1 after filtering; the old index is discarded.
cv_table = cv_table.reset_index(drop=True)
ndr_table = ndr_table.reset_index(drop=True)
poll_table = poll_table.reset_index(drop=True)

In [139]:
# Output directory for the preprocessed CSV files written below.
save_dir = "./data/preprocessed_data/updated_data3/"

In [140]:
# Make sure the output directory exists: to_csv does not create missing parent
# directories and fails if save_dir is absent (e.g. on a fresh checkout).
os.makedirs(save_dir, exist_ok=True)

# Write each preprocessed table without the row index.
cv_table.to_csv(save_dir + "cv_table_preprocessed.csv", index=False)
ndr_table.to_csv(save_dir + "ndr_table_preprocessed.csv", index=False)
poll_table.to_csv(save_dir + "poll_table_preprocessed.csv", index=False)

In [141]:
# Show one row as a sanity check; the fixed seed makes the displayed row
# reproducible across "Restart & Run All".
cv_table.sample(n=1, random_state=0)


Unnamed: 0,NC_c,NC_1,NC_3,NC_5,UN_c,UN_1,UN_3,UN_5,pop_c,pop_1,pop_3,pop_5,lat,lng
127,-1.0,-1.0,-1.0,-1.0,3.095577,3.717592,3.900182,4.047963,0.036228,0.036228,0.036228,0.036228,55.501018,-160.5
