In [1]:
import rasterio as rio
import numpy as np
from PIL import Image
import math
# core packages
import pandas as pd
from scipy.sparse import csr_matrix

# image packages
import rasterio as rio
import cv2

# visualization packages
import matplotlib.pyplot as plt

In [2]:
def calc_bounds(file, target_coords, length = 10): 
    with rio.open(file) as src:
        data = src.read()
        data = np.squeeze(data).astype("float32")

    lat = target_coords[0]
    long = target_coords[1]
   
    lat_index = np.arange(0, data.shape[0])
    long_index = np.arange(0, data.shape[1])

    A = src.transform

    ymin, xmin = rio.transform.rowcol(A, long, lat - length)
    ymax, xmax = rio.transform.rowcol(A, long + length, lat)

    return data, xmin, xmax, ymin, ymax

def get_coord_string(coords):
    if coords[0] < 0:
        lat = str(coords[0]*-1) + "S" 
    else:
        lat = str(coords[0]) + "N"
    if coords[1] < 0:
        long = str(coords[1]*-1) + "W"
    else:
        long = str(coords[1]) + "E"
    coord_string = lat + long
    
    return coord_string

def segment(file, target_coords, name):
    data, xmin, xmax, ymin, ymax = calc_bounds(file, target_coords)
    print(data, xmin, xmax, ymin, ymax)
   
    seg_data = data[ymax:ymin, xmin:xmax]
    seg_img = Image.fromarray(seg_data)
    
    coords = get_coord_string(target_coords)
    
    path = "../data/raw/segmented/" + str(name) + "_" + str(coords) + ".tif"
    
    try:
        seg_img.save(path)
    except:
        print("Segment was too large to save")
    finally:
        return seg_data

def split_values(segment, default_vals, cumulative_flg):
    res = dict()
    for val in np.unique(segment):
        if val in default_vals:
            continue
        if cumulative_flg:
            res[val] = (csr_matrix((segment <= val).astype(int)))
        else:
            res[val] = (csr_matrix((segment == val).astype(int)))
    return res

def grid_data(data, colname, grid_size, smaller):
        
    if smaller:
        grid = cv2.resize(data, dsize=(grid_size, grid_size), interpolation = cv2.INTER_LINEAR)
    else:
        grid = cv2.resize(data, dsize=(grid_size, grid_size), interpolation = cv2.INTER_AREA)

    row = np.arange(grid_size ** 2) // grid_size
    col = np.arange(grid_size ** 2) % grid_size
    
    df = pd.DataFrame(index = row * grid_size + col)
    df["row"] = row
    df["col"] = col
    
    df[colname] = grid.flatten()
    
    return df

the process_data function segments data passed in according to user-specified parameters and returns a dataframe or csv file with the desired data. see parameters below.

- filename: string // path to tif file (oldest data if multiple years of data) 

- target_coords: int tuple (latitude, longtitude) // upper left corner of desired segment 

- grid_size: integer // number of rows/cols in data (sqrt of number of values in dataset)

- name: string // title of dataset, eg. chirps, pop, yield

- years: string (default = "") // must input a list of years of data contained in the set if there's more than one year of data

- agg_data_path: string // path to existing csv file to be merged with new data

- split_vals_flg: boolean // true if we want to split data based on array values (ex. 2 = 2002)

- default_vals: list // contains values that should be ignored when splitting values

- cumulative flag: // used to sum all values less than equal to a value when splitting

In [3]:
def process_data(filename, name, target_coords = (-10, -60), grid_size = 500, years = "", agg_data_path = "", split_vals_flg = False, default_vals = [], cumulative_flg = False):
    if agg_data_path == "":
        agg_df = pd.DataFrame()
    else:
        agg_df = pd.read_csv(agg_data_path)
    
    if years != "":
        year = years[0]

        for i in range(len(years)):
            file = filename.replace(str(year), str(years[i]))
            col = str(name) + "_" + str(years[i])
            data_seg = segment(file, target_coords, col)
            if (data_seg.shape[0] < grid_size):
                df = grid_data(data_seg, name, grid_size, True)
            else:
                df = grid_data(data_seg, name, grid_size, False)
            if agg_df.empty:
                agg_df = agg_df.append(df)
            else:
                agg_df = pd.merge(agg_df, df)
    else:
        data_seg = segment(filename, target_coords, name)
        # grid for split values data
        if split_vals_flg:
            split_values_dict = split_values(data_seg, default_vals, cumulative_flg)
            for split_val, data in split_values_dict.items():
                if (data.shape[0] < grid_size):
                    df = grid_data(np.array(data.todense()).astype("float32"), "{}_{}".format(name, split_val), grid_size, True)
                else:
                    df = grid_data(np.array(data.todense()).astype("float32"), "{}_{}".format(name, split_val), grid_size, False)
                if agg_df.empty:
                    agg_df = agg_df.append(df)
                else:
                    agg_df = pd.merge(agg_df, df)
        # grid for non split values data
        else:
            if (data_seg.shape[0] < grid_size):
                df = grid_data(data_seg, name, grid_size, True)
            else:
                df = grid_data(data_seg, name, grid_size, False)
            if agg_df.empty:
                agg_df = agg_df.append(df)
            else:
                agg_df = pd.merge(agg_df, df)
    #agg_df.to_csv("../data/processed/datasets/" + name + "_data", index_label = "id")
    agg_df.to_csv("../data/processed/datasets/amazon.csv", index_label = "id")
    return agg_df

In [4]:
GRID_SIZE = 500
# LAT, LONG = ("10S", "60W")
# TARGET_COORDS = (-10, -60)
LAT, LONG = ("0N", "70W")
TARGET_COORDS = (0, -70)
AGG_DATA_PATH = "../data/processed/aggregate.csv"


chirps_path = "../data/raw/chirps/chirps-v2.0.2001.tif"
chirp_years = np.arange(2001, 2019)

pop = "../data/raw/population/gpw_v4_population_count_rev11_2000_30_sec.tif"
pop_years = (2000, 2005, 2010, 2015)

yields = "../data/raw/yields/yield_bean.tif"
yield_years = ("bean","carrot","cassava","chickpea","citrus","coffee","groundnut","maize","soybean","sugarcane","tomato","wheat")

cropland = "../data/raw/sa_cropland.tif"
pastures = "../data/raw/sa_pasture.tif"
#calc slope
elevation = "../data/raw/elevation/srtm_25_15.tif"
elevation_years = ("25_15", "25_16", "26_15", "26_16")

pollution_path = "../data/raw/uvai_05.tif"

In [5]:
# yields
bean_path = "../data/raw/yields/yield_bean.tif"
carrot_path = "../data/raw/yields/yield_carrot.tif"
cassava_path = "../data/raw/yields/yield_cassava.tif"
chickpea_path = "../data/raw/yields/yield_chickpea.tif"
citrus_path = "../data/raw/yields/yield_citrus.tif"
coffee_path = "../data/raw/yields/yield_coffee.tif"
groundnut_path = "../data/raw/yields/yield_groundnut.tif"
maize_path = "../data/raw/yields/yield_maize.tif"
soybean_path = "../data/raw/yields/yield_soybean.tif"
sugarcane_path = "../data/raw/yields/yield_sugarcane.tif"
tomato_path = "../data/raw/yields/yield_tomato.tif"
wheat_path = "../data/raw/yields/yield_wheat.tif"

# amazon other
travel_time_path = "../data/raw/amazon/travel_time.tif"
dem_path = "../data/raw/amazon/dem.tif"
tree_cover_path = "../data/raw/amazon/tree_cover.tif"
datamask_path = "../data/raw/amazon/datamask.tif"
cropland_path = "../data/raw/amazon/cropland.tif"
pasture_path = "../data/raw/amazon/pasture.tif"

# amazon year
wdpa_path = "../data/raw/amazon/wdpa.tif"
loss_year_path = "../data/raw/amazon/loss_year.tif"

TARGET_COORDS = (0, -70)
process_data(filename = pasture_path, 
                  name = "pasture", 
                  target_coords = TARGET_COORDS,
                  agg_data_path = "../data/processed/datasets/amazon.csv",
                  split_vals_flg = False,
                  default_vals = [],
                  cumulative_flg = False)

[[-3.4028235e+38 -3.4028235e+38 -3.4028235e+38 ... -3.4028235e+38
  -3.4028235e+38 -3.4028235e+38]
 [-3.4028235e+38 -3.4028235e+38 -3.4028235e+38 ... -3.4028235e+38
  -3.4028235e+38 -3.4028235e+38]
 [-3.4028235e+38 -3.4028235e+38 -3.4028235e+38 ... -3.4028235e+38
  -3.4028235e+38 -3.4028235e+38]
 ...
 [-3.4028235e+38 -3.4028235e+38 -3.4028235e+38 ... -3.4028235e+38
  -3.4028235e+38 -3.4028235e+38]
 [-3.4028235e+38 -3.4028235e+38 -3.4028235e+38 ... -3.4028235e+38
  -3.4028235e+38 -3.4028235e+38]
 [-3.4028235e+38 -3.4028235e+38 -3.4028235e+38 ... -3.4028235e+38
  -3.4028235e+38 -3.4028235e+38]] 144 264 271 151


Unnamed: 0,id,row,col,bean,carrot,cassava,chickpea,citrus,coffee,groundnut,...,defor_2011,defor_2012,defor_2013,defor_2014,defor_2015,defor_2016,defor_2017,defor_2018,cropland,pasture
0,0,0,112,0.00,0.00,342.50,0,0.00,363.00,0.00,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
1,1,0,204,0.00,0.00,248.76,0,0.00,378.18,0.00,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
2,2,0,342,311.50,0.00,433.00,0,0.00,482.40,251.30,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000156,0.000000,0.0,0.0
3,3,0,489,512.06,632.88,422.54,0,0.00,451.20,296.38,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
4,4,0,496,518.40,646.38,428.06,0,0.00,459.02,302.04,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,2628,498,218,814.64,1390.46,868.82,0,0.00,978.58,506.82,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
1995,2629,498,274,321.98,508.76,350.02,0,602.44,382.94,213.30,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.002969,0.005625,0.0,0.0
1996,2630,498,454,533.62,919.08,431.46,0,1245.80,755.30,283.20,...,0.000000,0.000313,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
1997,2631,499,385,269.02,482.64,105.82,0,545.72,350.80,61.50,...,0.001875,0.002656,0.013281,0.005313,0.000000,0.002188,0.016250,0.000000,0.0,0.0


In [1]:
# should be able to do pd.merge like below to put datasets together, but you can also include the path  
# to an aggregate file in the function call and the function will return an aggregate dataset with the 
# new data included 

# pd.merge(chirps_70, pop_70).to_csv("../data/processed/datasets/data_0N70W.csv", index = False)

In [14]:
pd.merge(chirps_70, pop_70).to_csv("../data/processed/datasets/data_0N70W.csv", index = False)