In [1]:
import rasterio as rio
import numpy as np
from PIL import Image
import math
# core packages
import pandas as pd
from scipy.sparse import csr_matrix

# image packages
import rasterio as rio
import cv2

# visualization packages
import matplotlib.pyplot as plt

In [2]:
def calc_bounds(file, target_coords, length = 10):
    """Read a raster and locate the pixel window of a square region.

    Parameters
    ----------
    file : str
        Path to a raster file that rasterio can open.
    target_coords : tuple
        ``(latitude, longitude)`` of the upper-left corner of the region.
    length : number, default 10
        Side length of the region, in the same units as ``target_coords``.

    Returns
    -------
    tuple
        ``(data, xmin, xmax, ymin, ymax)``:

        * ``data`` -- the whole raster, squeezed and cast to float32.
        * ``xmin`` / ``xmax`` -- columns at ``long`` and ``long + length``.
        * ``ymax`` -- row at ``lat``; ``ymin`` -- row at ``lat - length``.
          The names follow latitude, not row order, which is why callers
          slice rows as ``data[ymax:ymin]``.
    """
    lat, long = target_coords[0], target_coords[1]

    with rio.open(file) as src:
        data = np.squeeze(src.read()).astype("float32")
        # Grab the affine transform while the dataset is still open rather
        # than relying on attributes of a closed dataset handle.
        transform = src.transform

    ymin, xmin = rio.transform.rowcol(transform, long, lat - length)
    ymax, xmax = rio.transform.rowcol(transform, long + length, lat)

    return data, xmin, xmax, ymin, ymax

def get_coord_string(coords):
    """Format a (latitude, longitude) pair as a hemisphere-tagged string.

    Negative latitudes are tagged "S" and negative longitudes "W" (with the
    sign flipped); everything else is tagged "N" / "E".
    Example: ``(-10, -60)`` -> ``"10S60W"``.
    """
    def _tag(value, negative_suffix, positive_suffix):
        # Flip the sign of negative values so only the suffix carries it.
        if value < 0:
            return str(value * -1) + negative_suffix
        return str(value) + positive_suffix

    return _tag(coords[0], "S", "N") + _tag(coords[1], "W", "E")

def segment(file, target_coords, name):
    """Cut the target region out of a raster and save it as a TIFF.

    Parameters
    ----------
    file : str
        Path to the source raster (passed to ``calc_bounds``).
    target_coords : tuple
        ``(latitude, longitude)`` of the upper-left corner of the region.
    name : str
        Dataset label used as the prefix of the output file name.

    Returns
    -------
    numpy.ndarray
        The slice of the raster covering the region. The same array is also
        written to ``../data/raw/segmented/<name>_<coord string>.tif``.

    Raises
    ------
    ValueError
        If the requested region starts outside the raster (negative pixel
        index) or selects no pixels at all.
    """
    import os  # local import: only needed to create the output directory

    data, xmin, xmax, ymin, ymax = calc_bounds(file, target_coords)

    # A negative index would silently wrap around in numpy slicing and return
    # pixels from the opposite edge of the raster, so reject it explicitly.
    if min(xmin, xmax, ymin, ymax) < 0:
        raise ValueError(
            "target region starts outside the raster: rows "
            + str(ymax) + ":" + str(ymin) + ", cols " + str(xmin) + ":" + str(xmax)
        )

    # calc_bounds returns the row at `lat` as ymax and the row at
    # `lat - length` as ymin, so the row slice runs ymax -> ymin.
    seg_data = data[ymax:ymin, xmin:xmax]
    if seg_data.size == 0:
        raise ValueError(
            "target region selects no pixels: rows "
            + str(ymax) + ":" + str(ymin) + ", cols " + str(xmin) + ":" + str(xmax)
        )

    coords = get_coord_string(target_coords)
    path = "../data/raw/segmented/" + str(name) + "_" + str(coords) + ".tif"

    # Make sure the destination exists so a fresh checkout does not fail on save.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    Image.fromarray(seg_data).save(path)

    return seg_data

def grid_data(data, colname, grid_size, smaller):
    """Resample an array to a square grid and flatten it into a DataFrame.

    Parameters
    ----------
    data : array
        Input array handed to ``cv2.resize``.
    colname : str
        Name of the column that receives the resampled values.
    grid_size : int
        Number of rows and of columns in the output grid.
    smaller : bool
        If true, resize with ``cv2.INTER_LINEAR``; otherwise with
        ``cv2.INTER_AREA``.

    Returns
    -------
    pandas.DataFrame
        One row per grid cell (``grid_size ** 2`` rows) with columns ``row``,
        ``col`` and ``colname``, indexed by ``row * grid_size + col``.
    """
    method = cv2.INTER_LINEAR if smaller else cv2.INTER_AREA
    resampled = cv2.resize(data, dsize=(grid_size, grid_size), interpolation=method)

    # Row-major cell ids split into their (row, col) coordinates.
    cell_ids = np.arange(grid_size ** 2)
    rows, cols = np.divmod(cell_ids, grid_size)

    frame = pd.DataFrame({"row": rows, "col": cols}, index=rows * grid_size + cols)
    frame[colname] = resampled.flatten()

    return frame

The `process_data` function segments the input raster(s) according to user-specified parameters, returns a DataFrame with the gridded data, and also writes that DataFrame to a CSV file. See the parameters below.

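- filename: string // path to the tif file; if the dataset spans multiple years, pass the file for the first entry in `years` (that year is substituted in the path to find the other files)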

- target_coords: int tuple (latitude, longitude) // upper left corner of desired segment 

- grid_size: integer // number of rows/cols in data (sqrt of number of values in dataset)

- name: string // title of dataset, eg. chirps, pop, yield

- years: list, tuple or array (default = "") // required when the dataset spans more than one year; list every year exactly as it appears in the filename. Leave as "" for a single-file dataset

- agg_data_path: string // path to existing csv file to be merged with new data

- smaller: boolean // False by default; should be True only if the source data is coarser (less granular) than the desired grid size, i.e. it has to be upsampled

In [3]:
def process_data(filename, name, target_coords = (-10, -60), grid_size = 500, years = "", agg_data_path = "", smaller = False):
    """Segment, grid and aggregate one dataset, then write it to CSV.

    Parameters
    ----------
    filename : str
        Path to the tif file. For a multi-file dataset this is the file whose
        name contains ``years[0]``; that label is replaced by each entry of
        ``years`` to build the other paths.
    name : str
        Dataset label. Columns are named ``name`` (single file) or
        ``name_<year>`` (one per entry of ``years``).
    target_coords : tuple, default (-10, -60)
        ``(latitude, longitude)`` of the upper-left corner of the segment.
    grid_size : int, default 500
        Number of rows/columns of the output grid.
    years : sequence, default ""
        Labels of the files in a multi-file dataset (list, tuple or numpy
        array). Leave empty for a single-file dataset.
    agg_data_path : str, default ""
        Optional path to an existing CSV to merge the new columns into.
    smaller : bool, default False
        Forwarded to ``grid_data`` to select the interpolation method.

    Returns
    -------
    pandas.DataFrame
        The aggregated frame. It is also written (with index label ``id``) to
        ``../data/processed/datasets/<name>_<coord string>``. ``segment``
        additionally writes one TIFF per processed file.
    """
    # Join explicitly on the grid coordinates; a key-less merge would join on
    # every shared column, including value columns that happen to collide.
    merge_keys = ["row", "col"]

    agg_df = pd.read_csv(agg_data_path) if agg_data_path else pd.DataFrame()

    # Use len() rather than `years != ""`: comparing a numpy array with a
    # string is elementwise/ambiguous and breaks the `if` on current NumPy.
    if years is not None and len(years) > 0:
        template_label = str(years[0])
        layers = [
            (filename.replace(template_label, str(label)), str(name) + "_" + str(label))
            for label in years
        ]
    else:
        layers = [(filename, name)]

    for file, col in layers:
        df = grid_data(segment(file, target_coords, col), col, grid_size, smaller)
        if agg_df.empty:
            # DataFrame.append was removed in pandas 2.0; the first layer
            # simply becomes the aggregate.
            agg_df = df
        else:
            agg_df = pd.merge(agg_df, df, on=merge_keys)

    # Derive the file name from the dataset instead of a hard-coded
    # "pop_0N70W", so different datasets/regions no longer overwrite each other.
    out_path = "../data/processed/datasets/" + str(name) + "_" + get_coord_string(target_coords)
    agg_df.to_csv(out_path, index_label = "id")
    return agg_df

In [4]:
# --- grid and target region ---------------------------------------------
GRID_SIZE = 500

# Region currently being processed (upper-left corner of the segment).
LAT = "0N"
LONG = "70W"
TARGET_COORDS = (0, -70)
# Alternative region:
#   LAT, LONG = ("10S", "60W")
#   TARGET_COORDS = (-10, -60)

AGG_DATA_PATH = "../data/processed/aggregate.csv"

# --- input rasters ---------------------------------------------------------
# For multi-file datasets the path points at the first file and the matching
# *_years sequence lists the label of every file in the set.
chirps_path = "../data/raw/chirps/chirps-v2.0.2001.tif"
chirp_years = np.arange(2001, 2018 + 1)  # 2001..2018 inclusive

pop = "../data/raw/population/gpw_v4_population_count_rev11_2000_30_sec.tif"
pop_years = (2000, 2005, 2010, 2015)

yields = "../data/raw/yields/yield_bean.tif"
yield_years = (
    "bean",
    "carrot",
    "cassava",
    "chickpea",
    "citrus",
    "coffee",
    "groundnut",
    "maize",
    "soybean",
    "sugarcane",
    "tomato",
    "wheat",
)

cropland = "../data/raw/sa_cropland.tif"
pastures = "../data/raw/sa_pasture.tif"

# TODO: calc slope
elevation = "../data/raw/elevation/srtm_25_15.tif"
elevation_years = ("25_15", "25_16", "26_15", "26_16")

pollution_path = "../data/raw/uvai_05.tif"

In [None]:
# Example function call.
#   filename: path to the first file in the dataset
#   name:     column name of the data - becomes name_year when `years` is given
#   years:    years included, in the same format as the year in the filename
#             (i.e. 2018 vs. 18)
#   smaller:  selects the interpolation used by grid_data - True when the source
#             data is coarser than the target grid (linear interpolation),
#             False otherwise (area interpolation)
df = process_data(
    filename=chirps_path,
    name="chirps",
    target_coords=TARGET_COORDS,
    years=chirp_years,
    smaller=False,
)

In [1]:
# should be able to do pd.merge like below to put datasets together, but you can also include the path  
# to an aggregate file in the function call and the function will return an aggregate dataset with the 
# new data included 

# pd.merge(chirps_70, pop_70).to_csv("../data/processed/datasets/data_0N70W.csv", index = False)

In [14]:
# Merge the two frames on their shared columns and write the result to CSV
# without the index.
# NOTE(review): chirps_70 and pop_70 are not defined in any visible cell --
# presumably DataFrames returned by earlier process_data(...) runs for the
# 0N70W region; confirm and define them in the notebook so this cell survives
# Restart & Run All.
pd.merge(chirps_70, pop_70).to_csv("../data/processed/datasets/data_0N70W.csv", index = False)