# Import

In [None]:
import gzip
import os
import shutil
import tempfile
import netCDF4
from netCDF4 import Dataset
import numpy as np
import pandas as pd
import math
import requests
import time
import random
from PIL import Image, ImageDraw, ImageFilter
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

# Bioclim - variables - functions - calculation

In [None]:
# Short variable codes; not referenced by any function visible in this file.
val_nam_list = ['cld', 'dtr', 'frs', 'pet', 'pre', 'tmn', 'tmp', 'tmx', 'vap', 'wet']
# Folder that get_filename() lists and open_netcdf() reads gzipped netCDF files from.
# The trailing slash matters: open_netcdf() builds paths with plain string concatenation.
fold_n = '/Users/davidschildberger/03_LeWagon_Datasets/CRU_raw_data_2021/'

In [None]:
def get_filename(feat_name):
    """Return the name of a file in ``fold_n`` whose name contains ``feat_name``.

    Entries are examined in the order ``os.listdir`` yields them and the first
    match wins. The macOS ``.DS_Store`` metadata file is ignored.

    Args:
        feat_name: Substring to look for in the file names (e.g. ``'tmp'``).

    Returns:
        The matching file name (without the folder), or ``None`` when no file
        name contains ``feat_name``.
    """
    for entry in os.listdir(fold_n):
        # Skip instead of list.remove(): remove() raised ValueError whenever
        # the folder had no .DS_Store (any non-macOS machine).
        if entry == '.DS_Store':
            continue
        if feat_name in entry:
            return entry
    return None

In [None]:
def open_netcdf(fname):
    """Decompress a gzipped netCDF file from ``fold_n`` and open it.

    The archive is unpacked into a temporary file, opened with
    ``netCDF4.Dataset`` and the temporary file is then unlinked, so nothing is
    left on disk once the returned dataset is closed.

    Args:
        fname: File name inside ``fold_n`` (concatenated as ``fold_n + fname``).

    Returns:
        The opened ``netCDF4.Dataset``.
    """
    tmp = tempfile.NamedTemporaryFile(delete=False)
    try:
        # Both handles are closed when the with-block exits, before the
        # dataset is opened by name.
        with tmp, gzip.open(fold_n + fname, 'rb') as f:
            shutil.copyfileobj(f, tmp)
        data = netCDF4.Dataset(tmp.name)
    finally:
        # Always remove the temp file, including when decompression or
        # opening fails (previously it was leaked in that case).
        # NOTE(review): unlinking a file that is still open only works on
        # POSIX systems; the original code relied on this as well.
        os.unlink(tmp.name)
    return data

## Bio 1 / Annual Mean Temperature

In [1]:
def bio_1(year):
    """BIO1: mean of the 12 monthly 'tmp' layers of ``year``.

    The month offset is ``(year - 1901) * 12``, i.e. the time axis is assumed
    to be monthly and to start in January 1901 (TODO confirm against the data).

    Args:
        year: Calendar year to evaluate.

    Returns:
        2-D array rounded to two decimals; cells whose mean exceeds 99999 are
        set to NaN. Rows are reversed along axis 0 (presumably so the first
        row is the northernmost one - confirm).
    """
    variable = 'tmp'
    dataset = open_netcdf(get_filename(variable))

    first_month = (year - 1901) * 12
    monthly = dataset.variables[variable][first_month:first_month + 12, :, :].data

    annual_mean = monthly.mean(axis=0)
    # Very large values are treated as "no data".
    annual_mean[annual_mean > 99999] = np.nan
    return np.around(np.flip(annual_mean, 0), decimals=2)

## Bio 2 / Mean Diurnal Range (Mean of monthly (max temp - min temp))

In [None]:
def bio_2(year):
    """BIO2: mean over the year of the monthly ('tmx' - 'tmn') difference.

    Args:
        year: Calendar year to evaluate (month offset ``(year - 1901) * 12``).

    Returns:
        2-D array rounded to two decimals, reversed along axis 0. Input values
        above 255 are treated as missing and become NaN.
    """
    tmn_ds = open_netcdf(get_filename('tmn'))
    tmx_ds = open_netcdf(get_filename('tmx'))
    first_month = (year - 1901) * 12
    months = slice(first_month, first_month + 12)

    coldest = tmn_ds.variables['tmn'][months, :, :].data
    warmest = tmx_ds.variables['tmx'][months, :, :].data
    # Mask missing cells before subtracting so the difference is NaN as well.
    coldest[coldest > 255] = np.nan
    warmest[warmest > 255] = np.nan

    mean_range = (warmest - coldest).mean(axis=0)
    mean_range[mean_range > 99999] = np.nan
    return np.around(np.flip(mean_range, 0), decimals=2)

## BIO 3 / Isothermality (BIO2/BIO7) (×100)

In [None]:
def bio_3(year):
    """BIO3: ratio ``bio_2(year) / bio_7(year)``.

    NOTE(review): the section heading mentions a factor of 100, but the ratio
    is returned unscaled - confirm which is intended.

    Args:
        year: Calendar year passed to ``bio_2`` and ``bio_7``.

    Returns:
        2-D array rounded to two decimals; values above 99999 (e.g. the
        infinities produced by a zero denominator) are replaced by NaN.
    """
    diurnal_range = bio_2(year)
    annual_range = bio_7(year)
    ratio = diurnal_range / annual_range
    ratio[ratio > 99999] = np.nan
    return np.around(ratio, decimals=2)

## BIO 4 / Temperature Seasonality (standard deviation ×100)

In [None]:
def bio_4(year):
    """BIO4: standard deviation of the 12 monthly 'tmp' layers of ``year``.

    NOTE(review): the section heading mentions a factor of 100, but the
    standard deviation is returned unscaled - confirm which is intended.

    Args:
        year: Calendar year to evaluate (month offset ``(year - 1901) * 12``).

    Returns:
        2-D array rounded to two decimals, reversed along axis 0; cells with
        input values above 99999 come out as NaN.
    """
    dataset = open_netcdf(get_filename('tmp'))
    first_month = (year - 1901) * 12

    monthly = dataset.variables['tmp'][first_month:first_month + 12, :, :].data
    monthly[monthly > 99999] = np.nan
    spread = monthly.std(axis=0)
    return np.around(np.flip(spread, 0), decimals=2)

## BIO 5 / Max Temperature of Warmest Month

In [None]:
def bio_5(year):
    """BIO5: per-cell maximum of the 12 monthly 'tmx' layers of ``year``.

    Args:
        year: Calendar year to evaluate (month offset ``(year - 1901) * 12``).

    Returns:
        2-D array rounded to two decimals, reversed along axis 0; cells with
        input values above 99999 come out as NaN.
    """
    dataset = open_netcdf(get_filename('tmx'))
    first_month = (year - 1901) * 12

    monthly_max = dataset.variables['tmx'][first_month:first_month + 12, :, :].data
    monthly_max[monthly_max > 99999] = np.nan
    hottest = monthly_max.max(axis=0)
    return np.around(np.flip(hottest, 0), decimals=2)

## BIO 6 / Min Temperature of Coldest Month

In [None]:
def bio_6(year):
    """BIO6: per-cell minimum of the 12 monthly 'tmn' layers of ``year``.

    Args:
        year: Calendar year to evaluate (month offset ``(year - 1901) * 12``).

    Returns:
        2-D array rounded to two decimals, reversed along axis 0; cells with
        input values above 99999 come out as NaN.
    """
    dataset = open_netcdf(get_filename('tmn'))
    first_month = (year - 1901) * 12

    monthly_min = dataset.variables['tmn'][first_month:first_month + 12, :, :].data
    monthly_min[monthly_min > 99999] = np.nan
    coldest = monthly_min.min(axis=0)
    return np.around(np.flip(coldest, 0), decimals=2)

## BIO 7 / Temperature Annual Range (BIO5-BIO6)

In [None]:
def bio_7(year):
    """BIO7: temperature annual range, ``bio_5(year) - bio_6(year)``.

    Args:
        year: Calendar year to evaluate.

    Returns:
        2-D array rounded to two decimals; values above 99999 are set to NaN.
    """
    # Use the requested year: the previous version always computed 1980,
    # which also made bio_3 wrong for every other year.
    rang_temp = bio_5(year) - bio_6(year)
    rang_temp[rang_temp > 99999] = np.nan
    return np.around(rang_temp, decimals=2)

## BIO 8 / Mean Temperature of Wettest Quarter

In [None]:
def bio_8(year):
    """BIO8: mean 'tmp' of the quarter with the largest summed 'wet' value.

    Quarters are the four fixed, non-overlapping three-month blocks of the
    year (months 0-2, 3-5, 6-8, 9-11), not rolling windows.

    Args:
        year: Calendar year to evaluate (month offset ``(year - 1901) * 12``).

    Returns:
        2-D array rounded to two decimals, reversed along axis 0; values
        above 99999 are set to NaN.
    """
    wet_ds = open_netcdf(get_filename('wet'))
    tmp_ds = open_netcdf(get_filename('tmp'))

    first_month = (year - 1901) * 12
    quarter_offsets = (0, 3, 6, 9)

    wet_monthly = wet_ds.variables['wet'][first_month:first_month + 12, :, :].data
    wet_per_quarter = np.array(
        [np.sum(wet_monthly[q:q + 3], axis=0) for q in quarter_offsets]
    )
    # Index (0..3) of the selected quarter for every grid cell.
    wettest = np.argmax(wet_per_quarter, axis=0)

    tmp_var = tmp_ds.variables['tmp']
    tmp_per_quarter = np.array(
        [np.mean(tmp_var[first_month + q:first_month + q + 3, :, :].data, axis=0)
         for q in quarter_offsets]
    )

    picked = np.choose(wettest, tmp_per_quarter)
    picked[picked > 99999] = np.nan
    return np.around(np.flip(picked, 0), decimals=2)

## BIO 9 / Mean Temperature of Driest Quarter

In [None]:
def bio_9(year):
    """BIO9: mean 'tmp' of the quarter with the smallest summed 'wet' value.

    Quarters are the four fixed three-month blocks of the year (months 0-2,
    3-5, 6-8, 9-11).

    Args:
        year: Calendar year to evaluate (month offset ``(year - 1901) * 12``).

    Returns:
        2-D array rounded to two decimals, reversed along axis 0; values
        above 99999 are set to NaN.
    """
    wet_ds = open_netcdf(get_filename('wet'))
    tmp_ds = open_netcdf(get_filename('tmp'))

    base = (year - 1901) * 12
    starts = (0, 3, 6, 9)

    wet_year = wet_ds.variables['wet'][base:base + 12, :, :].data
    quarter_wet = np.array([np.sum(wet_year[s:s + 3], axis=0) for s in starts])
    driest = np.argmin(quarter_wet, axis=0)

    tmp_var = tmp_ds.variables['tmp']
    quarter_tmp = np.array(
        [np.mean(tmp_var[base + s:base + s + 3, :, :].data, axis=0) for s in starts]
    )

    # Pick, per cell, the quarterly mean belonging to the selected quarter.
    chosen = np.choose(driest, quarter_tmp)
    chosen[chosen > 99999] = np.nan
    return np.around(np.flip(chosen, 0), decimals=2)

## BIO10 / Mean Temperature of Warmest Quarter

In [None]:
def bio_10(year):
    """BIO10: mean 'tmp' of the quarter with the largest summed 'tmx' value.

    Quarters are the four fixed three-month blocks of the year (months 0-2,
    3-5, 6-8, 9-11).

    Args:
        year: Calendar year to evaluate (month offset ``(year - 1901) * 12``).

    Returns:
        2-D array rounded to two decimals, reversed along axis 0; values
        above 99999 are set to NaN.
    """
    tmx_ds = open_netcdf(get_filename('tmx'))
    tmp_ds = open_netcdf(get_filename('tmp'))

    offset = (year - 1901) * 12
    quarter_starts = (0, 3, 6, 9)

    tmx_year = tmx_ds.variables['tmx'][offset:offset + 12, :, :].data
    tmx_by_quarter = np.array(
        [np.sum(tmx_year[q:q + 3], axis=0) for q in quarter_starts]
    )
    warmest = np.argmax(tmx_by_quarter, axis=0)

    tmp_var = tmp_ds.variables['tmp']
    tmp_by_quarter = np.array(
        [np.mean(tmp_var[offset + q:offset + q + 3, :, :].data, axis=0)
         for q in quarter_starts]
    )

    selected = np.choose(warmest, tmp_by_quarter)
    selected[selected > 99999] = np.nan
    return np.around(np.flip(selected, 0), decimals=2)

## BIO 11 / Mean Temperature of Coldest Quarter

In [None]:
def bio_11(year):
    """BIO11: mean 'tmp' of the quarter with the smallest summed 'tmn' value.

    Quarters are the four fixed three-month blocks of the year (months 0-2,
    3-5, 6-8, 9-11).

    Args:
        year: Calendar year to evaluate (month offset ``(year - 1901) * 12``).

    Returns:
        2-D array rounded to two decimals, reversed along axis 0; values
        above 99999 are set to NaN.
    """
    tmn_ds = open_netcdf(get_filename('tmn'))
    tmp_ds = open_netcdf(get_filename('tmp'))

    offset = (year - 1901) * 12
    quarter_starts = (0, 3, 6, 9)

    tmn_year = tmn_ds.variables['tmn'][offset:offset + 12, :, :].data
    tmn_by_quarter = np.array(
        [np.sum(tmn_year[q:q + 3], axis=0) for q in quarter_starts]
    )
    coldest = np.argmin(tmn_by_quarter, axis=0)

    tmp_var = tmp_ds.variables['tmp']
    tmp_by_quarter = np.array(
        [np.mean(tmp_var[offset + q:offset + q + 3, :, :].data, axis=0)
         for q in quarter_starts]
    )

    selected = np.choose(coldest, tmp_by_quarter)
    selected[selected > 99999] = np.nan
    return np.around(np.flip(selected, 0), decimals=2)

## BIO 12 / Annual Precipitation

In [None]:
def bio_12(year):
    """BIO12: sum of the 12 monthly 'pre' layers of ``year``.

    Args:
        year: Calendar year to evaluate (month offset ``(year - 1901) * 12``).

    Returns:
        2-D array rounded to two decimals, reversed along axis 0; cells with
        input values above 99999 come out as NaN.
    """
    dataset = open_netcdf(get_filename('pre'))
    first_month = (year - 1901) * 12

    monthly = dataset.variables['pre'][first_month:first_month + 12, :, :].data
    monthly[monthly > 99999] = np.nan
    annual_total = monthly.sum(axis=0)
    return np.around(np.flip(annual_total, 0), decimals=2)

## BIO 13 / Precipitation of Wettest Month

In [None]:
def bio_13(year):
    """BIO13: 'pre' value of the month with the largest 'wet' value.

    NOTE(review): the month is selected through the 'wet' variable, not as
    the maximum of 'pre' itself - confirm this is intended.

    Args:
        year: Calendar year to evaluate (month offset ``(year - 1901) * 12``).

    Returns:
        2-D array rounded to two decimals, reversed along axis 0; values
        above 99999 are set to NaN.
    """
    wet_ds = open_netcdf(get_filename('wet'))
    pre_ds = open_netcdf(get_filename('pre'))

    first_month = (year - 1901) * 12
    months = slice(first_month, first_month + 12)

    wet_monthly = wet_ds.variables['wet'][months, :, :].data
    # Month index (0..11) with the largest 'wet' value for every cell.
    wettest_month = np.argmax(wet_monthly, axis=0)

    pre_monthly = pre_ds.variables['pre'][months, :, :].data
    picked = np.choose(wettest_month, pre_monthly)
    picked[picked > 99999] = np.nan
    return np.around(np.flip(picked, 0), decimals=2)

## BIO 14 / Precipitation of Driest Month

In [None]:
def bio_14(year):
    """BIO14: 'pre' value of the month with the smallest 'wet' value.

    NOTE(review): the month is selected through the 'wet' variable, not as
    the minimum of 'pre' itself - confirm this is intended.

    Args:
        year: Calendar year to evaluate (month offset ``(year - 1901) * 12``).

    Returns:
        2-D array rounded to two decimals, reversed along axis 0; values
        above 99999 are set to NaN.
    """
    wet_ds = open_netcdf(get_filename('wet'))
    pre_ds = open_netcdf(get_filename('pre'))

    first_month = (year - 1901) * 12
    months = slice(first_month, first_month + 12)

    wet_monthly = wet_ds.variables['wet'][months, :, :].data
    # Month index (0..11) with the smallest 'wet' value for every cell.
    driest_month = np.argmin(wet_monthly, axis=0)

    pre_monthly = pre_ds.variables['pre'][months, :, :].data
    picked = np.choose(driest_month, pre_monthly)
    picked[picked > 99999] = np.nan
    return np.around(np.flip(picked, 0), decimals=2)

## BIO 15 / Precipitation Seasonality (Coefficient of Variation)

In [None]:
def bio_15(year):
    """BIO15: coefficient of variation (in percent) of the monthly 'pre' layers.

    Computed per cell as ``std(ddof=1) / mean * 100`` over the 12 months of
    ``year``.

    Args:
        year: Calendar year to evaluate (month offset ``(year - 1901) * 12``).

    Returns:
        2-D array rounded to two decimals, reversed along axis 0; cells with
        input values above 99999 come out as NaN. Cells whose mean is 0 yield
        NaN/inf from the division.
    """
    val = 'pre'
    data = open_netcdf(get_filename(val))

    year_in_month = (year-1901)*12

    d = data.variables[val][year_in_month:year_in_month+12,:,:].data
    d[d > 99999] = np.nan
    # Plain expression instead of a lambda that ignored its own argument and
    # silently closed over ``d``.
    cv = np.std(d, ddof=1, axis=0) / np.mean(d, axis=0) * 100
    res = np.flip(cv, 0)
    res = np.around(res, decimals=2)
    return res

## BIO 16 / Precipitation of Wettest Quarter

In [None]:
def bio_16(year):
    """BIO16: summed 'pre' of the quarter with the largest summed 'wet' value.

    Quarters are the four fixed three-month blocks of the year (months 0-2,
    3-5, 6-8, 9-11).

    Args:
        year: Calendar year to evaluate (month offset ``(year - 1901) * 12``).

    Returns:
        2-D array rounded to two decimals, reversed along axis 0; values
        above 99999 are set to NaN.
    """
    wet_ds = open_netcdf(get_filename('wet'))
    pre_ds = open_netcdf(get_filename('pre'))

    first_month = (year - 1901) * 12
    quarter_offsets = (0, 3, 6, 9)

    wet_monthly = wet_ds.variables['wet'][first_month:first_month + 12, :, :].data
    wet_per_quarter = np.array(
        [np.sum(wet_monthly[q:q + 3], axis=0) for q in quarter_offsets]
    )
    wettest = np.argmax(wet_per_quarter, axis=0)

    pre_var = pre_ds.variables['pre']
    pre_per_quarter = np.array(
        [np.sum(pre_var[first_month + q:first_month + q + 3, :, :].data, axis=0)
         for q in quarter_offsets]
    )

    picked = np.choose(wettest, pre_per_quarter)
    picked[picked > 99999] = np.nan
    return np.around(np.flip(picked, 0), decimals=2)

## BIO 17 / Precipitation of Driest Quarter

In [None]:
def bio_17(year):
    """BIO17: summed 'pre' of the quarter with the smallest summed 'wet' value.

    Quarters are the four fixed three-month blocks of the year (months 0-2,
    3-5, 6-8, 9-11).

    Args:
        year: Calendar year to evaluate (month offset ``(year - 1901) * 12``).

    Returns:
        2-D array rounded to two decimals, reversed along axis 0; values
        above 99999 are set to NaN.
    """
    wet_ds = open_netcdf(get_filename('wet'))
    pre_ds = open_netcdf(get_filename('pre'))

    base = (year - 1901) * 12
    starts = (0, 3, 6, 9)

    wet_year = wet_ds.variables['wet'][base:base + 12, :, :].data
    quarter_wet = np.array([np.sum(wet_year[s:s + 3], axis=0) for s in starts])
    driest = np.argmin(quarter_wet, axis=0)

    pre_var = pre_ds.variables['pre']
    quarter_pre = np.array(
        [np.sum(pre_var[base + s:base + s + 3, :, :].data, axis=0) for s in starts]
    )

    chosen = np.choose(driest, quarter_pre)
    chosen[chosen > 99999] = np.nan
    return np.around(np.flip(chosen, 0), decimals=2)

## BIO 18 / Precipitation of Warmest Quarter

In [None]:
def bio_18(year):
    """BIO18: summed 'pre' of the quarter with the largest summed 'tmp' value.

    Quarters are the four fixed three-month blocks of the year (months 0-2,
    3-5, 6-8, 9-11).

    Args:
        year: Calendar year to evaluate (month offset ``(year - 1901) * 12``).

    Returns:
        2-D array rounded to two decimals, reversed along axis 0; values
        above 99999 are set to NaN.
    """
    tmp_ds = open_netcdf(get_filename('tmp'))
    pre_ds = open_netcdf(get_filename('pre'))

    offset = (year - 1901) * 12
    quarter_starts = (0, 3, 6, 9)

    tmp_year = tmp_ds.variables['tmp'][offset:offset + 12, :, :].data
    tmp_by_quarter = np.array(
        [np.sum(tmp_year[q:q + 3], axis=0) for q in quarter_starts]
    )
    warmest = np.argmax(tmp_by_quarter, axis=0)

    pre_var = pre_ds.variables['pre']
    pre_by_quarter = np.array(
        [np.sum(pre_var[offset + q:offset + q + 3, :, :].data, axis=0)
         for q in quarter_starts]
    )

    selected = np.choose(warmest, pre_by_quarter)
    selected[selected > 99999] = np.nan
    return np.around(np.flip(selected, 0), decimals=2)

## BIO19 / Precipitation of Coldest Quarter

In [None]:
def bio_19(year):
    """BIO19: summed 'pre' of the quarter with the smallest summed 'tmn' value.

    Quarters are the four fixed three-month blocks of the year (months 0-2,
    3-5, 6-8, 9-11).

    Args:
        year: Calendar year to evaluate (month offset ``(year - 1901) * 12``).

    Returns:
        2-D array rounded to two decimals, reversed along axis 0; values
        above 99999 are set to NaN.
    """
    tmn_ds = open_netcdf(get_filename('tmn'))
    pre_ds = open_netcdf(get_filename('pre'))

    offset = (year - 1901) * 12
    quarter_starts = (0, 3, 6, 9)

    tmn_year = tmn_ds.variables['tmn'][offset:offset + 12, :, :].data
    tmn_by_quarter = np.array(
        [np.sum(tmn_year[q:q + 3], axis=0) for q in quarter_starts]
    )
    coldest = np.argmin(tmn_by_quarter, axis=0)

    pre_var = pre_ds.variables['pre']
    pre_by_quarter = np.array(
        [np.sum(pre_var[offset + q:offset + q + 3, :, :].data, axis=0)
         for q in quarter_starts]
    )

    selected = np.choose(coldest, pre_by_quarter)
    selected[selected > 99999] = np.nan
    return np.around(np.flip(selected, 0), decimals=2)

# Bioclim - create and save files for each year

In [None]:
def create_and_save_bioclim(years=range(2021, 2022)):
    """Compute bio_1 .. bio_19 for each year and write one CSV per variable.

    Files are written to ``<raw_data>/bioclim/<year>/bio_<n>.csv``.

    Args:
        years: Iterable of calendar years to process. Defaults to 2021 only,
            which is what the previous hard-coded ``range(120, 121)`` offset
            from 1901 produced.
    """
    out_root = "/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/bioclim"
    bio_funcs = (
        bio_1, bio_2, bio_3, bio_4, bio_5, bio_6, bio_7, bio_8, bio_9, bio_10,
        bio_11, bio_12, bio_13, bio_14, bio_15, bio_16, bio_17, bio_18, bio_19,
    )
    for st in years:
        year_dir = f"{out_root}/{st}"
        # Create the directory that is actually written to (the old code made
        # a cwd-relative "../raw_data/..." folder instead) and tolerate
        # re-runs for a year that already exists.
        os.makedirs(year_dir, exist_ok=True)
        for number, bio_func in enumerate(bio_funcs, start=1):
            np.savetxt(f"{year_dir}/bio_{number}.csv", bio_func(st), delimiter=",")

# Bioclim - read out all years and create big .npy file

In [None]:
def read_csv_store_npy():
    """Load every per-year bioclim CSV and store them as one ``.npy`` file.

    Reads ``bio_1.csv`` .. ``bio_19.csv`` for each year 1901-2021 and saves
    the stacked result to ``bioclim_np/Bioclim.npy``.

    NOTE(review): the functions that consume this data load
    ``bioclim_upd.npy``, not ``Bioclim.npy`` - confirm how that file is made.

    Returns:
        Array indexed as ``[year - 1901, variable - 1, row, column]``.
    """
    base = "/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/bioclim"
    all_dat = []
    for j in range(1901, 2022):
        print(j)  # progress indicator
        path = f"{base}/{j}/"
        all_dat.append(
            [np.loadtxt(path + f"bio_{i}.csv", delimiter=",") for i in range(1, 20)]
        )

    # Build the (large) array once and reuse it for both saving and returning.
    stacked = np.array(all_dat)
    with open("/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/bioclim_np/Bioclim.npy", 'wb') as f:
        np.save(f, stacked)
    return stacked

# Bioclim - Hist Clim for all existing lon/lat vals

In [None]:
def get_all_climate_lon_lat():
    """Flatten the historical bioclim cube into per-year tables of grid cells.

    For each of the 121 years, every (row, column) cell of the 360 x 720 grid
    becomes one table row: ``[year_index, row_index, col_index, b1 .. b19]``.
    All values are strings, because index labels and numbers are stacked into
    one array. Rows whose first bioclim value (column 3) contains "nan" are
    dropped.

    The result is also saved to ``world_history_bioclim_allLatLon_upd.npy``.

    NOTE(review): a later function in this file has the same name and
    replaces this one once its cell is run.

    Returns:
        ``np.array`` built from the list of per-year tables.
    """
    path = "/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/bioclim_np/"
    filen = 'bioclim_upd.npy'
    bioclim = np.load(path+filen)

    n_years, n_vars, n_lat, n_lon = 121, 19, 360, 720

    # Row/column index of every cell in row-major order (row index varies
    # slowest), the same order the former nested i/j loops produced.
    lat_idx, lon_idx = np.indices((n_lat, n_lon)).reshape(2, -1)
    lat_lbl = lat_idx.astype(str)
    lon_lbl = lon_idx.astype(str)

    li_hist_df = []
    for g in range(n_years):
        year_lbl = np.full(lat_lbl.shape, str(g))
        aa = np.stack((year_lbl, lat_lbl, lon_lbl), axis=1)
        # (var, row, col) -> (row, col, var) -> one line of 19 values per cell.
        cc = np.moveaxis(bioclim[g, :n_vars, :n_lat, :n_lon], 0, -1).reshape(n_lat * n_lon, n_vars)
        uu = np.hstack((aa, cc))
        xx = pd.DataFrame(uu)
        xx = xx[xx[3].str.contains("nan")==False]
        li_hist_df.append(xx.to_numpy())

    # The old code saved an undefined name ``d`` here and raised NameError.
    hist = np.array(li_hist_df)
    with open("/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/world_history_bioclim_allLatLon_upd.npy", 'wb') as f:
        np.save(f, hist)
    return hist

# Soilgrid - for all existing lon/lat vals

In [None]:
def get_all_climate_soil_grid_lon_lat():
    """Build a table of the 60 soil layers for every grid cell with climate data.

    Each row is ``[row_index, col_index, soil_1 .. soil_60]``. The 19 bioclim
    values of the first year are attached only to filter out cells without
    climate data (rows whose column 3 contains "nan") and are dropped again
    afterwards.

    The table is saved to ``world_soilgrid_allLatLon_withzeros_upd.npy``.

    Returns:
        The filtered ``pandas.DataFrame`` with 62 columns.
    """
    path = "/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/bioclim_np/"
    filen = 'bioclim_upd.npy'
    bioclim = np.load(path+filen)
    data = np.load("/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/soil_data.npy")

    n_bio, n_soil, n_lat, n_lon = 19, 60, 360, 720
    n_cells = n_lat * n_lon

    # Row/column index labels in row-major order (row index varies slowest).
    lat_idx, lon_idx = np.indices((n_lat, n_lon)).reshape(2, -1)
    aa = np.stack((lat_idx.astype(str), lon_idx.astype(str)), axis=1)

    # (layer, row, col) -> (row, col, layer) -> one line of values per cell.
    bio_rows = np.moveaxis(bioclim[0, :n_bio, :n_lat, :n_lon], 0, -1).reshape(n_cells, n_bio)
    # Every soil layer is read by its own index. The old loop indexed with
    # the stale bioclim counter, so all 60 columns held layer 18.
    soil_rows = np.moveaxis(data[:n_soil, :n_lat, :n_lon], 0, -1).reshape(n_cells, n_soil)
    cc = np.hstack((bio_rows, soil_rows))

    uu = np.hstack((aa, cc))
    xx = pd.DataFrame(uu)
    xx = xx[xx[3].str.contains("nan")==False]
    xx.columns = [np.arange(0,81)]
    # Drop the 19 bioclim columns (positions 2..20); they were only needed
    # for the filter above.
    xx = xx.drop(xx.columns[np.arange(2,21)],axis = 1)
    xx.columns = [np.arange(0,62)]
    with open("/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/world_soilgrid_allLatLon_withzeros_upd.npy", 'wb') as f:
        np.save(f, xx)
    return xx

# Bioclim - Hist Clim and Soil Data for all existing lon/lat vals

In [None]:
def get_all_histclim_soilgrid_lon_lat():
    """Append the soil columns to every yearly climate table and save the result.

    Loads ``world_history_bioclim_allLatLon_upd.npy`` and
    ``world_soilgrid_allLatLon_withzeros_upd.npy``, concatenates the soil
    columns (everything after the two leading index columns) to the right of
    each year's table and writes ``clim_hist_soil_overall.npy``.

    NOTE(review): rows are joined purely by position, so both inputs must
    list the same cells in the same order - confirm.
    """
    path = '/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/'
    file_c = 'world_history_bioclim_allLatLon_upd.npy'
    file_s = 'world_soilgrid_allLatLon_withzeros_upd.npy'
    # allow_pickle is required because the arrays hold Python objects; only
    # load files produced by this notebook.
    cl_data = np.load(path+file_c, allow_pickle=True)
    sg_data = np.load(path+file_s, allow_pickle=True)

    # Soil values without their two index columns; identical for every year.
    soil_features = sg_data[:, 2:]

    lit = np.array(
        [np.concatenate((year_rows, soil_features), axis=1) for year_rows in cl_data]
    )

    with open("/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/clim_hist_soil_overall.npy", 'wb') as f:
        np.save(f, lit)

# Plants - Retrieve Bioclim and Soilgrid Data for plants

In [None]:
def get_bioclim_from_plants_from_overall_file():
    """Look up the bioclim and soil features for every plant observation.

    Reads the observation CSV, keeps rows with ``year`` between 1901 and 2021,
    converts each observation's latitude/longitude to grid indices and copies
    the matching feature row from ``clim_hist_soil_overall.npy`` for that
    year. Observations without a matching grid row keep empty (NaN) features.

    The result is written to ``plants_bioclim_soil_upd.csv``.

    Returns:
        DataFrame with one row per kept observation and the columns
        ``b1`` .. ``b19`` followed by ``sg1`` .. ``sg60``.
    """
    
    df = pd.read_csv("/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/data_clean_5_continent_Animal_plants.csv", on_bad_lines='skip')
    df = df[(df['year']>=1901) & (df['year']<=2021)]
    # Renumber rows 0..n-1 so that df.loc[i, ...] below works with the loop counter.
    df.reset_index(inplace=True)
    
    path = '/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/'
    filen = 'clim_hist_soil_overall.npy'
    ov_data = np.load(path+filen, allow_pickle=True)
    
    ## column names bioclim
    b_col =['b1', 'b2', 'b3', 'b4', 'b5','b6','b7','b8','b9','b10','b11','b12','b13','b14','b15','b16','b17','b18','b19']
    
    ## column names soilgrid ('sg1' .. 'sg60')
    numb = (np.arange(1,61))
    s = ('sg')
    sg_col =[]
    for item in numb:
        sg_col.append(s+str(item))
        
    coln = b_col + sg_col
    
    # NOTE(review): columns and index are each wrapped in a list, which
    # presumably makes pandas build single-level MultiIndexes - confirm.
    df_feat = pd.DataFrame(columns=[coln], index = [np.arange(0,len(df))])
    
    for i in range(len(df)):  
        lat = df.loc[i, 'decimalLatitude']
        lon = df.loc[i, 'decimalLongitude']
        year = int(df.loc[i, 'year'])
        
        # Latitude 90 .. -90 maps linearly to row index 0 .. 360.
        # NOTE(review): the tables built elsewhere in this file only contain
        # row indices 0..359, so a latitude of -90 can never match.
        c_lat = int(round(np.interp(lat, [-90, 90], [360, 0])))  # check order of values here
        # Longitude -180 .. 180 maps linearly to column index 1 .. 719.
        # NOTE(review): the grid columns run from 0 to 719 - confirm this range.
        c_lon = int(round(np.interp(lon, [-180, 180], [1, 719])))

        # Table of that year; column 1 holds the row index and column 2 the
        # column index as strings, columns 3+ are the features.
        # NOTE(review): the whole yearly table is wrapped in a DataFrame and
        # scanned for every single observation, which is likely slow.
        temp_df = pd.DataFrame(ov_data[year-1901, :, :])
        temp_df = temp_df[(temp_df[1]==str(c_lat)) & (temp_df[2]==str(c_lon))].iloc[:,3:]
        temp_df.columns = coln
        if not temp_df.empty:
            temp_df = temp_df.set_index([[i]])
            df_feat.iloc[i,:]=temp_df
        else:
            # No grid row for this location: leave the features empty.
            continue
        
    #df.dropna(inplace=True)
    df_feat.to_csv("/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/plants_bioclim_soil_upd.csv")
    return df_feat

# Future Clim - Retrieve Bioclim and Soilgrid Data for plants

In [None]:
def get_future_clim():
    """Sample every raster in ``WorldClim_future/`` on the 0.5 degree grid.

    Each file is sampled at 360 x 720 points (latitudes -90 .. 89.5,
    longitudes -180 .. 179.5). Its first 19 bands are reshaped to
    ``(19, 360, 720)`` with the latitude axis reversed. The stack over all
    files, in sorted file-name order, is saved to
    ``WorldClim_future_np/overall_future_clim.npy``.
    """
    # rasterio is used below but is not imported at file level.
    import rasterio

    latr = np.arange(-90, 90, 0.5)
    lonr = np.arange(-180, 180, 0.5)

    # Must be assigned before it is used: assigning ``fold_n`` anywhere in
    # this function makes it local, so the former listdir(fold_n) call above
    # the assignment raised UnboundLocalError.
    fold_n = ('/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/WorldClim_future/')
    fi_n = sorted(name for name in os.listdir(fold_n) if name != '.DS_Store')

    # (x, y) sample points in row-major order, latitude varying slowest.
    coords = [(lon, lat) for lat in latr for lon in lonr]

    overall = []
    for fn in fi_n:
        print(fn)  # progress indicator
        # Context manager closes the raster once it has been sampled.
        with rasterio.open(fold_n + fn) as src:
            samples = np.array(list(src.sample(coords)))  # (n_points, n_bands)

        bands = samples.transpose(1, 0)[:19].reshape(19, 360, 720)
        # Reverse the latitude axis of every band.
        overall.append(np.flip(bands, axis=1))

    with open("/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/WorldClim_future_np/overall_future_clim.npy", 'wb') as f:
        np.save(f, np.array(overall))

In [None]:
def get_all_climate_lon_lat():
    """Flatten the future-climate cube into per-scenario tables of grid cells.

    For each of the 16 entries along the first axis of
    ``overall_future_clim.npy``, every (row, column) cell of the 360 x 720
    grid becomes one table row:
    ``[entry_index, row_index, col_index, b1 .. b19]``. All values are
    strings. Rows whose first bioclim value (column 3) contains "nan" are
    dropped.

    The result is also saved to ``overall_future_clim_allLatLon_upd.npy``.

    NOTE(review): this definition reuses the name of the historical-climate
    function defined earlier in the file and replaces it once it is run.

    Returns:
        ``np.array`` built from the list of per-entry tables.
    """
    data_fc = np.load("/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/WorldClim_future_np/overall_future_clim.npy")

    n_sets, n_vars, n_lat, n_lon = 16, 19, 360, 720

    # Row/column index of every cell in row-major order (row index varies
    # slowest), the same order the former nested i/j loops produced.
    lat_idx, lon_idx = np.indices((n_lat, n_lon)).reshape(2, -1)
    lat_lbl = lat_idx.astype(str)
    lon_lbl = lon_idx.astype(str)

    li_hist_df = []
    for g in range(n_sets):
        set_lbl = np.full(lat_lbl.shape, str(g))
        aa = np.stack((set_lbl, lat_lbl, lon_lbl), axis=1)
        # (var, row, col) -> (row, col, var) -> one line of 19 values per cell.
        cc = np.moveaxis(data_fc[g, :n_vars, :n_lat, :n_lon], 0, -1).reshape(n_lat * n_lon, n_vars)
        uu = np.hstack((aa, cc))
        xx = pd.DataFrame(uu)
        xx = xx[xx[3].str.contains("nan")==False]
        li_hist_df.append(xx.to_numpy())

    # The old code saved an undefined name ``d`` here and raised NameError.
    future = np.array(li_hist_df)
    with open("/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/overall_future_clim_allLatLon_upd.npy", 'wb') as f:
        np.save(f, future)
    return future

In [None]:
def merge_future_clim_with_soil():
    """Join every future-climate table with the soil table on the grid indices.

    Both inputs are converted to float, their two grid-index columns to int,
    and each climate table is left-merged with the soil table on those two
    columns. Rows containing NaN after the merge are dropped.

    The result is saved to ``future_clim_soilgr.npy``.

    Returns:
        ``np.array`` built from the list of merged tables (index columns,
        ``b1`` .. ``b19``, then ``sg1`` .. ``sg60``).
    """
    path = '/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/'
    file_cl = 'overall_future_clim_allLatLon_upd.npy'
    file_s = 'world_soilgrid_allLatLon_withzeros_upd.npy'
    # allow_pickle is required because the arrays hold Python objects; only
    # load files produced by this notebook.
    cl_fut = np.load(path+file_cl, allow_pickle=True)
    sg_data = np.load(path+file_s, allow_pickle=True)

    # Column layouts are the same for every table, so build them once.
    key_cols = [0, 1]
    sg_col = [f"sg{n}" for n in range(1, 61)]
    b_col = [f"b{n}" for n in range(1, 20)]

    sdf = pd.DataFrame(sg_data).astype(float)
    sdf.columns = key_cols + sg_col
    sdf = sdf.astype({0: 'int', 1: 'int'})

    li_df = []
    for table in cl_fut:
        # Column 0 is the table's own index along the first axis; it is not
        # part of the merge key.
        cldf = pd.DataFrame(table).astype(float).drop(columns=0)
        cldf.columns = key_cols + b_col
        cldf = cldf.astype({0: 'int', 1: 'int'})

        new_df = pd.merge(cldf, sdf, how='left', left_on=key_cols, right_on=key_cols)
        new_df = new_df.dropna()
        li_df.append(new_df.to_numpy())

    # Build the array once and reuse it for both saving and returning.
    merged = np.array(li_df)
    with open("/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/future_clim_soilgr.npy", 'wb') as f:
        np.save(f, merged)
    return merged

# Image scraping

In [None]:
def get_urls_to_scrap():
    """Join the plant records with their media URLs.

    Reads ``multimedia.txt`` (tab separated, indexed by ``gbifID``), keeps its
    ``identifier`` column and inner-joins it on the index with the plant table
    from ``data_inkl_bioclim_grs.csv``.

    Returns:
        The merged DataFrame, indexed by ``gbifID``, including the
        ``identifier`` column.
    """
    ## get plant - multimedia data
    df_m = pd.read_csv('/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/multimedia.txt', sep='\t', index_col='gbifID')
    df_m = df_m[['identifier']]
    ## get plant - info
    df_i = pd.read_csv("/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/data_inkl_bioclim_grs.csv")
    # The old code called set_index on an undefined name ``df`` (NameError).
    df_i = df_i.set_index('gbifID')
    media_df = df_i.merge(df_m, left_index=True, right_index=True)
    # Return the result; previously it was built and then discarded.
    return media_df

In [None]:
def img_scraping():
    """Download up to 10000 not-yet-processed images into the ``images`` folder.

    Expects a module-level DataFrame ``media_df`` indexed by gbifID with an
    ``identifier`` column holding the image URL. IDs that already have a file
    in the ``thumbnails`` folder are skipped. The remaining IDs are shuffled
    and fetched one by one, with a random 0.5-1 s pause after each successful
    download. Failed downloads are reported and skipped.
    """
    ## foldername for the thumbnails
    fold_n_thumb = '/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/thumbnails/'

    ## IDs that already have a thumbnail (file name without extension)
    done_ids = {os.path.splitext(filename)[0] for filename in os.listdir(fold_n_thumb)}

    media_df_nodupl = media_df[~media_df.index.duplicated(keep='first')]
    # Compare as strings: the index holds numbers while file stems are text,
    # so the former set difference never excluded anything.
    diff_li = [gbif_id for gbif_id in media_df_nodupl.index if str(gbif_id) not in done_ids]

    ## shuffle the list - this might be necessary since requests at the same api in a row might deny access
    random.shuffle(diff_li)

    ## request and store images in "images" folder
    # Slicing stops at the end of the list when fewer than 10000 IDs remain.
    for gbif_id in diff_li[:10000]:
        try:
            url = media_df_nodupl.loc[int(gbif_id), 'identifier']
            target = f"/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/images/{int(gbif_id)}.jpg"
            # timeout: do not hang forever on an unresponsive host
            with requests.get(url, stream=True, timeout=30) as r:
                # do not store HTTP error pages as .jpg files
                r.raise_for_status()
                r.raw.decode_content = True
                with open(target, 'wb') as f:
                    shutil.copyfileobj(r.raw, f)
        except (requests.RequestException, OSError, KeyError, ValueError) as err:
            # best effort: report the failure and move on to the next image
            print(f"skipping {gbif_id}: {err}")
            continue

        time.sleep(random.uniform(0.5, 1))

In [None]:
def img_croping_and_scaling():
    """Create a 300 x 300 thumbnail for every file in the ``images`` folder.

    Each image is cropped to its largest centred square, resized to
    300 x 300 and saved under the same file name in the ``thumbnails``
    folder. Files that cannot be read or written are reported and skipped.
    """
    ## get images from images folder
    fold_n_im = '/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/images/'
    # Filter instead of list.remove(): remove() raised ValueError whenever
    # the folder had no .DS_Store.
    fi_n = [name for name in os.listdir(fold_n_im) if name != '.DS_Store']

    ## crop and scale images to thumbnails folder
    for name in fi_n:
        try:
            # Context manager closes the source file once the thumbnail is saved.
            with Image.open(f"/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/images/{name}") as image:
                thumb = crop_max_square(image).resize((300, 300))
                thumb.save(f"/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/thumbnails/{name}")
        except (OSError, ValueError, KeyError) as err:
            # best effort: unreadable or unsupported files are skipped
            print(f"skipping {name}: {err}")
            continue

In [None]:
def crop_max_square(pil_img):
    """Return the largest centred square crop of ``pil_img``.

    The side length is the smaller of the two values in ``pil_img.size``.
    """
    side = min(pil_img.size)
    return crop_center(pil_img, side, side)

In [None]:
def crop_center(pil_img, crop_width, crop_height):
    """Crop a ``crop_width`` x ``crop_height`` box centred in ``pil_img``.

    Args:
        pil_img: Object exposing ``size`` as ``(width, height)`` and a
            ``crop(box)`` method.
        crop_width: Width of the box.
        crop_height: Height of the box.

    Returns:
        Whatever ``pil_img.crop`` returns for the centred
        ``(left, upper, right, lower)`` box.
    """
    img_width, img_height = pil_img.size
    left = (img_width - crop_width) // 2
    upper = (img_height - crop_height) // 2
    right = (img_width + crop_width) // 2
    lower = (img_height + crop_height) // 2
    return pil_img.crop((left, upper, right, lower))