# Import

In [None]:
import gzip
import os
import shutil
import tempfile
import netCDF4
from netCDF4 import Dataset
import numpy as np
import pandas as pd
import math
import requests
import time
import random
from PIL import Image, ImageDraw, ImageFilter
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

In [None]:
# Short variable codes used to pick a raw data file by name
# (presumably the CRU variable abbreviations - confirm against the folder contents).
val_nam_list = [
    'cld', 'dtr', 'frs', 'pet', 'pre',
    'tmn', 'tmp', 'tmx', 'vap', 'wet',
]
# Folder containing the gzipped raw data files; machine-specific absolute path.
fold_n = '/Users/davidschildberger/03_LeWagon_Datasets/CRU_raw_data_2021/'

In [None]:
def get_filename(feat_name):
    """Return the first entry of ``fold_n`` whose name contains ``feat_name``.

    Parameters
    ----------
    feat_name : str
        Substring to look for in the file names (e.g. ``'tmp'``).

    Returns
    -------
    str or None
        The matching file name (without the folder), or ``None`` when no
        entry matches. Entries are visited in ``os.listdir`` order.
    """
    for entry in os.listdir(fold_n):
        # Ignore the macOS Finder metadata file if it happens to be there;
        # the folder is not required to contain one.
        if entry == '.DS_Store':
            continue
        if feat_name in entry:
            return entry
    return None

In [None]:
def open_netcdf(fname):
    """Decompress a gzipped NetCDF file from ``fold_n`` and open it.

    The archive is unpacked into a temporary file, opened with
    ``netCDF4.Dataset`` and the temporary file is then unlinked.

    Parameters
    ----------
    fname : str
        File name inside ``fold_n`` (as returned by ``get_filename``).

    Returns
    -------
    netCDF4.Dataset
        The opened dataset. The caller is responsible for closing it.
    """
    tmp = tempfile.NamedTemporaryFile(delete=False)
    try:
        # Both handles are closed when the with-block exits, before the
        # dataset is opened from the temporary path.
        with tmp, gzip.open(os.path.join(fold_n, fname), 'rb') as src:
            shutil.copyfileobj(src, tmp)
        data = netCDF4.Dataset(tmp.name)
    finally:
        # Always remove the temporary file, even if decompression or opening
        # failed. NOTE(review): unlinking a file that is still open relies on
        # POSIX semantics, as the original code did.
        os.unlink(tmp.name)
    return data

# Bioclim - variables - calculation - functions

## BIO 1 / Annual Mean Temperature

In [1]:
def bio_1(year):
    """BIO 1: mean of the twelve monthly ``tmp`` grids of ``year``.

    Parameters
    ----------
    year : int
        Calendar year; assumes the first time step in the file is
        January 1901 - TODO confirm.

    Returns
    -------
    numpy.ndarray
        2-D grid flipped along its first axis and rounded to two decimals;
        cells whose mean exceeds 99999 are set to NaN.
    """
    var_name = 'tmp'
    dataset = open_netcdf(get_filename(var_name))

    first_month = (year - 1901) * 12
    monthly = dataset.variables[var_name][first_month:first_month + 12, :, :].data

    annual_mean = monthly.mean(axis=0)
    # Very large values are treated as missing data
    annual_mean[annual_mean > 99999] = np.nan
    return np.around(np.flip(annual_mean, 0), decimals=2)

## BIO 2 / Mean Diurnal Range (Mean of monthly (max temp - min temp))

In [None]:
def bio_2(year):
    """BIO 2: mean over the year of the monthly ``tmx - tmn`` difference.

    Parameters
    ----------
    year : int
        Calendar year; assumes the first time step in the files is
        January 1901 - TODO confirm.

    Returns
    -------
    numpy.ndarray
        2-D grid flipped along its first axis and rounded to two decimals.
        Cells where any monthly ``tmn``/``tmx`` value exceeds 255 become NaN.
    """
    tmn_ds = open_netcdf(get_filename('tmn'))
    tmx_ds = open_netcdf(get_filename('tmx'))

    first_month = (year - 1901) * 12
    window = slice(first_month, first_month + 12)

    monthly_min = tmn_ds.variables['tmn'][window, :, :].data
    monthly_max = tmx_ds.variables['tmx'][window, :, :].data
    # Values above 255 are treated as missing before taking the difference
    monthly_min[monthly_min > 255] = np.nan
    monthly_max[monthly_max > 255] = np.nan

    mean_range = (monthly_max - monthly_min).mean(axis=0)
    mean_range[mean_range > 99999] = np.nan
    return np.around(np.flip(mean_range, 0), decimals=2)

## BIO 3 / Isothermality (BIO2/BIO7) (×100)

In [None]:
def bio_3(year):
    """BIO 3: element-wise ratio ``bio_2(year) / bio_7(year)``.

    NOTE(review): isothermality is usually defined with a factor of 100;
    no such scaling is applied here - confirm what downstream code expects.

    Returns
    -------
    numpy.ndarray
        Ratio grid rounded to two decimals; values above 99999 (including
        +inf from a division by zero) are set to NaN.
    """
    ratio = np.divide(bio_2(year), bio_7(year))
    ratio[ratio > 99999] = np.nan
    return np.around(ratio, decimals=2)

## BIO 4 / Temperature Seasonality (standard deviation ×100)

In [None]:
def bio_4(year):
    """BIO 4: standard deviation of the twelve monthly ``tmp`` grids.

    NOTE(review): temperature seasonality is usually defined with a factor
    of 100; no such scaling is applied here - confirm what downstream code
    expects.

    Parameters
    ----------
    year : int
        Calendar year; assumes the first time step in the file is
        January 1901 - TODO confirm.

    Returns
    -------
    numpy.ndarray
        2-D grid flipped along its first axis and rounded to two decimals.
    """
    dataset = open_netcdf(get_filename('tmp'))

    first_month = (year - 1901) * 12
    monthly = dataset.variables['tmp'][first_month:first_month + 12, :, :].data
    # Very large values are treated as missing data
    monthly[monthly > 99999] = np.nan

    spread = np.std(monthly, axis=0)
    return np.around(np.flip(spread, 0), decimals=2)

## BIO 5 / Max Temperature of Warmest Month

In [None]:
def bio_5(year):
    """BIO 5: per-cell maximum of the twelve monthly ``tmx`` grids.

    Parameters
    ----------
    year : int
        Calendar year; assumes the first time step in the file is
        January 1901 - TODO confirm.

    Returns
    -------
    numpy.ndarray
        2-D grid flipped along its first axis and rounded to two decimals.
    """
    dataset = open_netcdf(get_filename('tmx'))

    first_month = (year - 1901) * 12
    monthly_max = dataset.variables['tmx'][first_month:first_month + 12, :, :].data
    # Very large values are treated as missing data
    monthly_max[monthly_max > 99999] = np.nan

    warmest = np.max(monthly_max, axis=0)
    return np.around(np.flip(warmest, 0), decimals=2)

## BIO 6 / Min Temperature of Coldest Month

In [None]:
def bio_6(year):
    """BIO 6: per-cell minimum of the twelve monthly ``tmn`` grids.

    Parameters
    ----------
    year : int
        Calendar year; assumes the first time step in the file is
        January 1901 - TODO confirm.

    Returns
    -------
    numpy.ndarray
        2-D grid flipped along its first axis and rounded to two decimals.
    """
    dataset = open_netcdf(get_filename('tmn'))

    first_month = (year - 1901) * 12
    monthly_min = dataset.variables['tmn'][first_month:first_month + 12, :, :].data
    # Very large values are treated as missing data
    monthly_min[monthly_min > 99999] = np.nan

    coldest = np.min(monthly_min, axis=0)
    return np.around(np.flip(coldest, 0), decimals=2)

## BIO 7 / Temperature Annual Range (BIO5-BIO6)

In [None]:
def bio_7(year):
    """BIO 7: temperature annual range, ``bio_5(year) - bio_6(year)``.

    Parameters
    ----------
    year : int
        Calendar year, forwarded unchanged to ``bio_5`` and ``bio_6``.

    Returns
    -------
    numpy.ndarray
        Difference grid rounded to two decimals; values above 99999 are
        set to NaN.
    """
    # Use the requested year (previously 1980 was hard-coded here, so every
    # call returned the 1980 range regardless of the argument).
    rang_temp = bio_5(year) - bio_6(year)
    rang_temp[rang_temp > 99999] = np.nan
    res = np.around(rang_temp, decimals=2)
    return res

## BIO 8 / Mean Temperature of Wettest Quarter

In [None]:
def bio_8(year):
    """BIO 8: mean ``tmp`` of the quarter with the largest ``wet`` total.

    The year is split into four fixed, non-overlapping three-month blocks
    (months 0-2, 3-5, 6-8, 9-11). Per cell, the block with the highest sum
    of ``wet`` is selected and the mean ``tmp`` of that block is returned.

    NOTE(review): the "wettest" quarter is picked from the ``wet`` variable,
    not from ``pre`` - confirm this is intended.

    Parameters
    ----------
    year : int
        Calendar year; assumes the first time step in the files is
        January 1901 - TODO confirm.

    Returns
    -------
    numpy.ndarray
        2-D grid flipped along its first axis and rounded to two decimals;
        values above 99999 are set to NaN.
    """
    wet_ds = open_netcdf(get_filename('wet'))
    tmp_ds = open_netcdf(get_filename('tmp'))

    first_month = (year - 1901) * 12
    quarter_offsets = range(0, 11, 3)  # 0, 3, 6, 9

    wet_year = wet_ds.variables['wet'][first_month:first_month + 12, :, :].data
    wet_per_quarter = np.array(
        [np.sum(wet_year[q:q + 3], axis=0) for q in quarter_offsets]
    )
    chosen_quarter = np.argmax(wet_per_quarter, axis=0)

    tmp_per_quarter = np.array([
        np.mean(
            tmp_ds.variables['tmp'][first_month + q:first_month + q + 3, :, :].data,
            axis=0,
        )
        for q in quarter_offsets
    ])

    # Per cell, take the value belonging to the selected quarter
    picked = np.choose(chosen_quarter, tmp_per_quarter)
    picked[picked > 99999] = np.nan
    return np.around(np.flip(picked, 0), decimals=2)

## BIO 9 / Mean Temperature of Driest Quarter

In [None]:
def bio_9(year):
    """BIO 9: mean ``tmp`` of the quarter with the smallest ``wet`` total.

    The year is split into four fixed, non-overlapping three-month blocks
    (months 0-2, 3-5, 6-8, 9-11). Per cell, the block with the lowest sum
    of ``wet`` is selected and the mean ``tmp`` of that block is returned.

    NOTE(review): the "driest" quarter is picked from the ``wet`` variable,
    not from ``pre`` - confirm this is intended.

    Parameters
    ----------
    year : int
        Calendar year; assumes the first time step in the files is
        January 1901 - TODO confirm.

    Returns
    -------
    numpy.ndarray
        2-D grid flipped along its first axis and rounded to two decimals;
        values above 99999 are set to NaN.
    """
    wet_ds = open_netcdf(get_filename('wet'))
    tmp_ds = open_netcdf(get_filename('tmp'))

    first_month = (year - 1901) * 12
    quarter_offsets = range(0, 11, 3)  # 0, 3, 6, 9

    wet_year = wet_ds.variables['wet'][first_month:first_month + 12, :, :].data
    wet_per_quarter = np.array(
        [np.sum(wet_year[q:q + 3], axis=0) for q in quarter_offsets]
    )
    chosen_quarter = np.argmin(wet_per_quarter, axis=0)

    tmp_per_quarter = np.array([
        np.mean(
            tmp_ds.variables['tmp'][first_month + q:first_month + q + 3, :, :].data,
            axis=0,
        )
        for q in quarter_offsets
    ])

    # Per cell, take the value belonging to the selected quarter
    picked = np.choose(chosen_quarter, tmp_per_quarter)
    picked[picked > 99999] = np.nan
    return np.around(np.flip(picked, 0), decimals=2)

## BIO 10 / Mean Temperature of Warmest Quarter

In [None]:
def bio_10(year):
    """BIO 10: mean ``tmp`` of the quarter with the largest ``tmx`` total.

    The year is split into four fixed, non-overlapping three-month blocks
    (months 0-2, 3-5, 6-8, 9-11). Per cell, the block with the highest sum
    of ``tmx`` is selected and the mean ``tmp`` of that block is returned.

    Parameters
    ----------
    year : int
        Calendar year; assumes the first time step in the files is
        January 1901 - TODO confirm.

    Returns
    -------
    numpy.ndarray
        2-D grid flipped along its first axis and rounded to two decimals;
        values above 99999 are set to NaN.
    """
    tmx_ds = open_netcdf(get_filename('tmx'))
    tmp_ds = open_netcdf(get_filename('tmp'))

    first_month = (year - 1901) * 12
    quarter_offsets = range(0, 11, 3)  # 0, 3, 6, 9

    tmx_year = tmx_ds.variables['tmx'][first_month:first_month + 12, :, :].data
    tmx_per_quarter = np.array(
        [np.sum(tmx_year[q:q + 3], axis=0) for q in quarter_offsets]
    )
    chosen_quarter = np.argmax(tmx_per_quarter, axis=0)

    tmp_per_quarter = np.array([
        np.mean(
            tmp_ds.variables['tmp'][first_month + q:first_month + q + 3, :, :].data,
            axis=0,
        )
        for q in quarter_offsets
    ])

    # Per cell, take the value belonging to the selected quarter
    picked = np.choose(chosen_quarter, tmp_per_quarter)
    picked[picked > 99999] = np.nan
    return np.around(np.flip(picked, 0), decimals=2)

## BIO 11 / Mean Temperature of Coldest Quarter

In [None]:
def bio_11(year):
    """BIO 11: mean ``tmp`` of the quarter with the smallest ``tmn`` total.

    The year is split into four fixed, non-overlapping three-month blocks
    (months 0-2, 3-5, 6-8, 9-11). Per cell, the block with the lowest sum
    of ``tmn`` is selected and the mean ``tmp`` of that block is returned.

    Parameters
    ----------
    year : int
        Calendar year; assumes the first time step in the files is
        January 1901 - TODO confirm.

    Returns
    -------
    numpy.ndarray
        2-D grid flipped along its first axis and rounded to two decimals;
        values above 99999 are set to NaN.
    """
    tmn_ds = open_netcdf(get_filename('tmn'))
    tmp_ds = open_netcdf(get_filename('tmp'))

    first_month = (year - 1901) * 12
    quarter_offsets = range(0, 11, 3)  # 0, 3, 6, 9

    tmn_year = tmn_ds.variables['tmn'][first_month:first_month + 12, :, :].data
    tmn_per_quarter = np.array(
        [np.sum(tmn_year[q:q + 3], axis=0) for q in quarter_offsets]
    )
    chosen_quarter = np.argmin(tmn_per_quarter, axis=0)

    tmp_per_quarter = np.array([
        np.mean(
            tmp_ds.variables['tmp'][first_month + q:first_month + q + 3, :, :].data,
            axis=0,
        )
        for q in quarter_offsets
    ])

    # Per cell, take the value belonging to the selected quarter
    picked = np.choose(chosen_quarter, tmp_per_quarter)
    picked[picked > 99999] = np.nan
    return np.around(np.flip(picked, 0), decimals=2)

## BIO 12 / Annual Precipitation

In [None]:
def bio_12(year):
    """BIO 12: sum of the twelve monthly ``pre`` grids of ``year``.

    Parameters
    ----------
    year : int
        Calendar year; assumes the first time step in the file is
        January 1901 - TODO confirm.

    Returns
    -------
    numpy.ndarray
        2-D grid flipped along its first axis and rounded to two decimals.
    """
    pre_ds = open_netcdf(get_filename('pre'))

    first_month = (year - 1901) * 12
    monthly_pre = pre_ds.variables['pre'][first_month:first_month + 12, :, :].data
    # Very large values are treated as missing data
    monthly_pre[monthly_pre > 99999] = np.nan

    annual_total = np.sum(monthly_pre, axis=0)
    return np.around(np.flip(annual_total, 0), decimals=2)

## BIO 13 / Precipitation of Wettest Month

In [None]:
def bio_13(year):
    """BIO 13: ``pre`` of the month with the largest ``wet`` value.

    Per cell, the month with the highest ``wet`` value is located and the
    ``pre`` value of that same month is returned.

    NOTE(review): the month is selected via ``wet`` rather than via ``pre``
    itself, so the result is not necessarily the largest monthly ``pre`` -
    confirm this is intended.

    Parameters
    ----------
    year : int
        Calendar year; assumes the first time step in the files is
        January 1901 - TODO confirm.

    Returns
    -------
    numpy.ndarray
        2-D grid flipped along its first axis and rounded to two decimals;
        values above 99999 are set to NaN.
    """
    wet_ds = open_netcdf(get_filename('wet'))
    pre_ds = open_netcdf(get_filename('pre'))

    first_month = (year - 1901) * 12
    window = slice(first_month, first_month + 12)

    wet_months = wet_ds.variables['wet'][window, :, :].data
    chosen_month = np.argmax(wet_months, axis=0)

    pre_months = pre_ds.variables['pre'][window, :, :].data
    # Per cell, take the value belonging to the selected month
    picked = np.choose(chosen_month, pre_months)
    picked[picked > 99999] = np.nan
    return np.around(np.flip(picked, 0), decimals=2)

## BIO 14 / Precipitation of Driest Month

In [None]:
def bio_14(year):
    """BIO 14: ``pre`` of the month with the smallest ``wet`` value.

    Per cell, the month with the lowest ``wet`` value is located and the
    ``pre`` value of that same month is returned.

    NOTE(review): the month is selected via ``wet`` rather than via ``pre``
    itself, so the result is not necessarily the smallest monthly ``pre`` -
    confirm this is intended.

    Parameters
    ----------
    year : int
        Calendar year; assumes the first time step in the files is
        January 1901 - TODO confirm.

    Returns
    -------
    numpy.ndarray
        2-D grid flipped along its first axis and rounded to two decimals;
        values above 99999 are set to NaN.
    """
    wet_ds = open_netcdf(get_filename('wet'))
    pre_ds = open_netcdf(get_filename('pre'))

    first_month = (year - 1901) * 12
    window = slice(first_month, first_month + 12)

    wet_months = wet_ds.variables['wet'][window, :, :].data
    chosen_month = np.argmin(wet_months, axis=0)

    pre_months = pre_ds.variables['pre'][window, :, :].data
    # Per cell, take the value belonging to the selected month
    picked = np.choose(chosen_month, pre_months)
    picked[picked > 99999] = np.nan
    return np.around(np.flip(picked, 0), decimals=2)

## BIO 15 / Precipitation Seasonality (Coefficient of Variation)

In [None]:
def bio_15(year):
    """BIO 15: coefficient of variation of the twelve monthly ``pre`` grids.

    Computed per cell as ``std(ddof=1) / mean * 100``.

    Parameters
    ----------
    year : int
        Calendar year; assumes the first time step in the file is
        January 1901 - TODO confirm.

    Returns
    -------
    numpy.ndarray
        2-D grid flipped along its first axis and rounded to two decimals.
    """
    var_name = 'pre'
    dataset = open_netcdf(get_filename(var_name))

    first_month = (year - 1901) * 12
    monthly = dataset.variables[var_name][first_month:first_month + 12, :, :].data
    # Very large values are treated as missing data
    monthly[monthly > 99999] = np.nan

    sample_std = np.std(monthly, ddof=1, axis=0)
    variation = sample_std / np.mean(monthly, axis=0) * 100
    return np.around(np.flip(variation, 0), decimals=2)

## BIO 16 / Precipitation of Wettest Quarter

In [None]:
def bio_16(year):
    """BIO 16: total ``pre`` of the quarter with the largest ``wet`` total.

    The year is split into four fixed, non-overlapping three-month blocks
    (months 0-2, 3-5, 6-8, 9-11). Per cell, the block with the highest sum
    of ``wet`` is selected and the summed ``pre`` of that block is returned.

    NOTE(review): the "wettest" quarter is picked from the ``wet`` variable,
    not from ``pre`` - confirm this is intended.

    Parameters
    ----------
    year : int
        Calendar year; assumes the first time step in the files is
        January 1901 - TODO confirm.

    Returns
    -------
    numpy.ndarray
        2-D grid flipped along its first axis and rounded to two decimals;
        values above 99999 are set to NaN.
    """
    wet_ds = open_netcdf(get_filename('wet'))
    pre_ds = open_netcdf(get_filename('pre'))

    first_month = (year - 1901) * 12
    quarter_offsets = range(0, 11, 3)  # 0, 3, 6, 9

    wet_year = wet_ds.variables['wet'][first_month:first_month + 12, :, :].data
    wet_per_quarter = np.array(
        [np.sum(wet_year[q:q + 3], axis=0) for q in quarter_offsets]
    )
    chosen_quarter = np.argmax(wet_per_quarter, axis=0)

    pre_per_quarter = np.array([
        np.sum(
            pre_ds.variables['pre'][first_month + q:first_month + q + 3, :, :].data,
            axis=0,
        )
        for q in quarter_offsets
    ])

    # Per cell, take the value belonging to the selected quarter
    picked = np.choose(chosen_quarter, pre_per_quarter)
    picked[picked > 99999] = np.nan
    return np.around(np.flip(picked, 0), decimals=2)

## BIO 17 / Precipitation of Driest Quarter

In [None]:
def bio_17(year):
    """BIO 17: total ``pre`` of the quarter with the smallest ``wet`` total.

    The year is split into four fixed, non-overlapping three-month blocks
    (months 0-2, 3-5, 6-8, 9-11). Per cell, the block with the lowest sum
    of ``wet`` is selected and the summed ``pre`` of that block is returned.

    NOTE(review): the "driest" quarter is picked from the ``wet`` variable,
    not from ``pre`` - confirm this is intended.

    Parameters
    ----------
    year : int
        Calendar year; assumes the first time step in the files is
        January 1901 - TODO confirm.

    Returns
    -------
    numpy.ndarray
        2-D grid flipped along its first axis and rounded to two decimals;
        values above 99999 are set to NaN.
    """
    wet_ds = open_netcdf(get_filename('wet'))
    pre_ds = open_netcdf(get_filename('pre'))

    first_month = (year - 1901) * 12
    quarter_offsets = range(0, 11, 3)  # 0, 3, 6, 9

    wet_year = wet_ds.variables['wet'][first_month:first_month + 12, :, :].data
    wet_per_quarter = np.array(
        [np.sum(wet_year[q:q + 3], axis=0) for q in quarter_offsets]
    )
    chosen_quarter = np.argmin(wet_per_quarter, axis=0)

    pre_per_quarter = np.array([
        np.sum(
            pre_ds.variables['pre'][first_month + q:first_month + q + 3, :, :].data,
            axis=0,
        )
        for q in quarter_offsets
    ])

    # Per cell, take the value belonging to the selected quarter
    picked = np.choose(chosen_quarter, pre_per_quarter)
    picked[picked > 99999] = np.nan
    return np.around(np.flip(picked, 0), decimals=2)

## BIO 18 / Precipitation of Warmest Quarter

In [None]:
def bio_18(year):
    """BIO 18: total ``pre`` of the quarter with the largest ``tmp`` total.

    The year is split into four fixed, non-overlapping three-month blocks
    (months 0-2, 3-5, 6-8, 9-11). Per cell, the block with the highest sum
    of ``tmp`` is selected and the summed ``pre`` of that block is returned.

    Parameters
    ----------
    year : int
        Calendar year; assumes the first time step in the files is
        January 1901 - TODO confirm.

    Returns
    -------
    numpy.ndarray
        2-D grid flipped along its first axis and rounded to two decimals;
        values above 99999 are set to NaN.
    """
    tmp_ds = open_netcdf(get_filename('tmp'))
    pre_ds = open_netcdf(get_filename('pre'))

    first_month = (year - 1901) * 12
    quarter_offsets = range(0, 11, 3)  # 0, 3, 6, 9

    tmp_year = tmp_ds.variables['tmp'][first_month:first_month + 12, :, :].data
    tmp_per_quarter = np.array(
        [np.sum(tmp_year[q:q + 3], axis=0) for q in quarter_offsets]
    )
    chosen_quarter = np.argmax(tmp_per_quarter, axis=0)

    pre_per_quarter = np.array([
        np.sum(
            pre_ds.variables['pre'][first_month + q:first_month + q + 3, :, :].data,
            axis=0,
        )
        for q in quarter_offsets
    ])

    # Per cell, take the value belonging to the selected quarter
    picked = np.choose(chosen_quarter, pre_per_quarter)
    picked[picked > 99999] = np.nan
    return np.around(np.flip(picked, 0), decimals=2)

## BIO 19 / Precipitation of Coldest Quarter

In [None]:
def bio_19(year):
    """BIO 19: total ``pre`` of the quarter with the smallest ``tmn`` total.

    The year is split into four fixed, non-overlapping three-month blocks
    (months 0-2, 3-5, 6-8, 9-11). Per cell, the block with the lowest sum
    of ``tmn`` is selected and the summed ``pre`` of that block is returned.

    Parameters
    ----------
    year : int
        Calendar year; assumes the first time step in the files is
        January 1901 - TODO confirm.

    Returns
    -------
    numpy.ndarray
        2-D grid flipped along its first axis and rounded to two decimals;
        values above 99999 are set to NaN.
    """
    tmn_ds = open_netcdf(get_filename('tmn'))
    pre_ds = open_netcdf(get_filename('pre'))

    first_month = (year - 1901) * 12
    quarter_offsets = range(0, 11, 3)  # 0, 3, 6, 9

    tmn_year = tmn_ds.variables['tmn'][first_month:first_month + 12, :, :].data
    tmn_per_quarter = np.array(
        [np.sum(tmn_year[q:q + 3], axis=0) for q in quarter_offsets]
    )
    chosen_quarter = np.argmin(tmn_per_quarter, axis=0)

    pre_per_quarter = np.array([
        np.sum(
            pre_ds.variables['pre'][first_month + q:first_month + q + 3, :, :].data,
            axis=0,
        )
        for q in quarter_offsets
    ])

    # Per cell, take the value belonging to the selected quarter
    picked = np.choose(chosen_quarter, pre_per_quarter)
    picked[picked > 99999] = np.nan
    return np.around(np.flip(picked, 0), decimals=2)

# Image scraping

In [None]:
def get_urls_to_scrap():
    """Join the plant records with their multimedia URLs.

    Reads the tab-separated multimedia table (keeping only its
    ``identifier`` column) and the plant info CSV, indexes both by
    ``gbifID`` and inner-joins them on that index.

    Returns
    -------
    pandas.DataFrame
        Plant info columns plus ``identifier``, indexed by ``gbifID``.
        ``img_scraping`` reads a module-level ``media_df``, so assign the
        result to that name: ``media_df = get_urls_to_scrap()``.
    """
    ## get plant - multimedia data
    df_m = pd.read_csv('/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/multimedia.txt', sep='\t', index_col='gbifID')
    df_m = df_m[['identifier']]
    ## get plant - info
    df_i = pd.read_csv("/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/data_inkl_bioclim_grs.csv")
    # Index the frame that was just loaded (the original referenced an
    # undefined name ``df`` here).
    df_i = df_i.set_index('gbifID')
    media_df = df_i.merge(df_m, left_index=True, right_index=True)
    # Hand the merged frame back; previously it was built and then discarded.
    return media_df

In [None]:
def img_scraping():
    """Download up to 10000 not-yet-processed images into the images folder.

    Reads the module-level ``media_df`` (indexed by ``gbifID`` with an
    ``identifier`` URL column), drops duplicate index entries, skips every id
    that already has a file in the thumbnails folder, and downloads the
    remaining images in random order as ``<gbifID>.jpg``. A failed download
    is reported and skipped; after each successful one the function sleeps
    for 0.5-1 s.

    Returns
    -------
    None
    """
    ## foldername for the thumbnails
    fold_n_thumb = '/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/thumbnails/'
    fold_n_im = '/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/images/'

    ## read out the already existing files in the folder...
    done = {os.path.splitext(filename)[0] for filename in os.listdir(fold_n_thumb)}

    media_df_nodupl = media_df[~media_df.index.duplicated(keep='first')]
    # File stems are strings while the index holds numeric ids, so compare on
    # the string form; a plain set difference would never exclude anything.
    diff_li = [idx for idx in media_df_nodupl.index if str(idx) not in done]

    ## shuffle the list - this might be necessary since requests at the same api in a row might deny access
    random.shuffle(diff_li)

    ## request and store images in "images" folder
    # Slicing caps the batch at 10000 without running past a shorter list.
    for gbif_id in diff_li[:10000]:
        try:
            url = media_df_nodupl.loc[int(gbif_id), 'identifier']
            # The timeout keeps one stalled server from blocking the whole run.
            with requests.get(url, stream=True, timeout=30) as r:
                # Do not store HTTP error pages as .jpg files
                r.raise_for_status()
                r.raw.decode_content = True
                with open(f"{fold_n_im}{int(gbif_id)}.jpg", 'wb') as f:
                    shutil.copyfileobj(r.raw, f)
        except Exception as err:
            # Best effort: report and move on. Unlike a bare ``except`` this
            # still lets KeyboardInterrupt stop the loop.
            print(f"skipping {gbif_id}: {err}")
            continue

        time.sleep(random.uniform(0.5, 1))

In [None]:
def img_croping_and_scaling():
    """Create 300x300 thumbnails for every file in the images folder.

    Each image is center-cropped to its largest square with
    ``crop_max_square``, resized to 300x300 and saved under the same file
    name in the thumbnails folder. Files that cannot be processed are
    reported and skipped.

    Returns
    -------
    None
    """
    ## get images from the images folder
    fold_n_im = '/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/images/'
    fold_n_thumb = '/Users/davidschildberger/code/dadavie/planetary_garden/raw_data/thumbnails/'
    # Skip the macOS metadata file if present instead of requiring it to exist
    fi_n = [name for name in os.listdir(fold_n_im) if name != '.DS_Store']

    ## crop and scale images to the thumbnails folder
    for name in fi_n:
        try:
            # The context manager closes the source file once the thumbnail is written
            with Image.open(f"{fold_n_im}{name}") as image:
                thumb = crop_max_square(image).resize((300, 300))
                thumb.save(f"{fold_n_thumb}{name}")
        except Exception as err:
            # Best effort: report and move on. Unlike a bare ``except`` this
            # still lets KeyboardInterrupt stop the loop.
            print(f"skipping {name}: {err}")
            continue

In [None]:
def crop_max_square(pil_img):
    """Return the largest centered square crop of ``pil_img``.

    The side length is the smaller of the two values in ``pil_img.size``.
    """
    side = min(pil_img.size)
    return crop_center(pil_img, side, side)

In [None]:
def crop_center(pil_img, crop_width, crop_height):
    """Crop a ``crop_width`` x ``crop_height`` box centered in ``pil_img``.

    Parameters
    ----------
    pil_img : object with ``size`` (width, height) and ``crop(box)``
        The image to crop.
    crop_width, crop_height : int
        Size of the crop box.

    Returns
    -------
    Whatever ``pil_img.crop`` returns for the box
    ``(left, upper, right, lower)``; the edges use floor division.
    """
    img_width, img_height = pil_img.size
    left = (img_width - crop_width) // 2
    upper = (img_height - crop_height) // 2
    right = (img_width + crop_width) // 2
    lower = (img_height + crop_height) // 2
    return pil_img.crop((left, upper, right, lower))