# Adding Variance to $\lambda$ Values

This notebook explores the impact of letting $\lambda$ values be sampled from a normal distribution instead of just using a constant value.



In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from farm.climate import make_climate_parameters, check_exponential, Climate

In [3]:
#%% Climate Class Definition
from math import exp
import numpy as np
import pandas as pd
from numpy.random import exponential, uniform
from dateutil.relativedelta import *
import scipy.stats as st

default_climate = {
    'alpha_r': [10.0] * 12,
    'lambda_r': [0.25] * 12,
    'ET_max': 6.5
}

from datetime import timedelta, datetime

datetimes = np.arange(
    datetime(2018,1,1), datetime(2019,1,1), timedelta(days=1)
    ).astype(datetime)

month_value_by_day = np.array([datetime.month for datetime in datetimes])
week_value_by_day = np.array([datetime.isocalendar()[1] for datetime in datetimes])
dekad_value_by_day = np.array([(datetime_.timetuple().tm_yday - 1)//10+1 for datetime_ in datetimes])
# TODO Add semi_month_value_by_day if using 

# def make_climate_parameters(
station='OL JOGI FARM'
data_file="../data/CETRAD/CETRAD_rainfall.csv"
year_min=30
interval='dekad'
do_std=True

""" Defines function that takes a rainfall station time series and returns alpha and lambda values by 
a certain interval between week (7-days), dekad (10-days), semi-monthly (twice per month) or monthly.

    Usage:

        make_climate_parameters(
            station='OL JOGI FARM', 
            data_file="data/CETRAD/CETRAD_rainfall.csv",
            year_min= 30,
            interval='dekad' 
        )

    Default values:
        station = 'OL JOGI FARM' [string] # Rainfall Climatology for Laikipia 
        data_file = "data/CETRAD/CETRAD_rainfall.csv" # Path to file
        year_min = 30 # Minimum number of years required in timeseries
        interval = 'dekad' # Interval to calculate alphas andlambdas

            returns alpha_values, lambda_values
"""
# Prepare the CETRAD dataset.
year_min = year_min # minimum number of years to consider for a valid climate record.

df = pd.read_csv(data_file)  # Read in the raw csv data.

# Step 1. Convert text strings into datetime objects.
format = '%m/%d/%y' # Column RDate has data in M/D/YY
df['Datetime']=pd.to_datetime(df['RDate'], format=format) # Create a new column of datetime objects using RDate.

# 2. Step 2. Convert future dates inferred during the conversion back into 20th century dates.
# Python is a future-looking programming language, and assumes that 1/1/34 is Jan 1, 2034.
# We can fix this by finding all the dates in the future (dt > datetime.now()) and removing 100 years from
# their value. This requires using the relativedelta function, which handles weird stuff like leap years.
df['Datetime'] = df['Datetime'].map(lambda dt: dt+relativedelta(years=-100) if dt > datetime.now() else dt)

# Step 3. Extract the Year and Month from the Datetime to make aggregation easier.
df['Year'] = [dt.year for dt in df['Datetime']]
df['Month'] = [dt.month for dt in df['Datetime']]
df['Week'] = [dt.week for dt in df['Datetime']]
df['Semi_Month'] = (df['Datetime'].dt.day
                      .gt((df['Datetime']+pd.tseries.offsets.MonthEnd()).dt.day//2) 
                      + df['Month']*2 -1)
df['Dekad'] = df['Datetime'].dt.dayofyear//10+1

n_years = len(df['Year'].unique())

# Check to make sure we have enough data for fitting and parameter estimation.
if n_years < year_min:
    print("WARNING! Station record for {station} has only {n_years} years.".format(
        station=station,
        n_years=n_years))

# Step 4. Use the Datetime values as the index for this dataframe.
df = df.set_index(pd.DatetimeIndex(df['Datetime']))  # Set the Datetime column as the dataframe index

# Step 5.  Delete the old RDate column, which we no longer need. 
# We will keep the Datetime column, in case we need it later.
df = df.drop(['RDate'], axis=1)

columns = [station] + ['Year', 'Month', 'Week', 'Dekad', 'Semi_Month','Datetime']
rainfall = df[columns]

# First, find all the rows in the data where it rained and group by month.
rain_days = rainfall.loc[rainfall[station] > 0]

# Find all locations in the data where an observation was made.
all_days = rainfall.loc[rainfall[station] >= 0] 

# Find just the rainfall amounts on days that it rained.
data = rainfall.loc[rainfall[station] > 0][station]

# Fit the daily rainfall amounts to an exponential distribution.
check_exponential(data)

if interval == 'month':
    # Determine the Monthly values of alpha and lambda from the station data:
    s = pd.DataFrame(
        rain_days.groupby(['Month', 'Year'])[station].count().unstack(fill_value=0).stack() /
        all_days.groupby(['Month', 'Year'])[station].count()
    )[0]
    df = pd.DataFrame(s).reset_index()
    df.columns = ['Month', 'Year', 'Value']
    avg_lambda_values = df.groupby('Month')['Value'].mean()
    if do_std:
        std_lambda_values = df.groupby('Month')['Value'].std()
    else:
        std_lambda_values = avg_lambda_values * 0
    alpha_values = rain_days.groupby('Month')[station].mean()
elif interval == 'dekad':
    s = pd.DataFrame(
        rain_days.groupby(['Dekad', 'Year'])[station].count().unstack(fill_value=0).stack() / 
        all_days.groupby(['Dekad', 'Year'])[station].count()
    )[0]
    df = pd.DataFrame(s).reset_index()
    df.columns = ['Dekad', 'Year', 'Value']
    avg_lambda_values = df.groupby('Dekad')['Value'].mean()
    if do_std:
        std_lambda_values = df.groupby('Dekad')['Value'].std()
    else:
        std_lambda_values = avg_lambda_values * 0
    alpha_values = rain_days.groupby('Dekad')[station].mean()
elif interval == 'semi_month':
    s = pd.DataFrame(
        rain_days.groupby(['Semi_Month', 'Year'])[station].count().unstack(fill_value=0).stack() / 
        all_days.groupby(['Semi_Month', 'Year'])[station].count()
    )[0]
    df = pd.DataFrame(s).reset_index()
    df.columns = ['Semi_Month', 'Year', 'Value']
    avg_lambda_values = df.groupby['Dekad']['Value'].mean()
    if do_std:
        std_lambda_values = df.groupby['Dekad']['Value'].std()
    else:
        std_lambda_values = avg_lambda_values * 0
else:
    raise(NotImplementedError)
    
print(avg_lambda_values.to_list(), std_lambda_values.to_list())

[0.057239057239057235, 0.06666666666666667, 0.03636363636363636, 0.04545454545454546, 0.06060606060606061, 0.045454545454545456, 0.06363636363636364, 0.07878787878787881, 0.1696969696969697, 0.2, 0.26562499999999994, 0.32812500000000006, 0.35625, 0.384375, 0.20312499999999994, 0.140625, 0.140625, 0.14062499999999997, 0.15625, 0.17187499999999994, 0.190625, 0.15312499999999998, 0.14062499999999997, 0.13750000000000004, 0.15625, 0.11562500000000002, 0.11562500000000003, 0.10312500000000002, 0.1774193548387097, 0.264516129032258, 0.27419354838709675, 0.2806451612903226, 0.21612903225806449, 0.17741935483870963, 0.11290322580645161, 0.08064516129032258, 0.04608294930875576] [0.1499002037396287, 0.13385315336840842, 0.06990252954195317, 0.0904534033733291, 0.10289373747765805, 0.11205720941473683, 0.13420642174040567, 0.12439246298906075, 0.18789100644530674, 0.21937410968480306, 0.1993689641618468, 0.24261262765316685, 0.21543279769473506, 0.20652169299475384, 0.15551034609290626, 0.177544

In [78]:
np.random.normal(scale=0.1) + 1

1.1082427469092115