# Field Capacity Finder from the soil moisture

`[Run All] is mendatory`

## From the soil moisture data
***
### Data description
- data is being collected from the F70 field.
- soil moisture is in unit of %, **not cleaned**
- this script includes cleaning function
- this script designed to be used in daily basis.
- from the soil moisture monitoring website
- example data is 'Calc_def_test.csv'

- [soil moisture data](https://things.iot.ag.purdue.edu:8080/dashboard/dc56c5a0-ee3e-11ec-b72b-5dd76ca52a2b?publicId=a914a590-ecae-11ec-b72b-5dd76ca52a2b) input data for the script will be downloaded directly
> - soil moisture data is from Purdue AgIT server that collects the field data through LoRaWan network
> - the data should be downloaded from the 'Summary Data Table' tab
> 
- [weather data] input data for the script will be downloaded directly
> - weather data is from Purdue Mesonet server and using ACRE station data
> 
For downloading the data
  - Summary data table
  - from date1 to date2
  - in csv form
  - file name does not matter or add today's date at the end of the file name.
  

### Version history
***
#### Ver 1.1
- ver 1.1 branch made
- Shared with co-author Dr.Chandra
- Added Thingsboard API to the program
- deficit report export function to [./Deficit_results/] (2024-05-16)

In [26]:
# // this part is download the data from the website for 2 month.
import requests
import sys
import os
import json
import pandas as pd
from pprint import pprint
import datetime
import pytz
import config
from dateutil.relativedelta import relativedelta

deviceList = []

# ** set the configuration for the request                                                                                  **
config = {
 'username' : 'yang2309@purdue.edu', ### Insert your email address used by AgIT Thingsboard system
 'password': 'dsya2002',  ### Insert your AgIT thingsboard password
 'server' : 'https://things.iot.ag.purdue.edu:8080'
}

# ** defining the function to get the token for the request and setting the header for the request                          **
def getCustomerDevices(custID, textSearch=None):
    parameters = {        
        'pageSize': 1000,
        'page': 0,                
    }
    att_parms = {
        'keys': 'dev_eui'
    }
    if(textSearch):
        parameters.update({'textSearch': textSearch})
    responseList = requests.get(f"{config['server']}/api/customer/{custID}/devices", headers=TBheaders,params= parameters).json()
    #pprint(responseList)
    list = []
    for dev in responseList['data']:
        #pprint(dev)
        #print('------------------------------------------------------------------------------------------')
        #'id': {'entityType': 'DEVICE', 'id': 'd49153a0-c868-11eb-95d8-09d06ef6a9a5'},
        url = f"{config['server']}/api/plugins/telemetry/DEVICE/{dev['id']['id']}/values/attributes"
        deviceResp = requests.get(url, headers=TBheaders,params= att_parms).json()
        #print('------------------------------------------------------------------------------------------')
        list.append([dev['id']['id'],dev['name'],deviceResp[0]['value']])
    return list
        

def login(url, username, password):
    # Log into ThingsBoard
    return requests.post(f"{url}/api/auth/login", json={
        "username": username,
        "password": password
    }).json()['token']

def get_keys(device):
    return requests.get(f"{config['server']}/api/plugins/telemetry/DEVICE/{device}/keys/timeseries",
                 headers=TBheaders).json()
def get_data_chunk(url, token, device, key, start, stop, limit):
    #print([url, device, key, start, stop, limit])
    return requests.get(f"{url}/api/plugins/telemetry/DEVICE/{device}/values/timeseries",
             headers=TBheaders,
            params= {
                'keys': key,
                'startTs': start,
                'endTs': stop,
                'limit': limit,
                'agg': 'NONE'
            }).json()

def get_data(url, token, device, key, start, stop):
    global totalLength
    p = pd.DataFrame()
    
    # You have to request data backwards in time ...
    while start < stop:
        data = get_data_chunk(url, token, device[0], key, start, stop, 100000)
        #print(data)
        if key not in data:
            break;
        
        #print(f"{key}: Loaded {len(data[key])} points")
        t = pd.DataFrame.from_records(data[key])
        #t['Timestamp'] = t['ts']
        #pprint(t['ts'])
        t['ts'] = (pd.to_datetime(t['ts'],unit='ms'))        
        t.set_index('ts', inplace=True)
        
        t.rename(columns={'value': key}, inplace=True)
        p = p._append(t)

        # Update "new" stop time
        stop = data[key][-1]['ts'] - 1
    totalLength += len(p)
    #print(f"Total Length: {totalLength}")
    return p

def outputCSV(devices):
    global totalLength
    final_df = pd.DataFrame()
    for device in devices:
        #print(f"Downloading DEVICE: {device[0]} data");
        #print(device)
        p = pd.DataFrame()
        for key in keys:
            #print(f"info: Pulling {key}...");
            tempin = get_data(config['server'], token, device, key, startTS, endTS)            
            if(len(tempin)>0):                
                p = pd.concat([p,tempin], axis=1)
        p['Entity Name'] = device[1]
        p['dev_eui'] = device[2]
        p.reset_index(drop=False)
        #p_new_index = p.assign(**{'Timestamp': p.index})        
        if(len(p)):
            final_df = pd.concat([final_df,p])
        
    # Create Time Strings
    # Convert to nanoseconds for pandas.to_datetime
    start_timestamp_ns = startTS * 1000000
    end_timestamp_ns = endTS * 1000000
    
    # Convert timestamp to datetime object
    start_dt = pd.to_datetime(start_timestamp_ns, unit='ns')
    end_dt = pd.to_datetime(end_timestamp_ns, unit='ns')
    
    # Format datetime string as yyyy-mm-dd-HH-MM
    start_formatted_string = start_dt.strftime('%Y-%m-%d-%H-%M')
    end_formatted_string = end_dt.strftime('%Y-%m-%d-%H-%M')
    df_order = ["Entity Name","data_soil_moisture1","data_soil_moisture2","data_soil_moisture3","data_soil_moisture4","data_tem1","data_tem2","data_tem3","data_tem4","data_tem5","data_tem6","data_tem7","dev_eui"]
    final_df = final_df.reindex(columns=df_order)
    final_df1 = final_df.sort_values(by='ts')
    
    # Get current time
    now = datetime.datetime.now()
    
    # Format time string (hours and minutes)
    formatted_time = now.strftime("%H-%M")
    final_df1.to_csv(f"./Raw_data/data-{end_formatted_string}.csv")
    print("File Export Done.")

def getDeviceCredentialsByDeviceId(deviceID = 0):
    url = config['server']+'/device/'+deviceID+'/credentials'
    resp = requests.get(url,headers=TBheaders)
    responseList = resp.json()
    #pprint(responseList)
    return responseList['credentialsID']

def getDeviceServerAttributes(deviceID = 0):
    if deviceID == 0:
        while(deviceID == 0):
            try:
                deviceID = input("Enter device ID: ")
            except:
                print("Invalid DeviceID")
    url = config['server']+'/plugins/telemetry/DEVICE/'+deviceID+'/values/attributes'
    #pprint(url)
    #pprint(TBheaders)
    xresp = requests.get(url,headers=TBheaders)
    #pprint(xresp)
    #pprint(resp.content())
    #print(xresp.text())
    responseList = xresp.json()
    #pprint(responseList)
    #return responseList['credentialsID']


# ** getting token for the request                                                                                         **
print("Server: ",config['server'])
token = login(config['server'], config['username'], config['password']);
print(f"Token: {token}")
TBheaders={ 'Accept': '*/*', 'X-Authorization': f"Bearer {token}" }



# Create a datetime object representing the local date and time
# Year, Month, Day, Hour, Minute
today_dt = datetime.datetime.now()
start = datetime.datetime.now()+ relativedelta(months=-2)

start_dt = datetime.datetime(start.year, start.month, start.day, 18, 0)
end_dt = datetime.datetime(today_dt.year, today_dt.month, today_dt.day, 6, 00)
print (start_dt, end_dt)

# Convert to a specific time zone (e.g., UTC)
start_tz_utc = pytz.timezone("UTC")
start_dt_utc = start_tz_utc.localize(start_dt)
end_tz_utc = pytz.timezone("UTC")
end_dt_utc = end_tz_utc.localize(end_dt)

# Extract the Unix timestamp
startTS = int(start_dt_utc.timestamp())*1000
endTS = int(end_dt_utc.timestamp())*1000

# Use for relative time frames
#startTS = int((datetime.now() - timedelta(days=30)  - datetime(1970, 1, 1)).total_seconds() * 1000) # 30 days ago
#endTS = int((datetime.datetime.utcnow() - datetime.datetime(1970, 1, 1)).total_seconds() * 1000) # now

# print(startTS, endTS)



# ** customer ID for the request                                                                                            **
# getCustomerDevices(custID, textSearch=None):
# 7576b020-ecae-11ec-b72b-5dd76ca52a2b = Cherkhauer Customer ID
# ABE-DRAGINO-GROPOINT-CHERKHAUER = Devices with names beginning with "ABE-DRAGINO-GROPOINT-CHERKHAUER"
devices = getCustomerDevices("7576b020-ecae-11ec-b72b-5dd76ca52a2b","ABE-DRAGINO-GROPOINT-CHERKHAUER-ACRE")
# pprint(devices)

totalLength = 0
# keys to retrieve
#keys = ["data_TempC_SHT","data_Hum_SHT"]
#keys = ["data_ambient_temperature","data_input1_frequency","data_input1_frequency_to_moisture","data_Input2_voltage","data_Input2_voltage_to_temp","data_light_intensity","data_relative_humidity"]
keys = ["data_soil_moisture1","data_soil_moisture2","data_soil_moisture3","data_soil_moisture4","data_tem1","data_tem2","data_tem3","data_tem4","data_tem5","data_tem6","data_tem7"]

outputCSV(devices)

Server:  https://things.iot.ag.purdue.edu:8080
Token: eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiJ5YW5nMjMwOUBwdXJkdWUuZWR1IiwidXNlcklkIjoiNjRlOWZjYjAtZjc0ZS0xMWVlLWIzYmMtN2ZlNjliZjhkNDExIiwic2NvcGVzIjpbIkNVU1RPTUVSX1VTRVIiXSwic2Vzc2lvbklkIjoiOGExZDEwODctMjg5MS00MThjLTlmODQtZGY4N2NkYzMzOGQ5IiwiaXNzIjoidGhpbmdzYm9hcmQuaW8iLCJpYXQiOjE3MTU4OTA5MDgsImV4cCI6MTcxNTg5OTkwOCwiZmlyc3ROYW1lIjoiRG9uZ3Nlb2siLCJsYXN0TmFtZSI6IllhbmciLCJlbmFibGVkIjp0cnVlLCJpc1B1YmxpYyI6ZmFsc2UsInRlbmFudElkIjoiYWFjNjU1YTAtYWM2Mi0xMWVjLWFiYzgtMWYxYzA5NTgwZTY3IiwiY3VzdG9tZXJJZCI6Ijc1NzZiMDIwLWVjYWUtMTFlYy1iNzJiLTVkZDc2Y2E1MmEyYiJ9.NsitAfY_8lwcwQ7LTHS_0BcPTnvIjMHoY1fMgY_zuWzLMnHzWbn9VOFV9qoVzJhExZyUOPC82oWL0Juhc-dQmA
2024-03-16 18:00:00 2024-05-16 06:00:00
File Export Done.


# Initial data reading process
***
## readraw_data Function

The `readraw_data` function is used to read raw data from a CSV file and parse it into a pandas DataFrame.

### Parameters

- `destination`: The path where the output file will be saved.
- `filename`: The name of the CSV file to be read.
- `foutname`: The name of the output file.
- `st_date`: The start date for the data to be read.
- `ed_date`: The end date for the data to be read.

### Returns

- `raw_data`: A pandas DataFrame that contains the data read from the CSV file.

### Functionality

The function works by using the pandas `read_csv` function to read the CSV file. It specifies the delimiter as ';' and parses the 'Timestamp' column as dates. It also specifies the data types for the soil moisture columns to be float64.

The function then returns the DataFrame.
***
## station_data_clean Function

The `station_data_clean` function is used to clean the data for a specific station.

### Parameters

- `raw_data`: A pandas DataFrame that contains the raw data to be cleaned.
- `station`: An integer that represents the station ID.

### Returns

- `station_data`: A pandas DataFrame that contains the cleaned data for the specified station.

### Functionality

The function works by filtering the raw data for the specified station. It then performs any necessary cleaning operations, such as removing missing values, outliers, or incorrect data.

The function then returns the cleaned data for the specified station.

Please note that the actual code for the `station_data_clean` function is not provided, so the parameters and functionality are assumed based on typical usage. If you provide the actual code of the `station_data_clean` function, I can give a more accurate explanation.

In [49]:
# -*- coding: utf-8 -*-
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import datetime as dt
import statistics as stats
import os


today = dt.date.today()
# today = datetime(2009, 7, 6, 0, 0)
strtoday = today.strftime("%Y%m%d")
print ('Today is :: ',today)



def readraw_data(destination, filename, foutname, st_date, ed_date):
    print ('File name is ::',filename)
    # open the file
    # TODO delimeter should be changed to ',' for the csv file.  ts <=> Timestamp.  delimiter=',' <=> delimiter=';'
    raw_data = pd.read_csv(filename,delimiter=',', parse_dates=['ts'],
                          dtype={'data_soil_moisture1':np.float64,
                                 'data_soil_moisture2':np.float64,
                                 'data_soil_moisture3':np.float64,
                                 'data_soil_moisture4':np.float64},
                          na_values=['Invalid data']
                          )
    raw_columns = raw_data.columns.tolist()
    # raw_data['Timestamp'] = pd.to_datetime(raw_data['Timestamp'])
    # drop the temperature data from the list
    for i in range(len(raw_columns)):
        if 'tem' in raw_columns[i] or 'dev' in raw_columns[i]:
            raw_data = raw_data.drop(columns=raw_columns[i])
            
    raw_data.set_index(['ts'])
    print(raw_data.info())
    
    # change the name of the Entity Name column
    raw_data['Entity Name'] = raw_data['Entity Name'].str.replace('ABE-DRAGINO-GROPOINT-CHERKHAUER-ACRE-','')
    stationlist = sorted(raw_data['Entity Name'].unique())
        
    
    # after checking the null values, filter the data
    clean_df = pd.DataFrame(columns = ['Station','Layer1', 'Layer2', 'Layer3', 'Layer4'])
    for station in stationlist:
        # clean the data according to the cleaning procedure
        sample_df = station_data_clean(destination, raw_data, station)
        
        clean_df = pd.concat([clean_df,sample_df])
        
    clean_df.to_csv(destination+foutname,index=True)
    
    # convert headers only with numbers
    raw_columns = raw_data.columns.tolist()
    
    # group the dataframe and turn them into another dataframe with 'Entity Name' as columns.
    # this step should be done by layer.
    

    return raw_data, raw_columns, clean_df, stationlist, raw_columns



# ********************************************************************************************************************
# * this is the function that goes into readraw_data function.                                                       *
# ********************************************************************************************************************
def station_data_clean(destination, raw_data, station):
    global start_date, end_date
    '''
    This is cleaning for station data

    Parameters
    ----------
    raw_data : dataframe
        dataframe of the raw_data, 'raw_data' in this script
    station : string
        this is number of station. 4 digit number filled with zero from left

    Returns
    -------
    sample_df : dataframe
        this cleaned data after process

    '''
    sample_df = raw_data[raw_data['Entity Name']==station]
    sample_df = sample_df.drop(['Entity Name'], axis=1)
    sample_df = sample_df.set_index('ts')
    sample_df = sample_df.set_axis(['Layer1', 'Layer2', 'Layer3', 'Layer4'], axis=1)
    
    # clean the data by time of interest
    # datetime range should start by 18:00 // end by 6:00 for better analysis for everyday
    start_date = dt.datetime(st_date[0], st_date[1], st_date[2], 0, 0, 0)
    end_date = dt.datetime(ed_date[0], ed_date[1], ed_date[2], 6, 0, 0)
    sample_df = sample_df[(sample_df.index > start_date) & (sample_df.index < end_date)]
    
    # according to the number of the data length,
    # if it is more than 1, the dataframe will be made
    # if it is 0, below process will be skipped
    totnum = len(sample_df)
    if totnum > 0:    
        print ('\n\n')
        txt = ' raw_data info for station '+station+' '
        print(txt.center(60,'='),end='\n')
        
        # if value is null value from beginning
        print(' NaN values info '.center(60,':'))
        na_df = sample_df[sample_df.isna().any(axis=1)]
        sample_df = sample_df.dropna()
        print('NaN values are ::',len(na_df),'out of',totnum,'\nerror rate:',round(len(na_df)/totnum*100,2),'%',end='\n\n')
        
        # if value is out of range
        sample_df[(sample_df>=100.0) | (sample_df<=0.0)] = np.nan
        outrange_df = sample_df[sample_df.isna().any(axis=1)]
        sample_df = sample_df.dropna()
        print('Out of range values are ::',len(outrange_df),'out of',totnum,'\nerror rate:',round(len(outrange_df)/totnum*100,2),'%',end='\n\n')
        
        print(' Data Describe '.center(60,':'),end='\n\n')
        print(sample_df.dtypes, end='\n\n')
        print(sample_df.describe())
        
        # resampling in 30 min frequency
        sample_df = sample_df.resample('30min').mean()
        
        # make boxplot per layer
        boxplot = sample_df.boxplot(column=['Layer1', 'Layer2', 'Layer3'],figsize=(8,4), ylabel='Soil Moisture (%)')
        plt.title(int(station))
        plt.savefig(destination+'/Soil_moisture_graphs/'+strtoday+'_'+station+'.png',dpi=600)
        # plt.show()
        
        # add station code back
        sample_df['Station'] = int(station)
    
        print('='*60)
        
    else:
        txt = station+' '
        print(txt.center(60,'='),end='\n\n')
    '''
    # save station dataframe as csv file == turned off for cal_deficit
    try:
        sample_df.to_csv(destination+'station_data/'+station+'_cleaned.csv',sep=',')
        print('Transmitter data exported.')
    except:
        print('Transmitter data export failed.')
    '''
    
    return sample_df

Today is ::  2024-05-16


## Rainfinder Function

The `rainfinder` function is used to identify significant rain events in a given dataset. The function is designed to analyze weather data and detect periods of rainfall based on certain conditions or thresholds.

### Parameters

- `data`: A pandas DataFrame that contains the weather data to be analyzed. The DataFrame should be indexed by 'Timestamp and Station'.
- `station`: An integer that represents the station ID.
- `header`: A list that contains the headers of the data.
- `threshold_moist`: An integer that represents the threshold of soil moisture difference to decide if it was a significant rain event or not.
- `raintimestep`: An integer that represents the number of time steps to consider for the rolling window.

### Returns

- `bumplist2`: A list of lists. Each inner list represents a date (in the format [year, month, day]) when a significant rain event (or "bump") was detected.

### Functionality

The function works by first filtering the data for the specified station and removing any missing values. It then calculates the difference in soil moisture between each time step and filters out the time steps where the difference is greater than the specified threshold.

The function then uses a rolling window to find the minimum and maximum soil moisture values within each window. It calculates the gap between the min and max values and filters out the time steps where the gap is greater than the threshold.

The function then organizes the dates of these significant rain events, removes duplicates, and sorts the list. The sorted list of dates is returned.

The function uses pandas for data manipulation and filtering.

In [28]:
def rainfinder(data, station, header, threshold_moist, raintimestep):
    """
    

    Parameters
    ----------
    data : dataframe
        organized dataframe / index is 'Timestamp and Station'
    stationlist : list
        list of the stations
    header : list
        header of the data
    threshold_moist : int
        threshold of soil moisture difference to decide it was huge rain event or not
        
    Returns
    -------
    data_dur_all : list of dataframe
        this is dataframes with time range of interest
        dataframe order is accordance with order of station list (stationlist)

    """
    # TODO: remove stationlist for the final version
    text = station +' :: '+target+'  Rainfinder'
    print (text.center(60,':'))

    data_dur = data[data['Station']==int(station)]
    data_dur = data_dur.dropna()
    
        
    # make a rolling window column from interpolation
    # data_dur[target] = data_dur[target].interpolate()
    print (station,'|','LEN =', len(data_dur))
    
    # find differene between each time step
    temp = data_dur[target].diff()
    filtered_temp = temp.to_frame(name=target).query('{target} > @threshold_moist'.format(target = target))
    
    # find biggest difference within 3 hours (6 timesteps) == bump
    data_dur['min_'+target] = data_dur[target].rolling(window=raintimestep).min()
    data_dur['max_'+target] = data_dur[target].rolling(window=raintimestep).max()
    data_dur['gap_'+target] = data_dur['max_'+target] - data_dur['min_'+target]
    filtered_temp = data_dur.query('gap_{target} > @threshold_moist'.format(target = target))
    #print (filtered_temp)

    # organizing the date of bump
    bumplist = filtered_temp.index.to_list()        
    for i in range(len(bumplist)):
        # make a list of date with the bump in the form of [year, month, day]
        bumplist[i] = [bumplist[i].year, bumplist[i].month, bumplist[i].day]
    # remove duplicates in bumplist
    bumplist2 = list(set(map(tuple, bumplist)))
    # sort the list and convert tuples into list
    bumplist2 = sorted(bumplist2)
    for i in range(len(bumplist2)):
        bumplist2[i] = list(bumplist2[i])

    print ('Searched ', len(bumplist2), 'days with bump events')
    print (bumplist2)

    return bumplist2



## FCfinder Function

The `FCfinder` function is used to find the field capacity of soil after a given number of days from a specified start date. Field capacity is the amount of soil moisture or water content held in the soil after excess water has drained away and the rate of downward movement has decreased. This capacity is reached within 2–3 days after rain or irrigation in typical soil conditions.

### Parameters

- `data`: A pandas DataFrame that contains the data to be analyzed. The DataFrame should be indexed by 'Timestamp and Station'.
- `station`: An integer that represents the station ID.
- `header`: A list that contains the headers of the data.
- `start_date`: A list that contains the start date of the data search in the format [year, month, day].
- `search_days`: An integer that represents the number of days to search for field capacity.
- `search_range`: An integer that represents the range (in hours) for moving field capacity.
- `search_slope`: A float that represents the slope for searching.
- `threshold_hour`: An integer that represents the threshold hours that remains flat soil moisture behavior to confirm it is a field capacity point.

### Returns

- `data_dur_all`: A list of pandas DataFrames. Each DataFrame contains a time range of interest. The order of the DataFrames corresponds to the order of the station list.

### Functionality

The function works by iterating over a specified number of days from the start date. For each day, it identifies a time range around midnight and checks if the standard deviation of the soil moisture within this time range is less than the specified slope. If it is, the function considers this as a constant moisture behavior and records the date and the mean soil moisture value. The function then plots the soil moisture and the field capacity points, saves the plot as a PNG file, and writes the field capacity records to a CSV file.

The function uses a rolling window to smooth the soil moisture data and uses the standard deviation to identify constant moisture behavior. It also uses matplotlib for plotting and os for file operations.

In [48]:
def FCfinder(data, station, header, start_date, search_days, search_range, search_slope, threshold_hour):
    global searched, precip_search1, precip_search2, precip_delta, precip_max

    # print ('Station',station, 'Date',start_date)
    # important == rl means rolling window. if you don't want, remove it.
    search_target = target
    # search_target = 'rl_'+target
    layer_index = int(target[-1])-1

    data_dur_all = [] # this is test list to save all the data_dur searched
    data_dur = data[data['Station']==int(station)]
    
        
    # make a rolling window column from interpolation
    data_dur['Layer1'] = data_dur['Layer1'].interpolate()
    data_dur['Layer2'] = data_dur['Layer2'].interpolate()
    data_dur['Layer3'] = data_dur['Layer3'].interpolate()
    data_dur['Layer4'] = data_dur['Layer4'].interpolate()
    # data_dur['rl_'+target] = data_dur[target].rolling(window=3).mean()
    

    '''
    From here, we need to find out point where soil moisture value is consistent for 2 hours (threshold_hour)
    And consistent variation for this hour is < 1% (search_slope)
    Searching range will be +- 4hrs from midnight (search_range)
    '''
    # set the database only for the search_target
    target_data = data_dur[search_target]
    
    # find 00:00 hour of everyday within TOI (7 days)
    dayrange = search_days
    
    # this list is for the searched fc dates
    searched_list = []
    
    # searching field capacity (daily)
    for days in range(1,dayrange+1):
        # Going to find the field capacity after given days from the start_date
        st_datetime = dt.datetime(start_date[0], start_date[1], start_date[2], 6, 0, 0)

        # stamp2 is 4 hours before the midnight of the stamp1
        # stamp3 is 4 hours after the midnight of the stamp2
        # add 'days' to search every daily step
        stamp1 = st_datetime + dt.timedelta(days=days)
        stamp1 = stamp1.replace(hour=0, minute=0, second=0)
        
        # step 1. select the time range of interest
        # once you find the time, you will search there is constant soil moisture behavior or not
        # set the searching time
        stamp2, stamp3 = stamp1 - dt.timedelta(hours=search_range), stamp1 + dt.timedelta(hours=search_range)
        # stamp3 = stamp1 + dt.timedelta(hours=search_range)
        
        # step 2. set the searching range
        searched = target_data[(target_data.index > stamp2) & (target_data.index < stamp3)]
        # searched = data_search
            
        try:
            # if standard deviation does not exceeds 'search_slope',
            # this will be regarded as constant moisture behavior
            if stats.stdev(searched) < search_slope:
                searched_list.append([stamp1, stats.mean(searched)])
                FC_searched = True
            else:
                FC_searched = False # if there is no constant behavior, FC_searched will be False == no field capacity
        except:
            FC_searched = False # this case is no data in the range, so FC_searched will be False == no field capacity
        
        if FC_searched == True:
            print ('Field Capacity is found for',str(start_date[0])+'-'+str(start_date[1])+'-'+str(start_date[2]),':',searched_list[0][0].strftime('%Y-%m-%d'),round(searched_list[0][1],3))

        
            # list of searched field capacity this value will be only one pair.
            searched_dates = [i[0] for i in searched_list]
            searched_values = [i[1] for i in searched_list]
            
            # if field capacity is found, draw graph
            if len(searched_dates) > 0:
                # draw overlaying graph for the soil moisture and field capacity
                # soil moisture is blue line and field capacity is red dot            
                df_subset = data_dur[(data_dur.index < stamp1+dt.timedelta(days=1)) & (data_dur.index > st_datetime)]
                # print(df_subset)

                fig, ax = plt.subplots(figsize=(12, 5))
                # ax = data_dur.plot(y=target, label = 'Observed values', figsize=(15,5))
                ax = plt.plot(df_subset.index, df_subset[target], label = 'Observed values')

                # ax.set_xlim(stamp2, stamp3)
                plt.scatter(x=searched_dates, y=searched_values, label = 'Moving FC points', marker='s', c='r', s=100)
                plt.legend()
                plt.gcf().autofmt_xdate()
                plt.xlabel('Date')
                plt.ylabel('Soil Moisture (%)')
                plt.title('Field Capacity for '+str(station)+' after '+st_datetime.strftime('%Y-%m-%d'))

                # save graph for the soil moisture and field capacity
                graph_dest = os.getcwd()+'/fc_graphs/'+st_datetime.strftime('%Y-%m-%d')
                if not os.path.exists(graph_dest):
                    os.makedirs(graph_dest)
                plt.savefig(graph_dest+'/'+str(station)+'_'+stamp1.strftime('%Y-%m-%d')+'.png',dpi=600)
                # plt.show()
                print('Graph is saved.')
            
            # save field capacity records in csv form
            data_dur_all.append(data_dur)
            
            
            # save field capacity records in csv form
            # print('searched_list ::',searched_list)
            with  open(os.getcwd()+'/fc_results/'+strtoday+'_fc_record.csv', 'a+') as f:
                for item in searched_list:
                    strdate = item[0].strftime('%Y-%m-%d')
                    sm_list = data_dur[data_dur.index==item[0]].iloc[0,:].values.tolist()[1:]
                    sm_list = [str(round(i,3)) for i in sm_list]
                    sm_list_str = ','.join(map(str, sm_list))
                    f.write(str(station)+','+strdate+','+sm_list_str+'\n')
            
            break


In [30]:
def update_fc():
    filename = os.getcwd()+'/fc_results/'+strtoday+'_fc_record.csv'
    # open the daily fc file & read
    raw_data = pd.read_csv(filename,delimiter=',', parse_dates=['Date'],
                          dtype={'data_soil_moisture1':np.float64,
                                 'data_soil_moisture2':np.float64,
                                 'data_soil_moisture3':np.float64,
                                 'data_soil_moisture4':np.float64},
                          na_values=['Invalid data']
                          )
    raw_data = raw_data.dropna()    # drop the rows if there is NaN values
    raw_data.drop_duplicates(inplace=True)  # remove duplicate rows
    raw_data.to_csv(filename,index=False)   # export the data to csv file    
    raw_data = raw_data.sort_values(by=['Station','Date'])  # sort the data by station and date
    last_date = raw_data.groupby('Station').tail(1) # get the last date of the data

    # open the 00_Current_FC.csv file & read
    fc_data = pd.read_csv(os.getcwd()+'/00_Current_FieldCapacity.csv',delimiter=',', parse_dates=['Date'],
                          dtype={'data_soil_moisture1':np.float64,
                                 'data_soil_moisture2':np.float64,
                                 'data_soil_moisture3':np.float64,
                                 'data_soil_moisture4':np.float64},
                          na_values=['Invalid data']
                          )
    fc_data = pd.concat([fc_data,raw_data]) # concat fc_data and raw_data
    fc_data = fc_data.drop_duplicates() # remove duplicate rows
    fc_data = fc_data.sort_values(by=['Station','Date']) # sort the data by station and date
    fc_data = fc_data.groupby('Station').tail(1) # get the last date of the data and update the data
    fc_data.to_csv(os.getcwd()+'/00_Current_FieldCapacity.csv',index=False) # export the data to csv file
    

# About the 'Defecit Calc' function
## About
***


The provided Python code defines two functions: `deficit_calc` and `deficit_equation`.

1. `deficit_calc(thedate, rootdpth)`: This function calculates the soil moisture deficit for a given date and root depth across multiple stations. 

   - **Parameters**: 
     - `thedate`: The date of interest.
     - `rootdpth`: The root depth.

   - **Process**: 
     - It loops over a list of stations (`stationlist`), which is not defined in the provided code.
     - For each station, it filters the `raw_data` DataFrame (also not defined in the provided code) to get data for that station and drops any rows with missing values.
     - It then tries to find the soil moisture value (`sm_val`) for the date of interest and the field capacity value (`fc_val`) for the station from a CSV file.
     - It calls the `deficit_equation` function to calculate the deficit and prints the result.
     - If there's no data for a station, it prints a message indicating this.

2. `deficit_equation(ts, station, rootdpth, sm_val, fc_val)`: This function calculates the soil moisture deficit for a given station on a specific date. 

   - **Parameters**: 
     - `ts`: The date.
     - `station`: The station.
     - `rootdpth`: The root depth.
     - `sm_val`: The soil moisture value.
     - `fc_val`: The field capacity value.

   - **Process**: 
     - It calculates the deficit based on the root depth and the difference between the field capacity and the soil moisture value for different soil layers. The calculation is done in centimeters (hence the multiplication by 0.01).
     - It rounds the deficit to two decimal places and returns it.

The commented-out code at the top appears to be an earlier version of the deficit calculation, which is now performed by the `deficit_equation` function.

In [46]:
'''
#! Laura's equation for the defecit calculation
df2.loc[df2['soil_moisture1'] < fcdf.fc1, 'd1']=fcdf.fc1-df2['soil_moisture1']
df2.loc[df2['soil_moisture2'] < fcdf.fc2, 'd2']=fcdf.fc2-df2['soil_moisture2']
df2.loc[df2['soil_moisture3'] < fcdf.fc3, 'd3']=fcdf.fc3-df2['soil_moisture3']
df2.loc[df2['soil_moisture4'] < fcdf.fc4, 'd4']=fcdf.fc4-df2['soil_moisture4']

if root < 6:
    df2['Deficit'] = (df2['d1'])*root
elif root < 12:
    df2['Deficit'] = (df2['d1'])*6 + (df2['d2'])*(root-6)
elif root < 18:
    df2['Deficit'] = (df2['d1'])*6 + (df2['d2'])*6 + (df2['d3'])*(root-12)
else:
    df2['Deficit'] = (df2['d1'])*6 + (df2['d2'])*6 + (df2['d3'])*6 + (df2['d4'])*(root-18)

'''
def deficit_calc (thedate, rootdpth):
    
    # read the soil moisture values from raw_data close to the midnight
    # date of the interest is ts
    # ts = dt.date.today()
    ts = thedate
    # ts = pd.to_datetime(today)
    print('Date of the interest is ',ts)

    # open the file
    f = open(os.getcwd()+'/Deficit_results/'+strtoday+'_deficit_report.csv','w') # create the file
    f.write('Date,Station,Deficit,sm_val1,sm_val2,sm_val3,sm_val4,fc_date,fc_val1,fc_val2,fc_val3,fc_val4\n') # write the header

    for station in stationlist:

        # fliter the raw_data by station and drop the NaN values
        sample_df = raw_data[raw_data['Entity Name']==station].dropna()
        sample_df.set_index('ts', inplace=True)
        
        try:
            # get the midnight value of the date of the interest
            iloc_idx = sample_df.index.get_indexer([ts], method='nearest')  # returns absolute index into df e.g. array([5])
            loc_idx = sample_df.index[iloc_idx]                             # if you want named index
            sm_val = sample_df.iloc[iloc_idx]
            sm_val = sample_df.loc[loc_idx]                                 # as above so below...    
            
            # convert my_val to list
            sm_val = sm_val.values.tolist()[0]
            print('sm_val',sm_val)

            # get the field capacity value from the 00_Current_FieldCapacity.csv
            fcdf = pd.read_csv(os.getcwd()+'/00_Current_FieldCapacity.csv')
            fc_val = fcdf[fcdf['Station']==int(station)].values.tolist()[0]
            print('fc_val',fc_val)
            
            deficit = deficit_equation(ts,station, rootdpth, sm_val, fc_val)
            
            print('>>> Deficit for',ts,'at',int(station),'is',deficit,'& FC date is:',fc_val[1],end='\n\n')
            f.write(str(station)+','+str(ts)+','+str(deficit)+','+','.join(str(x) for x in sm_val[1:])+','+str(fc_val[1])+','+','.join(str(x) for x in fc_val[2:])+'\n')

        
        except:
            txt = 'No data for  '+station
            print(txt.center(60,'='),end='\n\n')
        
    f.close() # close the deficit report file 



def deficit_equation(ts,station, rootdpth, sm_val, fc_val):
    # TODO 0-1
    # TODO make timesseries for the deficit
    if rootdpth < 6:
        deficit = ((fc_val[2]-sm_val[1])*rootdpth)*0.01
    elif rootdpth < 12:
        deficit = ((fc_val[2]-sm_val[1])*6 + (fc_val[3]-sm_val[2])*(rootdpth-6))*0.01
    elif rootdpth < 18:
        deficit = ((fc_val[2]-sm_val[1])*6 + (fc_val[3]-sm_val[2])*6 + (fc_val[4]-sm_val[3])*(rootdpth-12))*0.01
    else:
        deficit = ((fc_val[2]-sm_val[1])*6 + (fc_val[3]-sm_val[2])*6 + (fc_val[4]-sm_val[3])*6 + (fc_val[5]-sm_val[4])*(rootdpth-18))*0.01
    deficit = round(deficit,2)
    return deficit


# Below is implementation Block
- **Station List**: This line prints the list of stations to the console.

- **Target Layer**: This is the target layer for the analysis. The variable `target` is set to 'Layer2'.

- **Rainfinder Variables**: The variables `threshold_moist` and `raintimestep` are set for the `rainfinder` function. `threshold_moist` is the soil moisture difference threshold to decide if a significant rain event has occurred. `raintimestep` is the time step for rain detection, where 1 represents half an hour and 12 represents 6 hours.

- **FCfinder Variables**: The variables `search_days`, `search_range`, `search_slope`, and `threshold_hour` are set for the `FCfinder` function. `search_days` is the number of days to search for field capacity. `search_range` is the number of hours to search around a given timestamp. `search_slope` is the standard deviation threshold for soil moisture behavior. `threshold_hour` is the number of hours with consistent soil moisture behavior.

- **File Creation**: A new CSV file is created in the `fc_results` directory to store the daily field capacity results. The file is named with the current date (`strtoday`). The first line of the file is a header line with the column names.

- **Main Loop**: This is the main loop of the program. For each station in the `stationlist`, it prints the station ID and target layer. It then calls the `rainfinder` function to get a list of dates (`bumplist`) where significant rain events occurred. For each of these dates, it calls the `FCfinder` function to estimate the field capacity.

In [50]:
import warnings
warnings.filterwarnings('ignore')

'''
Implementation Block

set up the variables for running the programs
you will set the destination of the files (directory)
you will set the date of your interest.
usually, it is set to today's date

'''

destination = os.getcwd()+'/'
today_dt = datetime.datetime.now()
today_dt = datetime.datetime(today_dt.year, today_dt.month, today_dt.day, 6, 00)
filename = destination + 'Raw_data/data-'+today_dt.strftime('%Y-%m-%d-%H-%M')+'.csv'
filename = destination + 'Raw_data/Calc_def_test_0725.csv'
# ! filename should be changed to the file name of the downloaded data
foutname = './Clean_data/SM_data'+strtoday+'.csv'

# set the time of interest and location [year, month, day]
#  this is for the data of soil moisture. = for the date of the analysis
# for regular 
ed_date = [dt.date.today().year, dt.date.today().month, dt.date.today().day]
# end date will be 
st_datetime = dt.datetime(ed_date[0], ed_date[1], ed_date[2], 6, 0, 0) - dt.timedelta(days=360)
st_date = [st_datetime.year, st_datetime.month, st_datetime.day]

raw_data, raw_columns, clean_df, stationlist, raw_columns = readraw_data(destination, filename, foutname, st_date, ed_date)



''' 
This is the main part of the program
below section will estimate the field capacity for the given data [Daily basis]

This part should be done per station [stationlist]
'''
# stationlist = ['0111','0112','0114']
print ('Station List ::',stationlist)

# this is the target layer for the analysis. 1=layer 1
# variable set for rainfinder
# *-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= CHECK THE PARAMETERS =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-* #
target = 'Layer2'
threshold_moist = 15    # threshold of soil moisture difference to decide it was huge rain event or not default: 15
raintimestep = 12 # 1 = 1/2 hour // 12 = 6 hours

# variable set for FCfinder
search_days = 14 # days
search_range = 5 # hours
search_slope = 0.07 # unit is fraction for standard deviation of the soil moisture behavior
threshold_hour = 4 # hours with consistent soil moisture behavior
# *-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= CHECK THE PARAMETERS =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-* #


f=open(os.getcwd()+'/fc_results/'+strtoday+'_fc_record.csv', 'w')   # making a file to save the daily field capacity
f.write('Station,Date,Layer1,Layer2,Layer3,Layer4\n')
f.close()

for station in stationlist:
    print('\n\n')
    text = station +' :: '+target
    print (text.center(60,'='))
    bumplist = rainfinder(clean_df, station, raw_columns, threshold_moist, raintimestep)
    for start_date in bumplist:
        FCfinder(clean_df, station, raw_columns, start_date, search_days, search_range, search_slope, threshold_hour)


update_fc() # update the field capacity data
print ('Field Capacity Update is Done.\n\n\n')


'''
this part will calculate the soil defecit based on the field capacity data exported from update_fc function
'''
rootdpth = 24
ts = dt.date.today()
ts = dt.date(2023, 7, 12)
ts = pd.to_datetime(ts)
deficit_calc(ts, rootdpth)

print ('*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*')
print ('Deficit Calculation is Done.\n')
print ("Check the ./fc_results/"+strtoday+"_fc_record.csv folder for the today's field capacity data.")
print ("Check the ./fc_graphs/ folder for the today's field capacity graph")
print ("Check the ./00_Current_FieldCapacity.csv file for the most updated field capacity data.")

File name is :: /Users/DK/Documents/GitHub/SoilMoisture_analysis/Raw_data/Calc_def_test_0725.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103509 entries, 0 to 103508
Data columns (total 6 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   ts                   103509 non-null  datetime64[ns]
 1   Entity Name          103509 non-null  object        
 2   data_soil_moisture1  101602 non-null  float64       
 3   data_soil_moisture2  101602 non-null  float64       
 4   data_soil_moisture3  101602 non-null  float64       
 5   data_soil_moisture4  101602 non-null  float64       
dtypes: datetime64[ns](1), float64(4), object(1)
memory usage: 4.7+ MB
None



::::::::::::::::::::: NaN values info ::::::::::::::::::::::
NaN values are :: 45 out of 7924 
error rate: 0.57 %

Out of range values are :: 1783 out of 7924 
error rate: 22.5 %

:::::::::::::::::::::: Data Describe :::::::::::::::::::::::

Layer1  