#### Libraries

In [9]:
#analysis 
import statistics
import numpy as np
import pandas as pd
#plotting
import plotly.express as px
import plotly.graph_objects as go
from plotly.graph_objs import Data
from plotly.subplots import make_subplots

#### Paths and functions

In [10]:
import os
from pathlib import Path

# Directory layout, anchored on the notebook's working directory.
# Apart from path_cwd (a Path), every entry is a plain string; the directory
# strings end in "/" because later cells append file names by concatenation.
path_cwd = Path.cwd()
path_current = str(path_cwd)
path_input = f"{path_current}/Data_input/"
path_LATEST = str(path_cwd.parents[1])  # grandparent of the working directory (two levels up)
path_plots_ts = f"{path_LATEST}/ANALYSIS/Plots/Time_series/"
path_corr_sm = f"{path_current}/Data_output(corrected)/"

import sys

sys.path.append(str(path_cwd)+'/Functions')
import imputation
import regressions
import plotting

#### Column names 
For .csv files in Data_input

In [11]:
# Column names applied to the ZENTRA logger files (Type 1), one triplet per sensor:
# WC = water content (m3/m3), T = temperature (C), EC = saturation extract (mS/cm),
# followed by BAT% = battery percent (%) and BATV = battery voltage (mV).
SOIL_cols = [
    'Date',
    'WC1', 'T1', 'EC1',
    'WC2', 'T2', 'EC2',
    'WC3', 'T3', 'EC3',
    'BAT%', 'BATV',
]

# Column names applied to the TDR file (Type 2); WC1-WC3 are the three sensor readings.
TDR_cols = [
    'Date',
    'WC1', 'WC2', 'WC3',
    'R', 'events', 'power',
]

#### Inputfiles
List of tuples (name, tree, {'Type': numerical_value*}) (*1 or 2 depending on instrument type: 1 = ZENTRA logger, 2 = TDR)

In [12]:
from csv import DictReader, reader

# Read the instrument manifest. The file handle is only needed while the rows
# are being read, so all parsing happens after the `with` block has closed it.
# 'utf-8-sig' drops a leading byte-order mark if the file has one.
with open(path_input+'inputfiles.csv', encoding='utf-8-sig') as read_obj:
    list_of_dict = list(DictReader(read_obj))

# Columns holding text; every other column is treated as a numeric option.
text_columns = ("Filename", "Tree")

# One tuple per manifest row: (filename, tree, {option_name: float_value}).
# float() raises ValueError if any remaining column holds a non-numeric value,
# so this ONLY WORKS when all option columns are numerical.
inputfiles = [
    (
        row["Filename"],
        row["Tree"],
        {key: float(value) for key, value in row.items() if key not in text_columns},
    )
    for row in list_of_dict
]

In [13]:
# Show the parsed manifest so the (filename, tree, options) tuples can be checked by eye.
display(inputfiles)

[('z122.csv', 'DF27US', {'Type': 1.0}),
 ('z123.csv', 'ES51US', {'Type': 1.0}),
 ('z129.csv', 'ES50LS', {'Type': 1.0}),
 ('TDR.csv', 'DF49GT', {'Type': 2.0})]

#### Main

- **Reading**
- **Cleaning**
- **Imputation of missing values (lr between complete and incomplete sensors within instruments)**

In [19]:
def _read_zentra(name, start_date):
    """Read one ZENTRA file and return its water-content columns as floats named S1..S3."""
    soil_gen = pd.read_csv(path_input+name, header=None, skiprows=1, names=SOIL_cols, engine='python')
    # Drop rows whose Date cell contains letters; na=False marks missing cells as
    # "no letters" so the mask is a clean boolean and dropna removes them next.
    has_letters = soil_gen['Date'].str.contains("[a-zA-Z]", na=False)
    soil_gen = soil_gen[~has_letters].dropna(subset=['Date'])
    soil_gen.index = pd.to_datetime(soil_gen['Date'], errors='coerce')
    soil_gen.index = soil_gen.index.floor('H')  # align timestamps to the hour
    soil_gen = soil_gen.loc[start_date:]  # crop everything before start_date
    return (
        soil_gen[['WC1', 'WC2', 'WC3']]
        .astype(float)
        .rename(columns={"WC1": "S1", "WC2": "S2", "WC3": "S3"})
    )


def _read_tdr(name):
    """Read the TDR file and return its water-content columns named S1..S3."""
    soil_gen = pd.read_csv(path_input+name, header=None, skiprows=2, names=TDR_cols, engine='python')
    # Timestamps are written as month/day/year with a 12-hour clock; unparsable cells become NaT.
    soil_gen.index = pd.to_datetime(soil_gen['Date'], format='%m/%d/%Y %I:%M:%S %p', errors='coerce')
    return soil_gen.filter(['WC1', 'WC2', 'WC3']).rename(columns={"WC1": "S1", "WC2": "S2", "WC3": "S3"})


def _add_imputed_columns(soil_m):
    """Attach newS1..newS3, the within-instrument regression fill returned by fill_missing."""
    # Imported lazily: `imputation` is a project-local module found through sys.path.
    from imputation import fill_missing
    # to display or save regression plots change return of both regression and fill missing functions
    soil_m['newS1'], soil_m['newS2'], soil_m['newS3'] = fill_missing(df=soil_m)
    return soil_m


def read_impute_within(inputfiles, start_date='2021-05-01 00:00:00'):
    """Read every instrument file, clean it and fill gaps within each instrument.

    Parameters
    ----------
    inputfiles : list of (name, tree, options) tuples
        `name` is the file name inside `path_input`; `options['Type']` selects
        the reader (1 = ZENTRA, 2 = TDR). `tree` is not used here. Entries with
        any other Type are skipped.
    start_date : str, optional
        First timestamp kept for ZENTRA files. The default keeps the data
        from May 2021 onwards.

    Returns
    -------
    name_list : list of str
        File names of the instruments that were read, in input order.
    soil_m_list : list of DataFrame
        Columns S1..S3 plus the imputed newS1..newS3, one frame per instrument.
    soil_m_original : list of DataFrame
        Independent copies holding only S1..S3 as read from file.
    """
    soil_m_original = []
    soil_m_list = []
    name_list = []
    for name, _tree, options in inputfiles:
        instrument_type = options['Type']
        if instrument_type == 1:  # ZENTRA
            soil_m = _read_zentra(name, start_date)
        elif instrument_type == 2:  # TDR
            soil_m = _read_tdr(name)
        else:
            continue

        # Store a copy, so that adding the newS* columns below cannot leak into
        # the frame kept as the untouched original.
        soil_m_original.append(soil_m.copy())

        # fill missing values WITHIN INSTRUMENT with regression among SENSORS
        soil_m_list.append(_add_imputed_columns(soil_m))
        name_list.append(name)

    return name_list, soil_m_list, soil_m_original

- **Saves output of read_... function into a dictionary with stripped names**

In [20]:
name_list,soil_m_list,soil_m_original=read_impute_within(inputfiles)

#NAMES OF SENSORS W/O .csv
# Cut the extension explicitly: str.rstrip(".csv") removes any trailing '.', 'c', 's'
# or 'v' characters rather than the suffix, which would mangle names ending in those letters.
csv_suffix = ".csv"
dfname=[name[:-len(csv_suffix)] if name.endswith(csv_suffix) else name for name in name_list] #'z122', 'z123', 'z129', 'TDR'

#dictionary of names and imputed values
sensors = dict(zip(dfname,soil_m_list)) #dictionary with key: sensor_name, value: df_output of imputation
sensors_original = dict(zip(dfname,soil_m_original)) #same keys, value: df as read from file

#visually inspect imputed elements of this dict
from plotting import plotting_dict
fig_dict=plotting_dict(dictionary=sensors,col1='newS1',col2='newS2',col3='newS3') #change function if size of dict changes
# plain loop: fig.show() returns None, so a list comprehension would only echo a list of None
for fig in fig_dict:
    fig.show()


[None, None, None, None]

- **use the above for:**
    1. imputation among instruments
    2. General output 

1. **Imputation among instruments:** 

    IF all 3 sensors of an instrument (or its 1 or 2 sensors, in the case of 21_2022 and 42_2022) have missing values and another instrument shows a similar pattern:
    RUN THE CODE BELOW; otherwise skip it.

    User needs to specify this by visually inspecting the above data:
    - define instruments and single sensor to impute as location_cliff=[incomplete['sensor'],complete['sensor']]
    - once imputed apply the above function to a copy with only 3 sensors and no NaNs

In [16]:
# Always raises AssertionError. Presumably a deliberate stop so that "Run All" halts
# before the optional among-instrument imputation below — TODO confirm; run the
# following cells by hand when they are needed.
assert False

AssertionError: 

In [None]:
#select instruments and sensors used for imputing:
# [incomplete instrument[sensor to replace], complete instrument[sensor used as reference]] IF ORDER CHANGES ALL FUNCTIONS NEED TO BE MODIFIED
upper_site=[sensors['z122']['newS1'],sensors['z123']['newS1']] #this is specific and defined by me visually

#function to fill values of incomplete with complete
from imputation import fill_among

val122=fill_among(upper_site)
sensors['z122']['newS1']=val122

#reapply function to fill the other sensors within incomplete instrument
# The imputed columns take over the S1..S3 names; fillna returns a new frame, so
# sensors['z122'] itself is left untouched by this step.
z122_copy_imp=(sensors['z122'].drop(columns=['S1','S2','S3']).rename(columns={'newS1': 'S1', 'newS2': 'S2', 'newS3': 'S3'})).fillna(0) #need to replace nans with zero for my original function to work
from imputation import fill_missing

sensors['z122']['newS1'],sensors['z122']['newS2'],sensors['z122']['newS3']=fill_missing(df=z122_copy_imp)

#visually inspect
fig_dict=plotting_dict(dictionary=sensors,col1='newS1',col2='newS2',col3='newS3')
# plain loop: fig.show() returns None, so a list comprehension would only echo a list of None
for fig in fig_dict:
    fig.show()

NameError: name 'sensors' is not defined

2. **General output**
    - Output timeseries plot 
    - Output files 

In [None]:
#final output frames: keep only the imputed columns and give them the plain S1..S3 names
#(sensors_original is exported alongside, unchanged)
imputed_to_plain = {'newS1': 'S1', 'newS2': 'S2', 'newS3': 'S3'}
sensors_output = {
    key: frame.drop(columns=['S1','S2','S3']).rename(columns=imputed_to_plain)
    for key, frame in sensors.items()
}

#OUTPUT
#paths
path_corr_sm = path_current + "/Data_output(corrected)/" #to data output
path_sf_in_2021= path_LATEST + "/SAPFLOW/SF_2021/Data_input/" #to data input of Sapflow
path_plots= path_LATEST + "/ANALYSIS/Plots_2021/Time_series/" #to analysis
path_analysis= path_LATEST + "/ANALYSIS/CORRECTEDLOOP_2021/Data_input/"

#time series plots: one html file per instrument, before and after imputation
fig_original=plotting_dict(dictionary=sensors_original,col1='S1',col2='S2',col3='S3')
for key, fig in zip(sensors_original, fig_original):
    fig.write_html(path_plots + key +'_original'+'.html')

fig_complete=plotting_dict(dictionary=sensors_output,col1='S1',col2='S2',col3='S3')
for key, fig in zip(sensors_output, fig_complete):
    fig.write_html(path_plots + key +'_complete'+'.html')

#files: every completed frame is written to both output folders
output_folders = [path_corr_sm, path_sf_in_2021]
for key in sensors_output:
    for path_ in output_folders:
        sensors_output[key].to_csv(path_+key+'.csv' , index= True)





KeyboardInterrupt



In [None]:
# plain loop: fig.show() returns None, so a list comprehension would only echo a list of None
for fig in fig_complete:
    fig.show()

In [None]:
# Always raises AssertionError. Presumably a deliberate stop so that "Run All" does not
# continue into the experimental cells below — TODO confirm.
assert(False)

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# NOTE(review): generic ARIMA forecasting template. `data`, `plt` and `sns` are not
# defined or imported anywhere in the visible notebook, so this cell raises NameError
# as written. Bind `data` to the series to model and import the plotting libraries
# (presumably matplotlib.pyplot as plt and seaborn as sns — TODO confirm) before running.

# Fit ARIMA model
model = ARIMA(data, order=(1, 1, 1))
results = model.fit()

# Make predictions
# 12 steps, starting at the first position after the end of `data`
forecast = results.predict(start=len(data), end=len(data) + 11)

# Plot the forecast
plt.figure(figsize=(10, 6))
sns.lineplot(data=data)
sns.lineplot(x=forecast.index, y=forecast)
plt.title('Time Series Forecast')
plt.xlabel('Year')
plt.ylabel('Value')
plt.show()

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Print each original (non-imputed) frame; the ARIMA fit itself is still commented out
for original_frame in sensors_original.values():
    print(original_frame)
    #model = ARIMA(original_frame, order=(0, 1, 0))
#results = model.fit()


In [None]:
#example graph for a water content in a 30(len)*10(width)*5(depth) cm3 crack = 0.0015 m3 in L (1L=1dm3 & 1000dm3= 1m3) i.e val in cm3 divide by 1e3 to get dm3
# Crack volume assigned to sensors 1-3 of each instrument, in dm3 (= L): length*width*depth in cm3, divided by 1e3.
# Multiplying a water content (m3/m3) by these volumes gives litres per hourly reading.
crack_volumes_dm3 = {
    'z122': ((50*50*5)/1e3, (100*50*5)/1e3, (1000*50*5)/1e3),
    'z123': ((100*50*5)/1e3, (1000*50*5)/1e3, (60*50*5)/1e3),
    'z129': ((30*10*5)/1e3, (30*5*5)/1e3, (50*10*5)/1e3),
    'TDR': ((1000*1000*10)/1e3, (1000*1000*10)/1e3, (1000*1000*10)/1e3),
}

for key, frame in sensors_output.items():
    # An instrument without an entry must not inherit the previous instrument's
    # volumes, so it is skipped explicitly instead of being plotted with wrong values.
    if key not in crack_volumes_dm3:
        print('No crack volumes defined for ' + key + ', skipping')
        continue
    val1, val2, val3 = crack_volumes_dm3[key]

    litres_s1 = frame['S1']*val1
    litres_s2 = frame['S2']*val2
    litres_s3 = frame['S3']*val3

    fig = make_subplots(rows=4, cols=1)
    fig.append_trace(go.Scatter(x=frame.index,y=litres_s1,name="Sensor1 (L)"), row=1, col=1)
    fig.append_trace(go.Scatter(x=frame.index,y=litres_s2,name="Sensor2 (L)"), row=2, col=1)
    fig.append_trace(go.Scatter(x=frame.index,y=litres_s3,name="Sensor3 (L)"), row=3, col=1)
    fig.append_trace(go.Scatter(x=frame.index,y=litres_s1+litres_s2+litres_s3,name="SUM (L)"), row=4, col=1)
    fig.update_layout(height=900, width=1800, title_text="Crack_moisture"+'_'+key)
    fig.show()

- **Normalized values**

In [None]:

# Min-max normalise every column of every instrument to the range [0, 1].
# A new frame is built per instrument: dict.copy() is shallow, so writing columns
# into a copied dict would overwrite the frames held by sensors_output as well.
sensors_normalized = {}
for key, frame in sensors_output.items():
    col_min = frame.min()  # column-wise, NaN ignored
    col_range = frame.max() - col_min
    # a constant column has zero range and comes out as NaN
    sensors_normalized[key] = (frame - col_min) / col_range

#visually inspect
fig_dict=plotting_dict(dictionary=sensors_normalized,col1='S1',col2='S2',col3='S3')
#[fig.show() for fig in fig_dict]

#OUTPUT
path_sf_in_2021= path_LATEST + "/SAPFLOW/SF_2021/Data_input/" #to data input of Sapflow
for key in sensors_normalized:
    for path_ in [path_corr_sm,path_sf_in_2021]:
        sensors_normalized[key].to_csv(path_+key+'_normalized.csv' , index= True)
      

    

In [None]:
# Always raises AssertionError. Presumably a deliberate stop so that "Run All" does not
# continue into the scratch cells of the "Extra" section — TODO confirm.
assert(False)

#### Extra 

In [None]:
from scipy.ndimage import gaussian_filter1d

# Reference result: scipy's 1-D Gaussian filter applied to a short ramp
gaussian_filter1d([1.0, 2.0, 3.0, 4.0, 5.0], sigma=1)

In [None]:
#attempt to do gaussian convolution 
# http://csundergrad.science.uoit.ca/courses/cv-notes/notebooks/03-linear-filtering.html
import numpy as np
import plotly.express as px


x = [1.0, 2.0, 3.0, 4.0, 5.0]


def gauss(x, sigma):
    """Evaluate a normal density at the values of `x` and convolve `x` with a 3-tap kernel.

    Returns a pair `(g_x, result)`: `g_x` is a one-element list holding the
    density evaluated at every value of `x`; `result` is the full-length
    convolution of `x` with the first three density values scaled to sum to 1.

    NOTE(review): the kernel weights come from the density at the first three
    data values, not at offsets around zero, so this is not a centred Gaussian
    smoothing kernel.
    """
    samples = np.array([x])  # shape (1, len(x))
    scale = 1 / (sigma * np.sqrt(2 * np.pi))
    g_x = [scale * ((np.e) ** (-((row ** 2) / (2 * (sigma ** 2))))) for row in samples]  # normal distribution

    density = g_x[0]
    taps = [density[0], density[1], density[2]]
    kernel = taps / np.sum(taps)
    return g_x, np.convolve(samples[0], kernel)

# Run the hand-written version with sigma = 1 and print the convolved values
g_x, conv_ = gauss(x, sigma=1)
print(conv_)
fig = px.line()  # empty figure; the trace below is still disabled
#fig.add_scatter(x=x,y=g_x,name="gauss")

In [None]:
#example trying to permute sensors 
for i,(name,options) in enumerate(inputfiles):
    if options['ts'] == 1:
        
        #read and clean 
        soil_gen=pd.read_csv(path_input+name,header = None, skiprows=1, names=SOIL_cols, engine='python') #reads file
        soil_gen=(soil_gen[~soil_gen['Date'].str.contains("[a-zA-Z]").fillna(False)]).dropna(subset=['Date'])
        soil_gen.index = pd.to_datetime(soil_gen['Date'],errors='coerce')# round and resample to not raise error 
        soil_gen.index=soil_gen.index.floor('H')
        soil_gen=soil_gen.loc['2021-05-01 00:00:00':] #crop from may 
        

        soil_gen['S1'],soil_gen['S2'],soil_gen['S3']=[soil_gen[col].astype(float) for col in ['WC1','WC2','WC3']] #make values float values
        soil_m=soil_gen.filter(['S1','S2','S3']) #df only with soil moisture m3/m3

        import itertools
        perm_=list(itertools.permutations(soil_m.columns,3))
        #print(len(perm_))
        for i in range(len(perm_)):
            for j in range(len(perm_[i])-2):
                print(perm_[i][j],perm_[i][j+1],perm_[i][j+2])
    

In [None]:
#EXAMPLES ON HOW TO USE .LOC

#slice 
#soil_m.loc['2021-05-01 02:00:00':'2021-10-21 11:00:00','S1':'S2'] 

#boolean array 
#soil_m.loc[:,[True,False,True]]

#type(soil_m.loc['2021-05-01 02:00:00',['S1']])

#n_=soil_m.loc[:]!=0
#print(n_.all())

#df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
#     index=['cobra', 'viper', 'sidewinder'],
#     columns=['max_speed', 'shield'])

#notation 
#df.loc[:,'max_speed'] #returns a pd.series 
#df.loc[:,['max_speed']] #returns a data frame

# Alignable index
#df.loc[pd.Index(["cobra", "viper"], name="foo")]
#df.loc[pd.Index(["viper", "cobra"])]

#Alignable series
#df.loc[pd.Series([False, True, False], index=['viper', 'sidewinder', 'cobra'])]

#condition returns a boolean Series with column labels specified
#df.loc[df['shield']>6,'max_speed']