In [1]:
#general libraries
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

#plotting
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Input/output locations, all expressed as plain strings ending in '/' so that
# file names can be appended to them directly.
from pathlib import Path

path_cwd = Path.cwd()
path_input = f"{path_cwd}/Data_input/"
path_output = f"{path_cwd}/Data_output/"

# Two directory levels above the working directory; the analysis project's
# input folder is addressed relative to it.
path_up2 = str(path_cwd.parent.parent)
path_analysis = f"{path_up2}/SM_analysis/SM_2021_A/Data_input/"

In [3]:
# Put the notebook's ./Functions folder on the module search path so the
# helper module kept there can be imported by later cells.
import sys

sys.path.append(f"{path_cwd}/Functions")

In [4]:
# Read the run configuration (TOML). The file is opened in binary mode ('rb')
# and handed to tomli for parsing.
import tomli

with open(path_input + "config_2021.toml", "rb") as f:
    d = tomli.load(f)

# One (file name, tree, sensor options) tuple per entry of d['Filename'];
# the three config lists are read position by position.
inputfiles = []
for idx, filename in enumerate(d['Filename']):
    inputfiles.append((filename, d['Tree'][idx], d['sensor_type'][idx]))

In [5]:
from functions import read_sensor, fill_within

# Read every configured sensor file, fill it with fill_within, collect the
# site frames needed for the regression step and plot filled vs. original.
#
# name_sites / df_sites: parallel lists holding the label ('<tree>_us' or
# '<tree>_ls') and the filled dataframe of every sensor whose Site is 1 or 2.
name_sites = []
df_sites = []

for name, tree, options in inputfiles:
    sensor_type = options['Type']

    if sensor_type == 1:
        sm = read_sensor(path_input, 'tdr', name, d['column_names']['tdr_cols_1'], sensor_type)
        # Type-1 readings are also exported unfilled, under the '_gt' suffix.
        sm.to_csv(path_output + 'SM_2021_' + tree + '_gt' + '.csv', index=True, header=True)
        #sm.to_csv(path_analysis+tree+'_gt'+'.csv',index=True,header=True)
    elif sensor_type == 3:
        sm = read_sensor(path_input, 'zentra', name, d['column_names']['zentra_cols_3'], sensor_type)
    else:
        # Without this branch an unsupported type either raised NameError
        # (first iteration) or silently re-used the previous iteration's `sm`,
        # which was then filled, plotted and stored under the wrong tree name.
        raise ValueError(
            f"Unsupported sensor Type {sensor_type!r} for file {name!r} "
            f"(tree {tree!r}); expected 1 (tdr) or 3 (zentra)."
        )

    # presumably fills gaps inside each series — see functions.fill_within
    sm_filled = fill_within(sm)

    # Only Site 1 and Site 2 take part in the regression between sites; any
    # other Site code (the original note calls it "GT") is plotted but not
    # collected.
    if options['Site'] == 1:
        name_sites.append(tree + '_us')
        df_sites.append(sm_filled)
    elif options['Site'] == 2:
        name_sites.append(tree + '_ls')
        df_sites.append(sm_filled)

    # Plot each sensor twice, filled ("_imp") directly above original ("_o"):
    # rows 1-2 -> S1, rows 3-4 -> S2, rows 5-6 -> S3.
    fig = make_subplots(rows=6, cols=1)
    for k, sensor in enumerate(('S1', 'S2', 'S3')):
        label = f"Sensor{k + 1}"
        # add_trace(row=, col=) is the supported replacement for the
        # deprecated append_trace.
        fig.add_trace(
            go.Scatter(x=sm_filled.index, y=sm_filled[sensor], name=label + "_imp"),
            row=2 * k + 1, col=1,
        )
        fig.add_trace(
            go.Scatter(x=sm.index, y=sm[sensor], name=label + "_o"),
            row=2 * k + 2, col=1,
        )
    fig.update_layout(height=900, width=1800, title_text="smoisture" + '_' + tree)
    fig.show()

In [6]:
# Separate the site frames into those that still contain at least one NaN
# ("incomplete") and those without any ("complete"). Each pair of lists is
# kept parallel: names_*[n] labels *_dfs[n].
sensors = dict(zip(name_sites, df_sites))

incomplete_dfs, names_incomplete = [], []
complete_dfs, names_complete = [], []

for site_name, site_df in sensors.items():
    # First .any() reduces per column, second one across columns.
    has_gaps = site_df.isna().any().any()
    if has_gaps:
        incomplete_dfs.append(site_df)
        names_incomplete.append(site_name)
    else:
        complete_dfs.append(site_df)
        names_complete.append(site_name)


In [7]:
# For every column of every incomplete site, fit a simple linear regression
# against every column of every complete site and remember the donor with the
# highest r2.
#
# Result layout:
#   selected_pairs[incomplete_name][incomplete_col] = {
#       'complete_name': donor site, 'complete_col': donor column,
#       'r2': score on the training rows, 'm': slope, 'b': intercept}
# A column with fewer than two observed values gets no entry.

selected_pairs = {}
for incomplete_df, name_i in zip(incomplete_dfs, names_incomplete):
    selected_pairs[name_i] = {}

    for col_i in incomplete_df.columns:
        incomplete_col = incomplete_df[col_i]

        # Train on the rows where THIS column was observed. Masking only the
        # rows in which every column is NaN lets a NaN into y as soon as one
        # column has a gap of its own, and LinearRegression.fit rejects NaN.
        observed = incomplete_col.notna()
        if observed.sum() < 2:
            continue  # nothing meaningful to fit
        y = incomplete_col[observed].values

        for complete_df, name_c in zip(complete_dfs, names_complete):
            for col_c in complete_df.columns:
                # assumes complete and incomplete frames share the same index,
                # as boolean indexing with `observed` requires — TODO confirm
                X = complete_df[col_c][observed].values.reshape(-1, 1)

                lr = LinearRegression()
                lr.fit(X, y)
                r2 = r2_score(y, lr.predict(X))

                # Always keep the first candidate, then replace it only on a
                # strictly better r2 (ties keep the earlier donor). This
                # guarantees a stored entry never carries None for
                # 'complete_col', 'm' or 'b'.
                best = selected_pairs[name_i].get(col_i)
                if best is None or r2 > best['r2']:
                    selected_pairs[name_i][col_i] = {
                        'complete_name': name_c,
                        'complete_col': col_c,
                        'r2': r2,
                        'm': lr.coef_[0],
                        'b': lr.intercept_,
                    }

# # Print the selected pairs with the highest r2
# for name_i, pairs in selected_pairs.items():
#     for col_i, pair_info in pairs.items():
#         print(name_i, col_i, pair_info['complete_name'],pair_info['complete_col'],pair_info['r2'], pair_info['m'], pair_info['b'])

In [8]:
# Fill the gaps of every incomplete site with the prediction of its best
# regression: value = m * donor_column + b.

filled_data = {}  # incomplete site name -> gap-filled copy of its dataframe

# Name -> dataframe lookup for the donors (avoids a list search per column).
complete_by_name = dict(zip(names_complete, complete_dfs))

for name_i, incomplete_df in zip(names_incomplete, incomplete_dfs):
    filled_df = incomplete_df.copy()  # the original frame stays untouched

    for col_i in incomplete_df.columns:
        pair_info = selected_pairs[name_i].get(col_i)
        # Skip columns for which no usable regression was stored.
        if pair_info is None or pair_info['m'] is None or pair_info['b'] is None:
            continue

        donor = complete_by_name[pair_info['complete_name']][pair_info['complete_col']]

        # Fill exactly the rows where this column is missing; when gaps always
        # span all columns this is the same set as the all-columns-NaN rows.
        missing = incomplete_df[col_i].isna()

        # Single .loc assignment instead of chained indexing
        # (df[col][mask] = ...), which warns and does not write through under
        # pandas Copy-on-Write.
        # assumes donor and target share the same index — TODO confirm
        filled_df.loc[missing, col_i] = pair_info['m'] * donor[missing] + pair_info['b']

    filled_data[name_i] = filled_df



In [9]:
# Final export: one CSV per site in the output folder, named
# SM_2021_<site name>.csv. Complete sites are written as they are, incomplete
# sites from their regression-filled version.
for name, complete_df in zip(names_complete, complete_dfs):
    complete_df.to_csv(f"{path_output}SM_2021_{name}.csv", index=True, header=True)
    #complete_df.to_csv(path_analysis+'SM_2021'+name+'.csv',index=True,header=True)

for name in names_incomplete:
    filled_data[name].to_csv(f"{path_output}SM_2021_{name}.csv", index=True, header=True)
    #filled_data[name].to_csv(path_analysis+'SM_2021'+name+'.csv',index=True,header=True)

In [10]:
# Plot the regression-filled series of every incomplete site, one subplot per
# sensor column (S1, S2, S3 from top to bottom).
for name_i, filled_df in filled_data.items():
    fig = make_subplots(rows=3, cols=1)
    for row, sensor in enumerate(('S1', 'S2', 'S3'), start=1):
        # add_trace(row=, col=) is the supported replacement for the
        # deprecated append_trace; plotting only reads the frame, so no copy
        # of it is needed.
        fig.add_trace(
            go.Scatter(x=filled_df.index, y=filled_df[sensor], name=f"Sensor{row}_imp"),
            row=row, col=1,
        )

    fig.update_layout(height=900, width=1800, title_text="smoisture" + '_' + name_i)
    fig.show()