In [32]:
import pickle
import numpy as np
import pandas as pd

from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

In [121]:
def _preprocess(df, drop_cols:list=None):
    """Pivot long-format sensor readings into one row of averaged features per sample.

    The input holds one reading per row, identified by its ``type`` (e.g.
    ``temperature``) and ``index`` (which sensor of that type). Each
    ``(type, index)`` pair becomes a column named ``f"{type}_{index}"``;
    gaps are interpolated, and all columns of the same sensor type are
    averaged row-wise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``type``, ``index`` and ``value``. Any other
        columns (``datetime``, ``expt_num``, ``sitename``, ...) are ignored.
        The frame is not modified.
    drop_cols : list, optional
        Names of per-sensor columns (``f"{type}_{index}"``) to discard before
        interpolation and averaging.

    Returns
    -------
    pandas.DataFrame
        Columns ``temp_ave``, ``humid_ave``, ``li_ave``, ``EC``, ``ph`` and
        ``sm_ave``, in that order. A feature with no matching sensor column
        is all-NaN.

    Raises
    ------
    KeyError
        If ``df`` lacks a required column, or if a name in ``drop_cols`` is
        not one of the per-sensor columns.

    Notes
    -----
    Progress information (sensor counts, dropped columns, whether
    interpolation ran) is printed to stdout.
    """
    required = ['type', 'index', 'value']
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise KeyError(f"df is missing required column(s): {missing}")

    # Narrow to the columns that are actually used. The previous code tried to
    # remove the metadata columns with DataFrame.pop(list, axis=1, inplace=True),
    # which is not a valid call: it always raised and the bare `except` hid the
    # failure behind "Already dropped". Selecting a new frame also leaves the
    # caller's DataFrame untouched.
    readings = df.loc[:, required]

    # get unique sensor types (in order of first appearance)
    sensor_types = pd.unique(readings['type'])

    # Report how many sensors of each type there are, using the row count of
    # the solution_pH stream as "rows per single sensor".
    # NOTE(review): this yardstick assumes exactly one pH sensor - confirm.
    rows_per_type = readings['type'].value_counts()
    reference_rows = int(rows_per_type.get('solution_pH', 0))
    for sensor_type in sensor_types:
        n_rows = int(rows_per_type.get(sensor_type, 0))
        if reference_rows:
            print(f"{sensor_type}: {n_rows / reference_rows:.2f} sensors")
        else:
            # Without pH rows there is nothing to normalise by; avoid dividing by zero.
            print(f"{sensor_type}: {n_rows} rows")

    # One Series per physical sensor. Wrapping in Series lets pandas pad
    # sensors that logged fewer readings with NaN instead of failing on
    # unequal lengths.
    sensor_series = {}
    for sensor_type in sensor_types:
        of_type = readings.loc[readings['type'] == sensor_type]
        for sensor_index in pd.unique(of_type['index']):
            values = of_type.loc[of_type['index'] == sensor_index, 'value'].to_numpy()
            sensor_series[f'{sensor_type}_{sensor_index}'] = pd.Series(values)

    sensor_df = pd.DataFrame(sensor_series)

    if drop_cols is not None:
        for col in drop_cols:
            sensor_df.pop(col) # raises KeyError for an unknown column name
            print(f"{col} dropped")

    if sensor_df.isna().any().any(): # check if there are NaN values
        sensor_df = sensor_df.interpolate(axis=0) # fill gaps down each column
        print("NaN values detected, interpolation applied")

    # Output feature -> substring that identifies its sensor columns.
    feature_markers = {
        'temp_ave': 'temperature',
        'humid_ave': 'humidity',
        'li_ave': 'light_intensity',
        'EC': 'solution_EC',
        'ph': 'solution_pH',
        'sm_ave': 'soil_moisture',
    }

    averaged = {}
    for feature, marker in feature_markers.items():
        matching = [col for col in sensor_df.columns if marker in col]
        # A row-wise mean equals the raw value when a single column matches
        # (the old .squeeze() case for EC and pH) and keeps working when
        # several columns match, where .squeeze() would have returned a 2-D
        # frame instead of a Series.
        averaged[feature] = sensor_df[matching].mean(axis=1)

    return pd.DataFrame(averaged)

filename = "model_ph.sav"
# SECURITY: unpickling executes arbitrary code embedded in the file, so only
# load model files that come from a trusted source.
# The context manager closes the file handle once the model is loaded; the
# previous bare open() call left it open.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Smoke test: one random sample with 4 features, just to confirm the loaded
# object can predict. No seed is set, so the value differs on every run.
x = np.random.randn(1,4)
loaded_model.predict(x) # test using random vars

In [124]:
# Reload the same model file through joblib.
import joblib
filename = 'model_ph.sav'
# Rebinds `loaded_model`, replacing the object obtained earlier via pickle.load
# from the same path.
loaded_model = joblib.load(filename)

## deployment

In [125]:
# Column labels for the raw long-format log: each row is a single reading,
# identified by sensor `type` and sensor `index`.
columns=['datetime', 'expt_num', 'sitename', 'type', 'index', 'value']
# names= assigns the labels above; the first displayed row is already a
# reading, so the file evidently has no header line of its own.
df = pd.read_csv("Large_sensors.csv", names=columns)

df.head()

Unnamed: 0,datetime,expt_num,sitename,type,index,value
0,02/14/2023 20:30,0,DLSU-BLAST,temperature,0,28.6
1,02/14/2023 20:30,0,DLSU-BLAST,humidity,0,55.2
2,02/14/2023 20:30,0,DLSU-BLAST,temperature,1,29.2
3,02/14/2023 20:30,0,DLSU-BLAST,humidity,1,62.6
4,02/14/2023 20:30,0,DLSU-BLAST,light_intensity,0,378.75


In [127]:
# Build the averaged feature table. light_intensity_7 is excluded because every
# one of its readings is NaN (654 NaN values out of 654 rows in the per-sensor
# NaN report), so it cannot be interpolated.
sensor_ave_df = _preprocess(df, drop_cols=['light_intensity_7'])
sensor_ave_df.head()

Already dropped
temperature: 2.00 sensors
humidity: 2.00 sensors
light_intensity: 9.00 sensors
soil_moisture: 9.00 sensors
solution_pH: 1.00 sensors
solution_EC: 1.00 sensors
light_intensity_7 dropped
NaN values detected, interpolation applied


Unnamed: 0,temp_ave,humid_ave,li_ave,EC,ph,sm_ave
0,28.9,58.9,401.822917,2.628156,3.238167,0.984269
1,26.85,58.7,403.020833,2.447942,3.237843,0.98288
2,26.65,60.85,409.53125,2.416456,3.237111,0.982002
3,24.7,72.0,0.052083,0.828667,3.23028,0.959288
4,24.8,71.7,432.083333,0.818954,3.237111,0.969119


In [62]:
# Remember the experiment metadata (taken from the first row), then remove
# those columns from the working frame.
metadata_cols = ['expt_num', 'sitename']
if all(col in df.columns for col in metadata_cols) and not df.empty:
    exp_num = df['expt_num'].iloc[0]
    sitename = df['sitename'].iloc[0]
    # DataFrame.pop accepts a single label and no axis/inplace arguments, so
    # the previous pop([...], axis=1, inplace=True) always raised and the bare
    # `except` reported "Already dropped" without removing anything. drop()
    # is the call that removes several columns at once.
    df = df.drop(columns=metadata_cols)
else:
    # Re-running the cell after the columns are gone is a harmless no-op.
    print("Already dropped")

Already dropped


In [72]:
# Sensor types in order of first appearance.
sensor_types = pd.unique(df['type'])
# Row count of the solution_pH stream, used as "rows per single sensor".
# NOTE(review): this assumes exactly one pH sensor - confirm.
DATA_PER_SENSOR = (df['type'] == 'solution_pH').sum()

# The loop variable must not be called `type`: at notebook top level that
# rebinds the builtin for every later cell, so a subsequent type(obj) call
# would fail with "'str' object is not callable".
for sensor_type in sensor_types:
    mask = df['type'] == sensor_type
    print(f"{sensor_type}: {mask.sum() / DATA_PER_SENSOR:.2f} sensors")

temperature: 2.00 sensors
humidity: 2.00 sensors
light_intensity: 9.00 sensors
soil_moisture: 9.00 sensors
solution_pH: 1.00 sensors
solution_EC: 1.00 sensors


In [73]:
# Row count of the solution_pH stream, used as "rows per single sensor".
# NOTE(review): this assumes exactly one pH sensor - confirm.
DATA_PER_SENSOR = (df['type'] == 'solution_pH').sum()

# Avoid naming the loop variable `type`; at notebook top level that would
# shadow the builtin for all later cells.
for sensor_type in sensor_types:
    mask = df['type'] == sensor_type
    print(f"{sensor_type}: {mask.sum() / DATA_PER_SENSOR:.2f} sensors")

temperature: 2.00 sensors
humidity: 2.00 sensors
light_intensity: 9.00 sensors
soil_moisture: 9.00 sensors
solution_pH: 1.00 sensors
solution_EC: 1.00 sensors


In [77]:
# Collect the raw readings of every physical sensor under the key
# "<type>_<index>" and report how many of them are missing.
sensor_dict = {}
for sensor_kind in sensor_types:
    kind_rows = df.loc[df['type'] == sensor_kind]  # all rows for this sensor type

    for sensor_idx in pd.unique(kind_rows['index']):  # one pass per sensor of that type
        readings = kind_rows.loc[kind_rows['index'] == sensor_idx, 'value']
        sensor_dict[f'{sensor_kind}_{sensor_idx}'] = readings.values
        print(f"{sensor_kind}_{sensor_idx}: {readings.isna().sum()} nan values")

temperature_0: 1 nan values
temperature_1: 0 nan values
humidity_0: 1 nan values
humidity_1: 0 nan values
light_intensity_0: 547 nan values
light_intensity_1: 158 nan values
light_intensity_2: 158 nan values
light_intensity_3: 189 nan values
light_intensity_4: 158 nan values
light_intensity_5: 158 nan values
light_intensity_6: 547 nan values
light_intensity_7: 654 nan values
light_intensity_8: 473 nan values
soil_moisture_0: 0 nan values
soil_moisture_1: 0 nan values
soil_moisture_2: 0 nan values
soil_moisture_3: 0 nan values
soil_moisture_4: 0 nan values
soil_moisture_5: 0 nan values
soil_moisture_6: 0 nan values
soil_moisture_7: 0 nan values
soil_moisture_8: 0 nan values
solution_pH_0: 0 nan values
solution_EC_0: 0 nan values


In [79]:
# The per-sensor arrays do not all have the same length, so each one is wrapped
# in a Series first; pandas then aligns them on the row index and pads the
# shorter ones with NaN.
sensor_df = pd.DataFrame(
    {name: pd.Series(values) for name, values in sensor_dict.items()}
)
sensor_df.pop('light_intensity_7') # drop column containing all NaNs

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
649   NaN
650   NaN
651   NaN
652   NaN
653   NaN
Name: light_intensity_7, Length: 654, dtype: float64

In [99]:
# Interpolate down each column, but only when some column actually has a gap.
if sensor_df.isna().any().any():
    sensor_df = sensor_df.interpolate(axis=0)

In [103]:
# average every instance of a sensor type
# Output feature -> substring that identifies its columns in sensor_df.
feature_markers = {
    'temp_ave': 'temperature',
    'humid_ave': 'humidity',
    'li_ave': 'light_intensity',
    'EC': 'solution_EC',
    'ph': 'solution_pH',
    'sm_ave': 'soil_moisture',
}
ave_cols = list(feature_markers)

ave_dict = {}
for feature, marker in feature_markers.items():
    matching_cols = [col for col in sensor_df.columns if marker in col]
    # Row-wise mean across the matching columns. For EC and pH (one column
    # each here) this equals the raw reading, and unlike the earlier
    # .squeeze() it still yields a Series if a second probe is ever added.
    ave_dict[feature] = sensor_df[matching_cols].mean(axis=1)

ave_df = pd.DataFrame.from_dict(ave_dict)
ave_df.head()

Unnamed: 0,temp_ave,humid_ave,li_ave,EC,ph,sm_ave
0,28.9,58.9,401.822917,2.628156,3.238167,0.984269
1,26.85,58.7,403.020833,2.447942,3.237843,0.98288
2,26.65,60.85,409.53125,2.416456,3.237111,0.982002
3,24.7,72.0,0.052083,0.828667,3.23028,0.959288
4,24.8,71.7,432.083333,0.818954,3.237111,0.969119


In [108]:
# Model inputs: the first four columns of ave_df by position, i.e. temp_ave,
# humid_ave, li_ave and EC. The remaining columns (ph, sm_ave) are left out.
X = ave_df.iloc[:,:4].values
X.shape

(654, 4)

In [112]:
# Predict from the last row of X; reshape(-1, 4) turns that 1-D row into a
# single-row 2-D array.
loaded_model.predict(X[-1].reshape(-1,4))

array([3.59734133])

In [114]:
# Predict from the first four rows of X (temp, humid, li and EC features).
# NOTE(review): the earlier comment described these as the "latest" readings,
# but X[0:4] selects the first rows; the most recent four would be X[-4:].
# Confirm which was intended.
loaded_model.predict(X[0:4].reshape(-1,4))

array([3.32140264, 3.33826528, 3.3486016 , 3.33006431])