In [1]:
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import miceforest as mf

In [2]:
# Load the raw Doganella aquifer readings.
df = pd.read_csv('../../../data/raw/aquifer/doganella.csv')

# Dates are stored as DD/MM/YYYY; parse them with an explicit format so
# day and month can never be swapped by format inference.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# Keep only rows dated after 2012-05-31 (target data does not exist before
# 2012). Take an explicit copy so the column assignments below write to an
# independent frame instead of a slice of the raw one, which would otherwise
# trigger pandas' SettingWithCopyWarning.
df = df[df['Date'] > datetime(2012, 5, 31)].copy()

# Only the absolute value of the targets is used.
targs = [f"Depth_to_Groundwater_Pozzo_{x+1}" for x in range(9)]
df[targs] = df[targs].abs()

# Split the dates off once: they are treated as an invalid datatype for the
# imputer and are not standardized either. `d` keeps the filtered frame's
# index; it is re-attached positionally at the very end.
d = df['Date']
features = df.drop('Date', axis=1)

# Use multiple imputation to do single imputation (only dataset 0 is kept).
# The kernel is given its own copy so that nothing it does to its training
# data can affect the frame that is imputed below.
kernel = mf.MultipleImputedKernel(
    data=features.copy(),
    save_all_iterations=True,
    random_state=143
)
kernel.mice(3, verbose=True)
# NOTE(review): the kernel was fitted on these same rows, so
# `kernel.complete_data(0)` might be sufficient here -- confirm against the
# installed miceforest version before simplifying.
imputed = kernel.impute_new_data(features).complete_data(0)

# Center & standardize every non-date column.
scaler = StandardScaler()
X = scaler.fit_transform(imputed.values)
df = pd.DataFrame(X, columns=imputed.columns)

# The rebuilt frame has a fresh 0..n-1 index, so reset the dates' index to
# attach them by position rather than by the pre-filter row labels.
df['Date'] = d.reset_index(drop=True)

Dataset 0
1  | Rainfall_Monteporzio | Rainfall_Velletri | Depth_to_Groundwater_Pozzo_1 | Depth_to_Groundwater_Pozzo_2 | Depth_to_Groundwater_Pozzo_3 | Depth_to_Groundwater_Pozzo_4 | Depth_to_Groundwater_Pozzo_5 | Depth_to_Groundwater_Pozzo_6 | Depth_to_Groundwater_Pozzo_7 | Depth_to_Groundwater_Pozzo_8 | Depth_to_Groundwater_Pozzo_9 | Volume_Pozzo_1 | Volume_Pozzo_2 | Volume_Pozzo_3 | Volume_Pozzo_4 | Volume_Pozzo_5+6 | Volume_Pozzo_7 | Volume_Pozzo_8 | Volume_Pozzo_9 | Temperature_Monteporzio | Temperature_Velletri
2  | Rainfall_Monteporzio | Rainfall_Velletri | Depth_to_Groundwater_Pozzo_1 | Depth_to_Groundwater_Pozzo_2 | Depth_to_Groundwater_Pozzo_3 | Depth_to_Groundwater_Pozzo_4 | Depth_to_Groundwater_Pozzo_5 | Depth_to_Groundwater_Pozzo_6 | Depth_to_Groundwater_Pozzo_7 | Depth_to_Groundwater_Pozzo_8 | Depth_to_Groundwater_Pozzo_9 | Volume_Pozzo_1 | Volume_Pozzo_2 | Volume_Pozzo_3 | Volume_Pozzo_4 | Volume_Pozzo_5+6 | Volume_Pozzo_7 | Volume_Pozzo_8 | Volume_Pozzo_9 | Temperature_M

In [3]:
# Persist the cleaned frame; the row index is left out of the file.
clean_csv_path = '../../../data/clean/aquifer/doganella.csv'
df.to_csv(clean_csv_path, index=False)