In [1]:
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import miceforest as mf

In [2]:
# Load the raw Amiata water-spring readings.
df = pd.read_csv('../../../data/raw/waterspring/amiata.csv')

# Dates are written DD/MM/YYYY; parse them with that exact format rather
# than leaving the format to be guessed.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# The target doesn't begin until 2015, so keep only rows dated after
# 2014-12-31. Take an explicit copy: the later cleaning steps assign into
# this frame, and assigning into a filtered selection of another frame is
# what triggers pandas' SettingWithCopyWarning.
df = df[df['Date'] > datetime(2014, 12, 31)].copy()

# In these flow-rate columns a reading of exactly 0 stands for a missing
# value, so turn every such 0 into NaN.
cols = ['Flow_Rate_Bugnano', 'Flow_Rate_Arbure', 'Flow_Rate_Ermicciolo']
for flow_col in cols:
    zero_rows = df[flow_col].eq(0)
    df.loc[zero_rows, flow_col] = float('nan')

# Columns that should be kept as magnitudes: replace each value by its
# absolute value, all columns in one vectorized assignment.
needabs = [
    'Depth_to_Groundwater_S_Fiora_8',
    'Depth_to_Groundwater_S_Fiora_11bis',
    'Depth_to_Groundwater_David_Lazzaretti',
    'Flow_Rate_Bugnano',
    'Flow_Rate_Arbure',
    'Flow_Rate_Ermicciolo',
    'Flow_Rate_Galleria_Alta',
]
df[needabs] = df[needabs].abs()

# Fill the remaining gaps with miceforest. A multiple-imputation kernel is
# fitted, but only the result of complete_data(0) is kept, so in effect
# this is a single imputation. 'Date' is an invalid datatype for the
# imputer, so it is set aside and left out of everything handed to it.
d = df['Date']
kernel = mf.MultipleImputedKernel(
    data=df.drop(columns='Date'),
    save_all_iterations=True,
    random_state=143,
)
kernel.mice(3, verbose=True)  # presumably 3 iterations — see the cell output

imputed = kernel.impute_new_data(df.drop(columns='Date'))
df = imputed.complete_data(0)

# NOTE(review): this assignment aligns on the index, so it assumes
# complete_data() returns its rows under the same index as the input —
# confirm against the installed miceforest version.
df['Date'] = d

# Center & standardize every column except 'Date'.
features = df.drop(columns='Date')
scaler = StandardScaler()
X = scaler.fit_transform(features.values)

# Rebuild a frame from the scaled array. It gets a fresh default index, so
# the saved dates are re-indexed from 0 as well before being attached.
df = pd.DataFrame(X, columns=features.columns)
df['Date'] = d.reset_index(drop=True)

Dataset 0
1  | Rainfall_Abbadia_S_Salvatore | Rainfall_S_Fiora | Rainfall_Laghetto_Verde | Rainfall_Vetta_Amiata | Depth_to_Groundwater_S_Fiora_8 | Depth_to_Groundwater_S_Fiora_11bis | Depth_to_Groundwater_David_Lazzaretti | Temperature_Abbadia_S_Salvatore | Temperature_Laghetto_Verde | Flow_Rate_Bugnano | Flow_Rate_Arbure | Flow_Rate_Ermicciolo
2  | Rainfall_Abbadia_S_Salvatore | Rainfall_S_Fiora | Rainfall_Laghetto_Verde | Rainfall_Vetta_Amiata | Depth_to_Groundwater_S_Fiora_8 | Depth_to_Groundwater_S_Fiora_11bis | Depth_to_Groundwater_David_Lazzaretti | Temperature_Abbadia_S_Salvatore | Temperature_Laghetto_Verde | Flow_Rate_Bugnano | Flow_Rate_Arbure | Flow_Rate_Ermicciolo
3  | Rainfall_Abbadia_S_Salvatore | Rainfall_S_Fiora | Rainfall_Laghetto_Verde | Rainfall_Vetta_Amiata | Depth_to_Groundwater_S_Fiora_8 | Depth_to_Groundwater_S_Fiora_11bis | Depth_to_Groundwater_David_Lazzaretti | Temperature_Abbadia_S_Salvatore | Temperature_Laghetto_Verde | Flow_Rate_Bugnano | Flow_Rate_Arbure

In [3]:
# Write the processed frame to the 'clean' data directory, leaving the
# index out of the file.
clean_csv = '../../../data/clean/waterspring/amiata.csv'
df.to_csv(clean_csv, index=False)