In [1]:
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import miceforest as mf

In [2]:
# Load the raw Petrignano aquifer measurements.
df = pd.read_csv('../../../data/raw/aquifer/petrignano.csv')

# Dates are in DD/MM/YYYY format; parse them with an explicit format
# instead of relying on inference.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# Most data doesn't begin until January 2009, so keep only rows dated after 2008.
# Take an explicit copy: the assignments below must modify this frame itself
# rather than a slice of the frame returned by read_csv (this is what
# triggers pandas' SettingWithCopyWarning).
df = df[df['Date'] > datetime(2008, 12, 31)].copy()

# Replace 0s that mean NaN with NaN.
# Temperature zeros are only treated as missing after 2015-01-01.
temperature_missing = (
    (df['Temperature_Petrignano'] == 0) & (df['Date'] > datetime(2015, 1, 1))
)
df.loc[temperature_missing, 'Temperature_Petrignano'] = float('nan')
# For these two columns every 0 is treated as missing.
for c in ('Volume_C10_Petrignano', 'Hydrometry_Fiume_Chiascio_Petrignano'):
    df.loc[df[c] == 0, c] = float('nan')

# Use absolute values where needed.
needabs = [
    'Depth_to_Groundwater_P24',
    'Depth_to_Groundwater_P25',
    'Volume_C10_Petrignano'
]
df[needabs] = df[needabs].abs()

# Set the dates aside: 'Date' is an invalid datatype for the imputer,
# so only the remaining columns are imputed and scaled.
d = df['Date']
features = df.drop('Date', axis=1)

# Use multiple imputation to do single imputation: the kernel is fitted with
# a fixed random_state, and only completed dataset 0 is kept.
kernel = mf.MultipleImputedKernel(
    data=features,
    save_all_iterations=True,
    random_state=143
)
kernel.mice(3, verbose=True)
imputed = kernel.impute_new_data(features).complete_data(0)

# Center & standardize every non-date column.
scaler = StandardScaler()
X = scaler.fit_transform(imputed.values)
df = pd.DataFrame(X, columns=imputed.columns)
# The rebuilt frame has a fresh 0..n-1 index, so reset the dates' index to
# line the two up by position before re-attaching them as the last column.
df['Date'] = d.reset_index(drop=True)

Dataset 0
1  | Depth_to_Groundwater_P24 | Depth_to_Groundwater_P25 | Temperature_Petrignano | Volume_C10_Petrignano | Hydrometry_Fiume_Chiascio_Petrignano
2  | Depth_to_Groundwater_P24 | Depth_to_Groundwater_P25 | Temperature_Petrignano | Volume_C10_Petrignano | Hydrometry_Fiume_Chiascio_Petrignano
3  | Depth_to_Groundwater_P24 | Depth_to_Groundwater_P25 | Temperature_Petrignano | Volume_C10_Petrignano | Hydrometry_Fiume_Chiascio_Petrignano
Dataset 1
1  | Depth_to_Groundwater_P24 | Depth_to_Groundwater_P25 | Temperature_Petrignano | Volume_C10_Petrignano | Hydrometry_Fiume_Chiascio_Petrignano
2  | Depth_to_Groundwater_P24 | Depth_to_Groundwater_P25 | Temperature_Petrignano | Volume_C10_Petrignano | Hydrometry_Fiume_Chiascio_Petrignano
3  | Depth_to_Groundwater_P24 | Depth_to_Groundwater_P25 | Temperature_Petrignano | Volume_C10_Petrignano | Hydrometry_Fiume_Chiascio_Petrignano
Dataset 2
1  | Depth_to_Groundwater_P24 | Depth_to_Groundwater_P25 | Temperature_Petrignano | Volume_C10_Petri

In [3]:
# Write the cleaned frame to the clean-data directory; index=False keeps
# the row index out of the CSV.
df.to_csv(index=False, path_or_buf='../../../data/clean/aquifer/petrignano.csv')