In [1]:
import pandas as pd
from datetime import datetime
import miceforest as mf
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw Auser aquifer readings.
raw_path = '../../data/raw/aquifer/auser.csv'
df = pd.read_csv(raw_path)

# The Date column is written as DD/MM/YYYY; state the format
# explicitly instead of letting pandas guess day/month order.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# Keep only rows strictly after the cutoff: most data is missing
# until 2006, and one feature doesn't start until 2007.
cutoff = datetime(2007, 4, 5)
is_recent = df['Date'] > cutoff
df = df[is_recent]

# These features are missing too many values to be worth
# imputing, so they are removed outright.
sparse_columns = [
    'Depth_to_Groundwater_DIEC',
    'Depth_to_Groundwater_PAG',
    'Temperature_Ponte_a_Moriano',
    'Volume_CSA',
    'Volume_CSAL',
    'Hydrometry_Piaggione',
]
df = df.drop(columns=sparse_columns)

# In these columns a 0 stands for "no reading"; turn it into NaN
# so it is treated as missing rather than as a real measurement.
zero_coded = ['Depth_to_Groundwater_LT2', 'Depth_to_Groundwater_SAL', 'Depth_to_Groundwater_CoS']
df = df.assign(**{name: df[name].replace(0, float('nan')) for name in zero_coded})

# ACEA specified that only the absolute values should be used,
# so strip the sign from every column except Date.
df = df.assign(**{name: df[name].abs() for name in df.columns if name != 'Date'})

# The Date column is not a datatype the imputer should see, so it
# is set aside here and re-attached once imputation is done.
dates = df['Date']

# Fit the imputation kernel on every column except Date, running
# 3 MICE iterations; the fixed random_state pins the seed.
kernel = mf.MultipleImputedKernel(
    data=df.drop(columns='Date'),
    save_all_iterations=True,
    random_state=143,
)
kernel.mice(3, verbose=True)

# Multiple imputation is used purely as a single imputer: the
# fitted kernel imputes the same frame again and only completed
# dataset 0 is kept.
imputed = kernel.impute_new_data(df.drop(columns='Date'))
df = imputed.complete_data(0)

# Plain Series assignment, so pandas aligns the dates on the index.
df['Date'] = dates

Dataset 0
1  | Rainfall_Monte_Serra | Rainfall_Piaggione | Depth_to_Groundwater_LT2 | Depth_to_Groundwater_SAL | Depth_to_Groundwater_CoS | Hydrometry_Monte_S_Quirico
2  | Rainfall_Monte_Serra | Rainfall_Piaggione | Depth_to_Groundwater_LT2 | Depth_to_Groundwater_SAL | Depth_to_Groundwater_CoS | Hydrometry_Monte_S_Quirico
3  | Rainfall_Monte_Serra | Rainfall_Piaggione | Depth_to_Groundwater_LT2 | Depth_to_Groundwater_SAL | Depth_to_Groundwater_CoS | Hydrometry_Monte_S_Quirico
Dataset 1
1  | Rainfall_Monte_Serra | Rainfall_Piaggione | Depth_to_Groundwater_LT2 | Depth_to_Groundwater_SAL | Depth_to_Groundwater_CoS | Hydrometry_Monte_S_Quirico
2  | Rainfall_Monte_Serra | Rainfall_Piaggione | Depth_to_Groundwater_LT2 | Depth_to_Groundwater_SAL | Depth_to_Groundwater_CoS | Hydrometry_Monte_S_Quirico
3  | Rainfall_Monte_Serra | Rainfall_Piaggione | Depth_to_Groundwater_LT2 | Depth_to_Groundwater_SAL | Depth_to_Groundwater_CoS | Hydrometry_Monte_S_Quirico
Dataset 2
1  | Rainfall_Monte_Serra | 

In [3]:
# Center and standardize every feature.

# The Date column is not a valid input for the scaler, so it is
# split off first and only the remaining columns are scaled.
features = df.drop(columns='Date')

scaler = StandardScaler()
scaled = scaler.fit_transform(features.values)

# Rebuild a frame from the scaled array (fresh 0..n-1 index) and
# re-attach the dates, re-indexed the same way so the rows line up.
df2 = pd.DataFrame(scaled, columns=features.columns)
df2 = df2.assign(Date=df['Date'].reset_index(drop=True))

In [4]:
# Write the standardized frame to the clean-data folder; index=False
# keeps the positional row index out of the CSV.
clean_path = '../../data/clean/aquifer/auser.csv'
df2.to_csv(clean_path, index=False)