In [1]:
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import miceforest as mf

In [2]:
raw_path = '../../../data/raw/aquifer/luco.csv'
df = pd.read_csv(raw_path)

# The file stores dates as DD/MM/YYYY; parse with an explicit format so that
# day and month are never swapped by format inference.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# Keep only the window in which the target series is available: rows dated
# strictly after 2008-02-20 and strictly before 2019-01-12.
# NOTE(review): the original note says the target "ends on 1/12/19", but the
# upper bound is exclusive, so rows dated 2019-01-12 are dropped -- confirm
# whether that day should be kept.
window_start = datetime(2008, 2, 20)
window_end = datetime(2019, 1, 12)
in_window = (df['Date'] > window_start) & (df['Date'] < window_end)
df = df[in_window]

# These features have too many missing values to be worth imputing,
# so they are removed outright.
too_sparse = [
    'Rainfall_Siena_Poggio_al_Vento',
    'Rainfall_Mensano',
    'Rainfall_Ponte_Orgia',
    'Depth_to_Groundwater_Pozzo_1',
    'Depth_to_Groundwater_Pozzo_3',
    'Depth_to_Groundwater_Pozzo_4',
    'Temperature_Siena_Poggio_al_Vento',
    'Temperature_Mensano',
]
df = df.drop(columns=too_sparse)

# In these columns a 0 inside a given date range stands for a missing reading,
# not a real measurement; replace those zeros with NaN.
# Each entry is (column, cutoff date, direction): a positive direction targets
# zeros dated strictly after the cutoff, anything else targets zeros dated
# strictly before it.
frames = [
    ('Temperature_Pentolina', datetime(2018, 1, 1), -1),
    ('Temperature_Monteroni_Arbia_Biena', datetime(2015, 1, 1), 1)
]
for column, cutoff, direction in frames:
    is_zero = df[column] == 0
    in_range = df['Date'] > cutoff if direction > 0 else df['Date'] < cutoff
    # Assign through the boolean mask itself: this avoids building a filtered
    # copy of the frame just to read its index, and selects exactly the
    # matching rows even if index labels were ever duplicated.
    df.loc[is_zero & in_range, column] = float('nan')

# Discard the sign of these columns: every value is replaced by its
# absolute value.
needabs = [
    'Depth_to_Groundwater_Podere_Casetta',
    'Volume_Pozzo_1',
    'Volume_Pozzo_3',
    'Volume_Pozzo_4'
]
df[needabs] = df[needabs].abs()

# Fill the remaining NaNs with miceforest. The kernel is a multiple-imputation
# model, but only the completed dataset at index 0 is kept, so it effectively
# serves as a single imputer here.
# 'Date' is dropped before fitting because a datetime column is not a valid
# input dtype for the imputer.
kernel = mf.MultipleImputedKernel(
    data=df.drop('Date', axis=1),
    save_all_iterations=True,
    random_state=143  # fixed seed for the imputer
)
# presumably 3 MICE iterations per dataset (the verbose log prints 1..3 under
# each "Dataset k" heading) -- confirm against the miceforest documentation
kernel.mice(3, verbose=True)
# set the dates aside; they are reattached right below and again after scaling
d = df['Date']
# NOTE(review): the kernel was fitted on this very frame; impute_new_data
# re-imputes it with the fitted models instead of reusing the kernel's own
# completed data -- confirm this is intended.
df = kernel.impute_new_data(df.drop('Date', axis=1)).complete_data(0)
# NOTE(review): column assignment aligns on index labels, so this assumes
# complete_data keeps the filtered frame's original index -- confirm.
df['Date'] = d

# Center and scale every non-date column with StandardScaler.
# The scaler returns a bare array, so the frame is rebuilt with the same
# column names; the rebuilt frame gets a fresh 0..n-1 index, hence the date
# series has its index reset before being reattached.
features = df.drop('Date', axis=1)
scaler = StandardScaler()
X = scaler.fit_transform(features.values)
df = pd.DataFrame(X, columns=features.columns)
df['Date'] = d.reset_index(drop=True)

Dataset 0
1  | Rainfall_Simignano | Rainfall_Montalcinello | Rainfall_Monticiano_la_Pineta | Rainfall_Sovicille | Rainfall_Scorgiano | Rainfall_Pentolina | Rainfall_Monteroni_Arbia_Biena | Depth_to_Groundwater_Podere_Casetta | Temperature_Pentolina | Temperature_Monteroni_Arbia_Biena | Volume_Pozzo_1 | Volume_Pozzo_3 | Volume_Pozzo_4
2  | Rainfall_Simignano | Rainfall_Montalcinello | Rainfall_Monticiano_la_Pineta | Rainfall_Sovicille | Rainfall_Scorgiano | Rainfall_Pentolina | Rainfall_Monteroni_Arbia_Biena | Depth_to_Groundwater_Podere_Casetta | Temperature_Pentolina | Temperature_Monteroni_Arbia_Biena | Volume_Pozzo_1 | Volume_Pozzo_3 | Volume_Pozzo_4
3  | Rainfall_Simignano | Rainfall_Montalcinello | Rainfall_Monticiano_la_Pineta | Rainfall_Sovicille | Rainfall_Scorgiano | Rainfall_Pentolina | Rainfall_Monteroni_Arbia_Biena | Depth_to_Groundwater_Podere_Casetta | Temperature_Pentolina | Temperature_Monteroni_Arbia_Biena | Volume_Pozzo_1 | Volume_Pozzo_3 | Volume_Pozzo_4
Dataset 1
1 

In [3]:
# Write the cleaned, imputed and standardized frame to disk without the
# positional index column.
clean_path = '../../../data/clean/aquifer/luco.csv'
df.to_csv(clean_path, index=False)