In [1]:
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import miceforest as mf

In [2]:
# Load the raw Lupa water-spring data, clean the target column, impute the
# missing target values, and standardize every feature column. The cell
# leaves `df` holding the standardized features plus the original `Date`.
df = pd.read_csv('../../../data/raw/waterspring/lupa.csv')

# dates are in format DD/MM/YYYY; parse them with that format explicitly
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# target doesn't begin until Feb, so keep only rows strictly after 2009-02-18.
# .copy() makes the filtered frame independent, so the assignments below do
# not write into a slice of the raw frame (avoids SettingWithCopyWarning).
# reset_index gives a 0..n-1 index so label-based and positional alignment
# agree when `Date` is re-attached at the end of the cell.
df = df[df['Date'] > datetime(2009, 2, 18)].copy().reset_index(drop=True)

# replace 0s that mean NaN with NaN
df.loc[df['Flow_Rate_Lupa'] == 0, 'Flow_Rate_Lupa'] = float('nan')

# use absolute value
df['Flow_Rate_Lupa'] = df['Flow_Rate_Lupa'].abs()

# set the dates aside; they are re-attached after scaling
d = df['Date']

# use multiple imputation to do single imputation;
# the Date column is excluded because the original author notes it is an
# invalid datatype for the imputer
kernel = mf.MultipleImputedKernel(
    data=df.drop(columns='Date'),
    save_all_iterations=True,
    random_state=143
)
kernel.mice(3, verbose=True)

# impute the same feature frame with the fitted kernel and take dataset 0
# as the single completed dataset
imputed = kernel.impute_new_data(df.drop(columns='Date')).complete_data(0)

# center & standardize every feature column
scaler = StandardScaler()
X = scaler.fit_transform(imputed.values)
df = pd.DataFrame(X, columns=imputed.columns)

# both `df` and `d` carry a 0..n-1 index here, so this assignment lines up
# row for row; Date stays the last column, as before
df['Date'] = d

Dataset 0
1  | Flow_Rate_Lupa
2  | Flow_Rate_Lupa
3  | Flow_Rate_Lupa
Dataset 1
1  | Flow_Rate_Lupa
2  | Flow_Rate_Lupa
3  | Flow_Rate_Lupa
Dataset 2
1  | Flow_Rate_Lupa
2  | Flow_Rate_Lupa
3  | Flow_Rate_Lupa
Dataset 3
1  | Flow_Rate_Lupa
2  | Flow_Rate_Lupa
3  | Flow_Rate_Lupa
Dataset 4
1  | Flow_Rate_Lupa
2  | Flow_Rate_Lupa
3  | Flow_Rate_Lupa


In [3]:
# persist the cleaned frame to the clean-data directory, omitting the row index
df.to_csv(path_or_buf='../../../data/clean/lupa.csv', index=False)