In [1]:
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import miceforest as mf

In [2]:
raw_path = '../../../data/raw/river/arno.csv'
df = pd.read_csv(raw_path)

# Some of our features are missing loads of datapoints. The thinking here:
# if the amount of rainfall at a certain location hasn't been measured in
# 13 years, we should not expect it to suddenly come into play for new
# predictions, so drop these "legacy" features.
legacy_rainfall = [
    'Rainfall_Vernio', 'Rainfall_Stia', 'Rainfall_Consuma', 'Rainfall_Incisa',
    'Rainfall_Montevarchi', 'Rainfall_S_Savino', 'Rainfall_Laterina',
    'Rainfall_Bibbiena', 'Rainfall_Camaldoli'
]
df = df.drop(columns=legacy_rainfall)

# dates are stored as DD/MM/YYYY; parse them with an explicit format
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# We have temperature data going back to 2000, but none of the rainfall data
# starts until 2004, and temperature data only goes through 2017. Keep only
# the rows strictly between the two bounds.
# Both bounds are applied in a single mask and the result is copied, so the
# column assignment below writes to an independent frame rather than to a
# slice of the unfiltered data (which triggers SettingWithCopyWarning).
in_window = (
    (df['Date'] > datetime(2003, 12, 31))
    & (df['Date'] < datetime(2017, 3, 9))
)
df = df.loc[in_window].copy()

# there are still a few 0s in the target that should be null; nullify them
df['Hydrometry_Nave_di_Rosano'] = df['Hydrometry_Nave_di_Rosano'].replace(0, float('nan'))

# Use multiple imputation to do single imputation (only dataset 0 is kept).
# The Date column is not a valid datatype for the imputer, so it is set
# aside here, excluded from everything passed to miceforest, and re-attached
# once imputation is done.
dates = df['Date']
kernel = mf.MultipleImputedKernel(
    data=df.drop(columns='Date'),
    save_all_iterations=True,
    random_state=143
)
kernel.mice(3, verbose=True)
df = kernel.impute_new_data(df.drop(columns='Date')).complete_data(0)

# Re-attach the dates by position instead of by index label: if the imputed
# frame does not carry the same (filtered, non-contiguous) index labels,
# label alignment would silently fill Date with NaT.
# NOTE(review): assumes miceforest returns rows in input order -- confirm.
df['Date'] = dates.values

Dataset 0
1  | Temperature_Firenze | Hydrometry_Nave_di_Rosano
2  | Temperature_Firenze | Hydrometry_Nave_di_Rosano
3  | Temperature_Firenze | Hydrometry_Nave_di_Rosano
Dataset 1
1  | Temperature_Firenze | Hydrometry_Nave_di_Rosano
2  | Temperature_Firenze | Hydrometry_Nave_di_Rosano
3  | Temperature_Firenze | Hydrometry_Nave_di_Rosano
Dataset 2
1  | Temperature_Firenze | Hydrometry_Nave_di_Rosano
2  | Temperature_Firenze | Hydrometry_Nave_di_Rosano
3  | Temperature_Firenze | Hydrometry_Nave_di_Rosano
Dataset 3
1  | Temperature_Firenze | Hydrometry_Nave_di_Rosano
2  | Temperature_Firenze | Hydrometry_Nave_di_Rosano
3  | Temperature_Firenze | Hydrometry_Nave_di_Rosano
Dataset 4
1  | Temperature_Firenze | Hydrometry_Nave_di_Rosano
2  | Temperature_Firenze | Hydrometry_Nave_di_Rosano
3  | Temperature_Firenze | Hydrometry_Nave_di_Rosano


In [3]:
# center & standardize

# we have to be picky about the date, which is an invalid data type for the
# scaler: scale every column except Date
feature_cols = df.columns.drop('Date')

scaler = StandardScaler()
scaled = scaler.fit_transform(df[feature_cols].values)

# rebuild a frame from the scaled array (fresh 0..n-1 index), then add the
# dates back in; the Date index is reset so it lines up with that new index
df2 = pd.DataFrame(scaled, columns=feature_cols).assign(
    Date=df['Date'].reset_index(drop=True)
)

In [4]:
# write the standardized frame to the clean-data folder, without the row index
clean_path = '../../../data/clean/river/arno.csv'
df2.to_csv(clean_path, index=False)