In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from fancyimpute import KNN

Using TensorFlow backend.


In [2]:
    # Bounds of a one-day demo window; date_range includes both endpoints.
    min_date = "2018-01-01 00:00:00"
    max_date = "2018-01-02 00:00:00"
    # Lowercase "h" is the hourly alias: the uppercase "H" spelling is
    # deprecated in recent pandas releases, while "h" parses on old and new.
    date_range = pd.date_range(start=min_date, end=max_date, freq="h")

In [3]:
# Inspect the generated hourly index (both endpoints are included).
date_range

DatetimeIndex(['2018-01-01 00:00:00', '2018-01-01 01:00:00',
               '2018-01-01 02:00:00', '2018-01-01 03:00:00',
               '2018-01-01 04:00:00', '2018-01-01 05:00:00',
               '2018-01-01 06:00:00', '2018-01-01 07:00:00',
               '2018-01-01 08:00:00', '2018-01-01 09:00:00',
               '2018-01-01 10:00:00', '2018-01-01 11:00:00',
               '2018-01-01 12:00:00', '2018-01-01 13:00:00',
               '2018-01-01 14:00:00', '2018-01-01 15:00:00',
               '2018-01-01 16:00:00', '2018-01-01 17:00:00',
               '2018-01-01 18:00:00', '2018-01-01 19:00:00',
               '2018-01-01 20:00:00', '2018-01-01 21:00:00',
               '2018-01-01 22:00:00', '2018-01-01 23:00:00',
               '2018-01-02 00:00:00'],
              dtype='datetime64[ns]', freq='H')

In [4]:
# Wrap the hourly index in a one-column frame so that a site_id column can
# be attached to it later.
date_range = pd.DataFrame(dict(timestamp=date_range))
# Empty accumulator that only carries the target column layout.
weather_imputed = pd.DataFrame(
    columns=["timestamp", "site_id"],
)

In [5]:
# Create perfect timeline without missing hours
# One copy of the hourly frame per site, stacked with a single concat.
# DataFrame.append no longer exists in pandas >= 2.0, and rebuilding the
# result from scratch (instead of growing the previous value) keeps this
# cell idempotent when it is re-run. `assign` leaves `date_range` untouched.
weather_imputed = pd.concat(
    [date_range.assign(site_id=site) for site in range(2)]
)

In [6]:
# Inspect the stacked per-site timeline (columns: timestamp, site_id).
weather_imputed

Unnamed: 0,timestamp,site_id
0,2018-01-01 00:00:00,0
1,2018-01-01 01:00:00,0
2,2018-01-01 02:00:00,0
3,2018-01-01 03:00:00,0
4,2018-01-01 04:00:00,0
5,2018-01-01 05:00:00,0
6,2018-01-01 06:00:00,0
7,2018-01-01 07:00:00,0
8,2018-01-01 08:00:00,0
9,2018-01-01 09:00:00,0


In [10]:

def reduce_mem_usage(df, verbose=True):
    """
    Takes a dataframe as argument and adjusts the datatypes of the respective
    columns to reduce memory allocation.

    Every int16/int32/int64 and float16/float32/float64 column is cast to the
    narrowest dtype of the same kind whose representable range contains the
    column's minimum and maximum (limits inclusive). Columns of any other
    dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the resulting size in Mb and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with downcast columns.

    Notes
    -----
    The range check only looks at min/max. float16 keeps roughly three
    significant decimal digits, so float columns that fit its range are
    stored with reduced precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates ordered from narrowest to widest; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's limit is
                # still representable (e.g. 127 fits in int8).
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate matches when min/max are NaN (empty column);
            # the column then keeps its current dtype.
        else:
            # float64 is the fallback; NaN or infinite min/max fail every
            # comparison below and therefore end up here.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    # A frame that reports zero bytes has nothing to reduce; avoid 0 / 0.
    if start_mem:
        reduced_mem = 100 * (start_mem - end_mem) / start_mem
    else:
        reduced_mem = 0.0
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'
              .format(end_mem, reduced_mem))
    return df

In [11]:
def impute_weather_data(data_frame):
    """
    Fill gaps in hourly weather data with KNN imputation.

    Builds a complete hourly timeline between the earliest and the latest
    timestamp found in ``data_frame`` for every distinct ``site_id``,
    left-joins the existing rows onto that timeline and imputes every missing
    value with ``KNN(5)`` on standard-scaled features.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain the columns ``timestamp`` and ``site_id``. Every column
        is passed through the scaler and the imputer, so the remaining
        columns presumably need to be numeric. The ``timestamp`` column of
        the passed-in frame is converted to datetime in place.

    Returns
    -------
    pandas.DataFrame
        One row per (site, hour) combination. ``timestamp`` and ``site_id``
        come from the generated timeline; all other columns hold the
        original values where present and imputed values elsewhere.

    Raises
    ------
    ValueError
        If ``data_frame`` contains no valid timestamp, so no timeline can be
        built.
    """
    data_frame["timestamp"] = pd.to_datetime(data_frame["timestamp"])
    # min()/max() skip missing timestamps; they return NaT only when there
    # is no valid timestamp at all.
    min_date = data_frame["timestamp"].min()
    max_date = data_frame["timestamp"].max()
    if pd.isna(min_date):
        raise ValueError("data_frame contains no valid timestamps to build a timeline from")

    hours = pd.DataFrame(
        {"timestamp": pd.date_range(start=min_date, end=max_date, freq="h")}
    )

    # Create perfect timeline without missing hours.
    # A single concat replaces the per-site DataFrame.append loop (append no
    # longer exists in pandas >= 2.0) and keeps the real dtypes of both
    # columns instead of inheriting object dtype from an empty seed frame.
    weather_imputed = pd.concat(
        [hours.assign(site_id=site) for site in data_frame["site_id"].unique()],
        ignore_index=True,
    )

    # Join with existing weather data
    weather_imputed = weather_imputed.merge(
        data_frame, on=["site_id", "timestamp"], how="left"
    )

    # Preserve the key columns before transforming
    weather_cols = weather_imputed.columns.values
    weather_timestamp = weather_imputed["timestamp"]
    weather_site_ids = weather_imputed["site_id"]

    # Scale data for KNN.
    # The timestamp becomes "seconds before the last observation". Measuring
    # against max_date instead of the wall clock keeps runs reproducible;
    # standard scaling removes the constant offset either way.
    seconds_before_end = (max_date - weather_timestamp).dt.total_seconds()
    features = weather_imputed.assign(timestamp=seconds_before_end)
    scaler = StandardScaler()
    scaled = scaler.fit_transform(features)

    # Impute missing values
    imputed = KNN(5).fit_transform(scaled)

    # Rescale
    restored = scaler.inverse_transform(imputed)

    # Assemble final weather frame; the key columns are restored from the
    # preserved originals rather than from the scaled/unscaled floats.
    weather_final = pd.DataFrame(data=restored, columns=weather_cols)
    weather_final["timestamp"] = weather_timestamp
    weather_final["site_id"] = weather_site_ids

    return weather_final

In [8]:
# Load the raw weather training data; the path is relative to the
# notebook's working directory.
weather = pd.read_csv("../data/raw/weather_train.csv")

In [12]:
# Run the imputation on the raw weather frame.
# NOTE(review): the traceback recorded below mentions `memory_usage`, which
# the `impute_weather_data` defined in this notebook never calls - the saved
# output looks stale; re-run on a fresh kernel to confirm.
impute_weather_data(weather)

AttributeError: 'numpy.ndarray' object has no attribute 'memory_usage'