Functions that take in a dataframe, remove outliers, and (optionally) normalize and scale the data.

The functions are set up so that the data can be pre-processed in batches, without reading the entire data set into memory at once.

In [1]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
from collections import namedtuple

In [2]:
# Column dtypes applied to each batch, chosen to keep memory use low.
# https://gis.stackexchange.com/questions/8650/measuring-accuracy-of-latitude-and-longitude/8674#8674
# float32 carries about 7 significant decimal digits, float64 about 15.
# Around 7 is enough for latitude and longitude.
types_dict = {
    # float32 rather than float16: half precision only has an 11-bit
    # significand, so a fare such as 45.70 would be stored as 45.6875.
    "fare_amount": np.float32,
    "pickup_longitude": np.float32,
    "pickup_latitude": np.float32,
    "dropoff_longitude": np.float32,
    "dropoff_latitude": np.float32,
    # NOTE(review): int8 tops out at 127 - confirm no batch contains a larger
    # passenger count, otherwise the cast wraps around silently.
    "passenger_count": np.int8,
    # "pickup_datetime" has no entry: preprocess() replaces that column with
    # the two derived numeric columns below before the dtypes are applied.
    "pickup_year": np.float32,
    "pickup_hour": np.float32,
}

In [3]:
def _remove_outliers(df):
    """Takes in an unprocessed df and returns a df that has outliers removed"""
    ## Remove obvious outliers first
    # where lat/long == 0
    df = df.drop(df.index[df['pickup_longitude'] == 0])
    df = df.drop(df.index[df['dropoff_longitude'] == 0])
    # Where fares < 0, becuase that makes no sense
    df = df.drop(df.index[df['fare_amount'] < 0])
    # Realistically, if we want to keep the trips within manhattan, pickup long should be between -72 and -75 and lat between 40 and 42
    # as everything else is an extreme outlier
    df = df.drop(df.index[df['pickup_longitude'] > -72])
    df = df.drop(df.index[df['pickup_longitude'] < -75])
    df = df.drop(df.index[df['pickup_latitude'] > 42])
    df = df.drop(df.index[df['pickup_latitude'] < 40])
    # Visualizations show the same thing with dropof longs and lats
    df = df.drop(df.index[df['dropoff_longitude'] > -72])
    df = df.drop(df.index[df['dropoff_longitude'] < -75])
    df = df.drop(df.index[df['dropoff_latitude'] > 42])
    df = df.drop(df.index[df['dropoff_latitude'] < 40])
    return df

In [4]:
# Container returned by preprocess(): the feature frame plus the fare column
# that was split off from it. preprocess() assumes the df handed to it already
# has the right column types.
data = namedtuple('data', ['training_data', 'fares_data'])

def preprocess(df, types=types_dict):
    """Clean one batch of raw rows and split it into features and fares.

    Steps, in order:

    1. Drop rows containing NA values.
    2. Drop obvious outliers (see ``_remove_outliers``).
    3. Drop the ``key`` column if present; it carries no signal.
    4. Replace ``pickup_datetime`` with two numeric features:
       ``pickup_year`` (year plus day-of-year / 365) and ``pickup_hour``
       (hour of day including the minute and second fractions).
    5. Cast every column named in ``types`` to its dtype.
    6. Split ``fare_amount`` off from the feature columns.

    Args:
        df: Raw batch with the columns named in ``types`` (minus the two
            derived ones) plus ``pickup_datetime`` formatted like
            ``"2015-01-27 13:08:24 UTC"``. The frame is not modified.
        types: Mapping of column name -> dtype applied in step 5. Every
            key must be a column at that point, otherwise ``KeyError``.

    Returns:
        ``data(training_data=<feature DataFrame>, fares_data=<fare Series>)``;
        both share the same (filtered) index.
    """
    df = df.dropna()
    df = _remove_outliers(df)
    # errors="ignore": batches without a "key" column are handled too.
    df = df.drop(columns=["key"], errors="ignore")

    # Currently encoded as continuous numbers; something like one-hot
    # encoding might work better.
    # The whole column is parsed. An earlier hard-coded 10,000,000-row slice
    # silently produced NaT for any rows beyond that position.
    pickup = pd.to_datetime(df["pickup_datetime"], format="%Y-%m-%d %H:%M:%S UTC")
    df = df.drop(columns=["pickup_datetime"])
    # Vectorised .dt accessors give the same values as a per-row lambda.
    df["pickup_year"] = pickup.dt.year + pickup.dt.dayofyear / 365
    df["pickup_hour"] = (
        pickup.dt.hour + pickup.dt.minute / 60 + pickup.dt.second / (60 * 60)
    )

    # Set types as we batch read in. Uses the ``types`` argument, so a
    # caller-supplied mapping actually takes effect.
    for name, dtype in types.items():
        df[name] = df[name].astype(dtype)

    fares = df.pop("fare_amount")
    # TODO - Scale data. Requires the min and max of the entire dataset; unclear how to do that while batching.
    # TODO - Normalize data, same batching issue as above.
    # TODO - Possibly PCA, but all of the features we get seem relevant to the task at hand.
    return data(training_data=df, fares_data=fares)

In [5]:
def normalize(df, model_path='../../models/normalizer.joblib'):
    '''Normalize ``df`` with a previously fitted, pickled normalizer.

    Args:
        df: Frame with the same columns, in the same order, as the data the
            pickled normalizer was fitted on.
        model_path: Location of the joblib file holding the fitted
            normalizer. Defaults to the project's shared model.

    Returns:
        A new DataFrame of the transformed values with the same column names
        and the same index as ``df`` (so it stays aligned with any Series
        split off from the same batch), or ``None`` if the normalizer could
        not be loaded.
    '''
    try:
        normalizer = joblib.load(model_path)
    except Exception:
        # Best effort: any load failure (missing file, incompatible pickle)
        # is reported and signalled with None. ``Exception`` rather than a
        # bare ``except`` so Ctrl-C / SystemExit still propagate.
        print("Cannot load normalizer model, does it exist?")
        return None
    return pd.DataFrame(normalizer.transform(df), columns=list(df), index=df.index)

In [6]:
def scale(df, model_path='../../models/minmaxscaler.joblib'):
    '''Scale ``df`` with a previously fitted, pickled scaler.

    Args:
        df: Frame with the same columns, in the same order, as the data the
            pickled scaler was fitted on.
        model_path: Location of the joblib file holding the fitted scaler.
            Defaults to the project's shared model.

    Returns:
        A new DataFrame of the transformed values with the same column names
        and the same index as ``df`` (so it stays aligned with any Series
        split off from the same batch), or ``None`` if the scaler could not
        be loaded.
    '''
    try:
        scaler = joblib.load(model_path)
    except Exception:
        # Best effort: any load failure (missing file, incompatible pickle)
        # is reported and signalled with None. ``Exception`` rather than a
        # bare ``except`` so Ctrl-C / SystemExit still propagate.
        print("Cannot load scaler model, does it exist?")
        return None
    return pd.DataFrame(scaler.transform(df), columns=list(df), index=df.index)

As we add new features to our dataset, we will need to normalize and scale those as well. However, we can't just use our already pickled normalizer and scaler as those were trained on the original dataset and not the new columns we are trying to add.

In [7]:
def normalize_new_feature(series, filename, type_of):
    """Placeholder for normalizing a newly added feature column.

    Not implemented yet: every argument is ignored and ``None`` is returned.
    """
    # TODO: draft left by the original author - try to load
    # '../../models/{}_normalizer.joblib'.format(series.name) and, failing
    # that, fit a fresh transformer on ``series`` (the sketch used
    # MinMaxScaler().fit(series)).
    return None

In [8]:
def scale_new_feature(df):
    """Placeholder for scaling a newly added feature column.

    Not implemented yet: ``df`` is ignored and ``None`` is returned.
    """
    return None