In [1]:
'''
    This notebook contains a data pipeline for processing a given data frame and searching for outliers.
    Remark 1. I have not tried to achieve maximum performance; the pipeline is expressed in a
    functional style (a set of functions that do not modify their arguments).
    Remark 2. I have not tried to dive deep into the physical meaning of the given data frame and simply
    selected all the numerical columns to find the outliers.
    Remark 3. There are multiple approaches to finding anomalies in data and defining outliers.
    I have chosen the most familiar one -- to treat as outliers the values that violate the 3*sigma rule
    (i.e. lie in the tails of the normal distribution, more than three standard deviations from the mean).
    Searching for outliers across multiple columns is reduced to the single-column case by taking the
    row-wise mean of the normalized values of these columns.
'''
import numpy, pandas
from scipy import stats

In [2]:
def load(fname):
    '''
    Reads the CSV file 'fname' into a dataframe and returns it.
    The first CSV column becomes the index, and pandas is asked to parse
    that index as dates.
    '''
    return pandas.read_csv(fname, parse_dates=True, index_col=0)

In [3]:
def parse(input, columns):
    '''
    Selects the columns 'columns' from the dataframe 'input' and converts their
    currency-formatted values (e.g. '$1,234.50') to floats.

    - A leading '$' is removed only when it is actually present, so an
      unprefixed value such as '123' does not lose its first digit.
    - Thousands separators (',') are removed.
    - Missing values (None/NaN) are kept as NaN instead of raising.
    - Values that are already numeric pass through as floats.

    Returns a new dataframe; 'input' itself is not modified.
    Raises ValueError when a value cannot be interpreted as a number.
    '''
    def to_float(val):
        # Missing cells stay missing; slicing them would raise a TypeError.
        if pandas.isna(val):
            return float('nan')
        text = str(val).strip()
        if text.startswith('$'):
            text = text[1:]
        return float(text.replace(',', ''))

    # Copy the selection so that the assignments below never touch 'input'.
    output = input[columns].copy()
    for column in output.columns:
        output[column] = output[column].apply(to_float)
    return output

In [4]:
def normlz(input):
    '''
    Min-max normalizes every column of the dataframe 'input' and returns the
    result as a new dataframe; 'input' itself is not modified.

    Each column is rescaled as (value - min) / (max - min), so its smallest
    value maps to 0.0 and its largest to 1.0. A constant column (max == min)
    maps to 0.0 everywhere instead of producing NaN from a 0 / 0 division.
    '''
    lows = input.min()
    spans = input.max() - lows
    # A zero span would give 0 / 0 = NaN for the whole column; divide by 1 instead.
    safe_spans = spans.where(spans != 0, 1)
    return (input - lows) / safe_spans

In [5]:
def sieve(input, threshold=3):
    '''
    Finds outlier rows in the dataframe 'input' and returns their positions.

    Every row is reduced to the mean of its columns, the row means are turned
    into z-scores, and a row counts as an outlier when the absolute z-score
    exceeds 'threshold' (3 by default, i.e. the 3*sigma rule).

    Rows whose mean is NaN are left out of the z-score statistics and are
    never reported, so one incomplete row cannot hide the real outliers.

    Returns a numpy array of 0-based row positions, suitable for '.iloc'.
    '''
    aggred = input.mean(axis=1).to_numpy(dtype=float)
    # The default nan_policy='propagate' would turn every z-score into NaN
    # as soon as a single row mean is NaN.
    zscores = numpy.abs(stats.zscore(aggred, nan_policy='omit'))
    return numpy.where(zscores > threshold)[0]

In [None]:
def run(fname='data1.csv', columns=None, output_fname='outliers.csv'):
    '''
    Executes the pipeline, which consists of the following steps:
    1. defining the list of numerical columns with dollar-sign-prefixed values
       in which the outliers are searched for;
    2. loading the dataframe from the hard drive;
    3. parsing the dataframe by selecting the given columns and converting
       their values to floats;
    4. normalizing the parsed columns;
    5. getting the outlier row positions by applying the sieve function;
    6. saving the outlier rows of the original dataframe to the hard drive.

    Parameters (all optional; the defaults reproduce the original hard-coded run):
    fname        -- path of the CSV file to read;
    columns      -- names of the columns to analyse; None selects the default set;
    output_fname -- path of the CSV file the outlier rows are written to.
    '''
    if columns is None:
        columns = ['plan_premium', 'reinsurance', 'rx', 'rx_with_rebates', 'rx_without_rebates', 'spec_cap']
    loaded = load(fname)
    # list(...) lets callers pass any iterable; a tuple would otherwise be
    # treated by pandas as a single column key.
    parsed = parse(loaded, list(columns))
    normlzed = normlz(parsed)
    outliers = sieve(normlzed)
    # 'outliers' holds row positions, hence .iloc on the untouched original frame.
    loaded.iloc[outliers].to_csv(output_fname)