In [1]:
import pandas as pd

In [2]:
def convert_date(df, column):
    """Parse the values of one column into datetimes, modifying ``df`` in place.

    Every value in ``df[column]`` is passed to ``dateutil.parser.parse`` and
    the column is overwritten with the parsed results.

    Args:
        df: DataFrame to modify.
        column: Label of the column holding the date values to parse.

    Returns:
        None. The frame is changed in place.
    """
    # Imported here so the function does not rely on a ``dateutil`` name
    # having been brought into the notebook namespace by some other cell.
    import dateutil.parser

    df[column] = df[column].apply(dateutil.parser.parse)

def convert_to_int(df, column):
    """Cast one column of ``df`` to ``int``, modifying the frame in place.

    Args:
        df: DataFrame to modify.
        column: Label of the column to cast.

    Returns:
        None. The frame is changed in place.
    """
    as_integers = df[column].astype(int)
    df[column] = as_integers

In [3]:
def find_columns(df, prefix, sep='_'):
    """Return the labels of all columns whose name starts with ``prefix + sep``.

    Args:
        df: DataFrame whose column labels are searched.
        prefix: Leading part of the column names to look for.
        sep: Separator expected directly after the prefix.

    Returns:
        A list of matching column labels, in the frame's column order.

    Raises:
        Exception: If no column matches.
    """
    lead = prefix + sep
    # Labels that are not strings (e.g. integer column names) cannot carry the
    # prefix, so they are skipped rather than crashing on ``.startswith``.
    cols = [x for x in df.columns if isinstance(x, str) and x.startswith(lead)]
    if not cols:
        raise Exception('No columns found with prefix ' + prefix)
    return cols

In [4]:
def extract_unknown(df, column, sep='_', replace = -1):
    """Add a "known" indicator next to a column and fill its missing values.

    A new column named ``column + sep + 'known'`` is inserted at the position
    of ``column`` (pushing ``column`` one place to the right). It holds 0 where
    the original value is missing according to ``pd.isna`` and 1 otherwise.
    Missing values in ``column`` are then replaced by ``replace``.

    Args:
        df: DataFrame to modify in place.
        column: Label of the column to process.
        sep: Separator placed between the column name and ``'known'``.
        replace: Value written into the missing cells of ``column``.

    Returns:
        None. The frame is changed in place.
    """
    def known_flag(value):
        if pd.isna(value):
            return 0
        return 1

    position = df.columns.get_loc(column)
    flag_name = column + sep + 'known'
    flags = df[column].apply(known_flag)
    df.insert(position, flag_name, flags)

    filled = df[column].fillna(replace)
    df[column] = filled

In [5]:
def split_categories(df, column, values, sep='_'):
    """Replace a categorical column with one 0/1 indicator column per value.

    ``column`` is removed from ``df`` and, at its former position, one column
    named ``column + sep + str(value)`` is inserted for every entry of
    ``values``, in the given order. A cell is 1 when the string form of the
    original value equals the string form of the category, and 0 otherwise.

    get_dummies is not reliable for this; it will not include columns if no
    row contains the value, so the indicators are hand-crafted instead.

    Args:
        df: DataFrame to modify in place.
        column: Label of the categorical column to split.
        values: Iterable of categories; each one gets an indicator column even
            if no row contains it. Entries need not be strings.
        sep: Separator placed between the column name and the category.

    Returns:
        None. The frame is changed in place.
    """
    pos = df.columns.get_loc(column)
    # Stringify the source values once instead of once per category.
    as_text = df[column].map(str)
    df.drop(columns=[column], inplace=True)
    for offset, v in enumerate(values):
        # The comparison is string-based, so the column name uses the same
        # string form; this also lets non-string categories (e.g. ints) work.
        label = str(v)
        indicator = (as_text == label).astype('int64')
        df.insert(pos + offset, column + sep + label, indicator)

In [6]:
def _merge_categories(row, prefix, checks):
    for c in checks:
        if row[c]:
            return c[len(prefix) + 1:]
    return 'N/A'

def merge_categories(df, prefix, sep='_'):
    checks = find_columns(df, prefix, sep=sep)
    outDf = df.drop(columns = checks)
    outDf[prefix] = df.apply(lambda row: _merge_categories(row, prefix, checks), axis=1)
    return outDf

In [7]:
def _count_hots(row, prefix, checks):
    n = 0
    for c in checks:
        if row[c]:
            n += 1
    return n

def count_hots(df, prefix, sep='_'):
    """Count, per row, the truthy indicator columns sharing a prefix.

    Args:
        df: DataFrame holding the indicator columns.
        prefix: Common prefix of the indicator column names.
        sep: Separator expected directly after the prefix.

    Returns:
        The result of applying the per-row count along ``axis=1``.

    Raises:
        Exception: Propagated from ``find_columns`` when no column matches.
    """
    hot_columns = find_columns(df, prefix, sep=sep)

    def tally(row):
        return _count_hots(row, prefix, hot_columns)

    return df.apply(tally, axis=1)

In [8]:
def drop_columns_prefixed(df, prefix, sep='_'):
    """Return a copy of ``df`` without the columns starting with ``prefix + sep``.

    Args:
        df: Source DataFrame; it is not modified.
        prefix: Common prefix of the columns to remove.
        sep: Separator expected directly after the prefix.

    Returns:
        A new DataFrame lacking the matching columns.

    Raises:
        Exception: Propagated from ``find_columns`` when no column matches.
    """
    unwanted = find_columns(df, prefix, sep=sep)
    trimmed = df.drop(columns=unwanted)
    return trimmed

In [9]:
def divide_data(df, fractions, random_state=None):
    """Randomly split ``df`` into disjoint batches of the given relative sizes.

    Each entry of ``fractions`` is the share of the *whole* frame that the
    corresponding batch should receive. Batches are drawn one after another
    from the rows not yet assigned, so no row appears in two batches.

    Args:
        df: DataFrame to split; it is not modified.
        fractions: Iterable of shares of the full frame, e.g. ``[0.8, 0.1, 0.1]``.
        random_state: Optional seed, ``numpy.random.RandomState`` or
            ``numpy.random.Generator`` for a reproducible split. ``None``
            (the default) uses numpy's global random state.

    Returns:
        A list with one DataFrame per entry of ``fractions``.

    Raises:
        ValueError: Raised by ``sample`` when a batch would need more rows
            than remain, i.e. the fractions add up to clearly more than 1.
    """
    # numpy is only needed to build the shared generator below.
    import numpy as np

    # A seed is turned into ONE generator shared by all draws; re-seeding every
    # ``sample`` call with the same integer would repeat the same draw pattern.
    if random_state is None or isinstance(
        random_state, (np.random.RandomState, np.random.Generator)
    ):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Slack for float drift: repeatedly subtracting fractions can leave
    # ``fracRemaining`` a hair below the true value, pushing the ratio just
    # above 1, which ``sample`` rejects when sampling without replacement.
    tolerance = 1e-9

    # Sample row *positions* rather than index labels, so frames whose index
    # contains duplicate labels are still split without losing rows.
    remaining = pd.Series(range(len(df)))
    result = []
    fracRemaining = 1
    for fraction in fractions:
        share = fraction / fracRemaining
        if 1.0 < share <= 1.0 + tolerance:
            share = 1.0
        picked = remaining.sample(frac=share, random_state=rng)
        result.append(df.iloc[picked.to_numpy()])
        remaining = remaining.drop(picked.index)
        fracRemaining -= fraction
    return result

In [10]:
def display_wide(df):
    """Display ``df`` with pandas' column-count limit lifted for this call.

    The ``display.max_columns`` option is set to ``None`` only inside the
    ``with`` block, so the setting is restored afterwards.

    Args:
        df: Object to show via the notebook's ``display`` function.
    """
    unlimited_columns = pd.option_context('display.max_columns', None)
    with unlimited_columns:
        display(df)