In [2]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
import seaborn as sns

sys.path.append("/Users/fgu/dev/projects/entropy")
import entropy.data.cleaners as cl
import entropy.data.creators as cr
import entropy.data.selectors as sl
import entropy.figures.figures as figs
import entropy.helpers.aws as ha
import entropy.helpers.data as hd
import entropy.helpers.helpers as hh
from entropy import config

sns.set_style("whitegrid")
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
pd.set_option("max_colwidth", None)
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

fs = s3fs.S3FileSystem(profile="3di")

In [3]:
dfs, df = hd.read_samples(["777", "XX7"])
hd.inspect(df)

Time for read_sample    : 1.88 seconds
Time for read_sample    : 3.30 minutes
(7,531,116, 31)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,entropy_sptac
0,19510816,2014-03-07,107,9.2,paypal sandrpowell 9001,paypal,spend,services,0.0,gl3 4,2010-05-10,40k to 50k,1984.0,2014-05-11,238820,2020-08-16 12:13:00,barclaycard,credit card,2014-07-18,2017-09-12,True,,paypal,enjoyment,home electronics,home electronics,u,201403,,27530.926758,2.036569
1,19510817,2014-03-08,107,6.94,paypal smartbitsso 9001,paypal,spend,services,0.0,gl3 4,2010-05-10,40k to 50k,1984.0,2014-05-11,238820,2020-08-16 12:13:00,barclaycard,credit card,2014-07-18,2017-09-12,True,,paypal,enjoyment,home electronics,home electronics,u,201403,,27530.926758,2.036569


In [275]:
def income_amount(df, lower=5_000, upper=200_000):
    """Yearly income between 5k and 200k

    Yearly income calculated on rolling basis from
    first month of data.
    """

    def helper(g):
        first_month = g.date.min().strftime("%b")
        yearly_freq = "AS-" + first_month.upper()
        year = pd.Grouper(freq=yearly_freq, key="date")
        yearly_inc = g[g.tag_group.eq("income")].groupby(year).amount.sum().mul(-1)
        return yearly_inc.between(lower, upper).all()

    return df.groupby("user_id").filter(helper)

In [277]:
def income_amount2(df, lower=5_000, upper=200_000):
    """Yearly income between 5k and 200k

    Calc proportionally given months observed

    """

    def helper(g):
        first_month = g.date.min().strftime("%b")
        yearly_freq = "AS-" + first_month.upper()
        year = pd.Grouper(freq=yearly_freq, key="date")
        yearly_inc = g[g.tag_group.eq("income")].groupby(year).amount.sum().mul(-1)
        return yearly_inc.between(lower, upper).all()

    return df.groupby("user_id").filter(helper)

In [278]:
pd.testing.assert_frame_equal(income_amount(dfs), income_amount2(dfs))

In [364]:
data = dfs.drop(columns="income")

In [376]:
%%timeit


def income(df):
    """Adds yearly income for each user.

    To account for years where we don't observe users for the
    full 12 months, we scale yearly income to represent a full
    12 months.
    """
    df = df.copy()
    mask = df.tag_group.str.match("income", na=False)
    yearly_income_payments = (
        df.loc[mask].set_index("date").groupby("user_id").resample("Y")
    )
    yearly_payments_total = yearly_income_payments.amount.sum().mul(-1)
    yearly_unique_months = yearly_income_payments.ym.nunique()
    yearly_income = yearly_payments_total / yearly_unique_months * 12

    yearly_income = (
        yearly_income.rename("income")
        .reset_index()
        .assign(y=lambda df: df.date.dt.year)
        .drop(columns="date")
    )
    df["y"] = df.date.dt.year
    keys = ["user_id", "y"]
    merged = df.merge(yearly_income, how="left", on=keys, validate="m:1")
    return merged.drop(columns="y")


income(data)

113 ms ± 4.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [383]:
%%timeit


def income2(df):
    """
    Adds yearly income for each user.

    Calculated yearly incomes are scaled to 12-month income and multiplied by -1 to get positive numbers (credits are negative in dataset).
    """
    year = df.date.dt.year.rename("year")
    yearly_incomes = (
        df.loc[df.tag_group.eq("income")]
        .groupby(["user_id", year])
        .agg({"amount": "sum", "ym": "nunique"})
        .rename(columns={"amount": "income", "ym": "observed_months"})
        .assign(income=lambda df: df.income / df.observed_months * -12)
        .drop(columns="observed_months")
    )
    return df.merge(
        yearly_incomes,
        left_on=["user_id", year],
        right_on=["user_id", "year"],
        validate="m:1",
    ).drop(columns="year")


income2(data)

41.5 ms ± 1.62 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [369]:
pd.testing.assert_frame_equal(income(data), income2(data))

AssertionError: DataFrame are different

DataFrame shape mismatch
[left]:  (69421, 31)
[right]: (69077, 31)