In [1]:
import contextlib
import os
import re
import sys

import linearmodels as lm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
import seaborn as sns
import tabulate

sys.path.append("/Users/fgu/dev/projects/entropy")
import entropy.data.cleaners as cl
import entropy.data.creators as cr
import entropy.data.make_data as md
import entropy.data.selectors as sl
import entropy.data.validators as vl
import entropy.figures.figures as figs
import entropy.helpers.aws as ha
import entropy.helpers.data as hd
import entropy.helpers.helpers as hh
from entropy import config

sns.set_style("whitegrid")
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
pd.set_option("max_colwidth", None)
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

fs = s3fs.S3FileSystem(profile="3di")

In [11]:
df = pd.DataFrame(
    {
        "date": [0, 1, 2, 3, 4, 5, 6, 7],
        "inc": [np.nan, np.nan, 5, np.nan, np.nan, np.nan, 4, np.nan],
    }
)

df

Unnamed: 0,date,inc
0,0,
1,1,
2,2,5.0
3,3,
4,4,
5,5,
6,6,4.0
7,7,


In [9]:
df["cumsum"] = df.inc.notna().cumsum()
df

Unnamed: 0,date,inc,cumsum
0,0,False,1
1,1,False,2
2,2,True,3
3,3,False,4
4,4,False,5
5,5,False,6
6,6,True,7
7,7,False,8


In [None]:
dfs, dfl = hd.read_samples(["777", "XX7"])
hd.inspect(dfl)

Time for read_sample    : 5.04 seconds


Entropy analysis data

In [3]:
filepath = "/Users/fgu/tmp/en/analysis_data_XX7.parquet"
dfa = ha.read_parquet(filepath)
hd.inspect(dfa)

(163,915, 23)


Unnamed: 0_level_0,Unnamed: 1_level_0,obs,balance_ca,balance_sa,sa_inflows,sa_outflows,sa_net_inflows,sa_scaled_inflows,sa_scaled_outflows,sa_scaled_net_inflows,total_monthly_spend,tag_spend_household,tag_spend_hobbies,tag_spend_retail,tag_spend_services,tag_spend_other_spend,tag_spend_finance,tag_spend_travel,tag_spend_communication,tag_spend_motor,entropy_sptac,log_income,user_female,age
user_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
57,2012-04-30,66,0.0,,,,,,,,6.784095,0.400869,0.011305,0.00937,0.245824,0.067898,0.000837,0.228929,0.034967,0.0,2.425209,10.023162,0.0,25.0
57,2012-05-31,75,0.0,,,,,,,,6.83729,0.469773,0.010719,0.0,0.317825,-0.014239,0.001105,0.148343,0.066473,0.0,2.363752,10.023162,0.0,25.0


In [6]:
fs.ls("3di-data-mdb/raw")

['3di-data-mdb/raw/',
 '3di-data-mdb/raw/20200630_UserLoginsForNeedham.csv',
 '3di-data-mdb/raw/mdb_000.csv',
 '3di-data-mdb/raw/mdb_000.parquet',
 '3di-data-mdb/raw/mdb_777.csv',
 '3di-data-mdb/raw/mdb_777.parquet',
 '3di-data-mdb/raw/mdb_X77.csv',
 '3di-data-mdb/raw/mdb_X77.parquet',
 '3di-data-mdb/raw/mdb_XX7.csv',
 '3di-data-mdb/raw/mdb_XX7.parquet',
 '3di-data-mdb/raw/mdb_costa.csv']

### MDB user login data

In [93]:
@hh.timer
def read_logins(nrows=None):
    fp = "s3://3di-data-mdb/raw/20200630_UserLoginsForNeedham.csv"
    return ha.read_csv(fp, nrows=nrows, names=["user_id", "date"], parse_dates=["date"])


login_data = read_logins()
hd.inspect(login_data)

Time for read_logins    : 37.96 seconds
(7,521,416, 2)


Unnamed: 0,user_id,date
0,1,2015-07-23 14:28:00
1,1,2015-07-27 05:43:00


Proportion of users in dataset with login info

In [94]:
len(set(dfl.user_id) & set(login_data.user_id)) / dfl.user_id.nunique()

0.8884974458340673

Merge login data into transaction data

In [132]:
def make_daily_logins(login_data):
    login_data = login_data.copy()
    login_data["date"] = login_data.date.dt.round("d")
    login_data["logins"] = 1
    return login_data.groupby(["user_id", "date"]).logins.sum().reset_index()


def merge_logins(txn_data, login_data):
    txn_data = txn_data.copy()
    login_data = login_data.copy()
    merged = txn_data.merge(
        login_data, on=["user_id", "date"], how="left", validate="m:1"
    )
    merged["logins"] = merged.logins.fillna(0)
    return merged


def drop_pre_registration_data(df):
    """Drop all transactions from before a user registered with MDB.

    MDB receives up to three years of data after a user signs up. As
    a user can obviously not log in to their MDB account during that
    time, we drop these transactions.
    """
    return df[df.date >= df.user_registration_date]


txn_data = dfl
daily_logins = make_daily_logins(logins)
post_registration_txns = drop_pre_registration_data(txn_data)
df = merge_logins(post_registration_txns, daily_logins)
hd.inspect(df)

(9,539,059, 32)


Unnamed: 0,id,date,user_id,amount,desc,merchant,tag_group,tag,user_female,user_postcode,user_registration_date,user_salary_range,user_yob,account_created,account_id,account_last_refreshed,account_provider,account_type,data_warehouse_date_created,data_warehouse_date_last_updated,debit,latest_balance,merchant_business_line,tag_auto,tag_manual,tag_up,updated_flag,ym,balance,income,entropy_sptac,logins
0,1088993,2012-04-02,57,23.74,1982 01apr 12 the broadway fruiterer london gb,,,,0.0,n8 9,2010-05-13,10k to 20k,1987.0,2016-10-30,273998,2015-08-15 21:37:00,natwest bank,current,2014-07-18,2017-10-23,True,,,,,,u,201204,,22542.600586,2.425209,0.0
1,1088994,2012-04-02,57,25.68,1982 01apr 12 waitrose 835 crouch end gb,waitrose,spend,household,0.0,n8 9,2010-05-13,10k to 20k,1987.0,2016-10-30,273998,2015-08-15 21:37:00,natwest bank,current,2014-07-18,2017-08-15,True,,waitrose,"food, groceries, household",,supermarket,u,201204,,22542.600586,2.425209,0.0


In [145]:
def user_logins(df, freq="m"):
    period = pd.Grouper(key="date", freq=freq)
    return (
        df.groupby(["user_id", period])
        .logins.sum()
        .groupby(["user_id"])
        .agg(["mean", "median", "std", "size"])
    )


user_logins_data = user_logins(df)

In [146]:
hd.inspect(user_logins_data)

(5,204, 4)


Unnamed: 0_level_0,mean,median,std,size
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
57,8.16,0.0,13.7545,100
107,11.605263,1.0,25.055181,76


In [147]:
user_logins_data.describe()

Unnamed: 0,mean,median,std,size
count,5204.0,5204.0,4992.0,5204.0
mean,6.809477,4.68284,7.701427,18.873751
std,13.70823,13.135486,11.108252,19.204074
min,0.0,0.0,0.0,1.0
25%,0.206643,0.0,0.707107,5.0
50%,1.838542,0.0,3.714036,11.0
75%,6.893939,3.0,10.035591,25.0
max,180.625,181.5,123.03658,100.0
