# Documenting sample selection (part 2)
<!-- > "Using decorator to document sample selection process." -->

- hide:true
- toc: true
- badges: true
- comments: true
- categories: [python]

In [1]:
from collections import Counter, OrderedDict
from functools import wraps

import pandas as pd
import numpy as np

First, build a counter object.

In [12]:
class OrderedCounter(Counter, OrderedDict):
    'Counter that remembers the order elements are first encountered'

    def __repr__(self):
        return '{!s}({!r})'.format(self.__class__.__name__, OrderedDict(self))

    def __reduce__(self):
        return self.__class__, (OrderedDict(self),)
    
OrderedCounter({'a': 1, 'b': 2})

OrderedCounter(OrderedDict([('a', 1), ('b', 2)]))

Method resolution order:

In [3]:
OrderedCounter.__mro__

(__main__.OrderedCounter,
 collections.Counter,
 collections.OrderedDict,
 dict,
 object)

In [58]:
df = pd.read_csv('/Users/fgu/tmp/tab.csv').iloc[:, 1:]
df.head()

Unnamed: 0,step,counts
0,start-users,283.0
1,start-txs,662580.0
2,start-value,122286176.0
3,drop_noise-users,283.0
4,drop_noise-txs,662580.0


In [59]:
df[['step', 'metric']] = df.step.str.split('-', expand=True)
df.head()

Unnamed: 0,step,counts,metric
0,start,283.0,users
1,start,662580.0,txs
2,start,122286176.0,value
3,drop_noise,283.0,users
4,drop_noise,662580.0,txs


From here: https://stackoverflow.com/a/44988732/13666841

In [60]:
df = (df.groupby(['step', 'metric'], sort=False)
      .counts.sum()
      .unstack('metric')
      .rename_axis(columns=None)
      .reset_index())
df

Unnamed: 0,step,users,txs,value
0,start,283.0,662580.0,122286200.0
1,drop_noise,283.0,662580.0,122286200.0
2,min_number_of_months,239.0,653239.0,120708800.0
3,current_account,227.0,636792.0,118522300.0
4,min_spend,151.0,439269.0,79422540.0
5,min_number_transactions,129.0,384487.0,67765140.0
6,diverse_spending,129.0,384487.0,67765140.0
7,end,129.0,384487.0,67765140.0


In [68]:
# Formatting

df['step'] = df['step'].map(desc)
ints = ['users', 'acc', 'txs']
df[ints] = df[ints].applymap('{:,.0f}'.format)
floats = ['value']
df[floats] = df[floats].applymap('{:,.1f}'.format)

NameError: name 'desc' is not defined

In [62]:
k = df.select_dtypes('number').applymap('{:,.0f}'.format)
k

Unnamed: 0,users,txs,value
0,283,662580,122286176
1,283,662580,122286173
2,239,653239,120708756
3,227,636792,118522330
4,151,439269,79422540
5,129,384487,67765143
6,129,384487,67765143
7,129,384487,67765143


In [67]:
df[['value']].applymap(lambda x: x*10)

Unnamed: 0,value
0,1222862000.0
1,1222862000.0
2,1207088000.0
3,1185223000.0
4,794225400.0
5,677651400.0
6,677651400.0
7,677651400.0


In [None]:
df.to_latex

In [57]:
df[df.select_dtypes('number').columns] = k
df

Unnamed: 0,step,users,txs,value
0,start,283,662580,122286176
1,drop_noise,283,662580,122286173
2,min_number_of_months,239,653239,120708755
3,current_account,227,636792,118522329
4,min_spend,151,439269,79422540
5,min_number_transactions,129,384487,67765143
6,diverse_spending,129,384487,67765143
7,end,129,384487,67765143


In [37]:
fmt = {
    'users': '{:.0f}',
    'txs': '{:.0f}',
    'value': '{:,.0f}',
}
df = df.style.format(fmt)
df

Unnamed: 0,step,users,txs,value
0,start,283,662580,122286176
1,drop_noise,283,662580,122286173
2,min_number_of_months,239,653239,120708756
3,current_account,227,636792,118522330
4,min_spend,151,439269,79422540
5,min_number_transactions,129,384487,67765143
6,diverse_spending,129,384487,67765143
7,end,129,384487,67765143


In [38]:
df = df.select_dtypes('number').astype('str')
df

AttributeError: 'Styler' object has no attribute 'select_dtypes'

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   users   8 non-null      object
 1   txs     8 non-null      object
 2   value   8 non-null      object
dtypes: object(3)
memory usage: 320.0+ bytes


ValueError: Cannot specify ',' with 's'.

<pandas.io.formats.style.Styler at 0x119db1210>

In [None]:
df.to_csv

Unnamed: 0,step,users,txs,value
0,start,283,662580,122286176
1,drop_noise,283,662580,122286173
2,min_number_of_months,239,653239,120708756
3,current_account,227,636792,118522330
4,min_spend,151,439269,79422540
5,min_number_transactions,129,384487,67765143
6,diverse_spending,129,384487,67765143
7,end,129,384487,67765143


In [44]:
df = df.set_index(['step', 'metric'])
df.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,counts
step,metric,Unnamed: 2_level_1
start,users,283.0
start,txs,662580.0
start,value,122286176.0
drop_noise,users,283.0
drop_noise,txs,662580.0


In [45]:
df.unstack(1)

Unnamed: 0_level_0,counts,counts,counts
metric,txs,users,value
step,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
current_account,636792.0,227.0,118522300.0
diverse_spending,384487.0,129.0,67765140.0
drop_noise,662580.0,283.0,122286200.0
end,384487.0,129.0,67765140.0
min_number_of_months,653239.0,239.0,120708800.0
min_number_transactions,384487.0,129.0,67765140.0
min_spend,439269.0,151.0,79422540.0
start,662580.0,283.0,122286200.0


In [41]:
df.columns.droplevel(0)
df.columns.name = None
df.head()

Unnamed: 0_level_0,counts,counts,counts
metric,txs,users,value
step,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
current_account,636792.0,227.0,118522300.0
diverse_spending,384487.0,129.0,67765140.0
drop_noise,662580.0,283.0,122286200.0
end,384487.0,129.0,67765140.0
min_number_of_months,653239.0,239.0,120708800.0


In [19]:
dd.drop

AttributeError: 'NoneType' object has no attribute 'drop'

In [11]:
a = OrderedCounter()
a.update(s_a=1, s_b=2, k_a=3, k_b=4)
a.update(s_a=5, s_b=6, k_a=7, k_b=8)
a

OrderedCounter(OrderedDict([('s_a', 6), ('s_b', 8), ('k_a', 10), ('k_b', 12)]))

Turn it into a dataframe.

In [12]:
df = pd.DataFrame(a.items(), columns=['step', 'count'])
df

Unnamed: 0,step,count
0,s_a,6
1,s_b,8
2,k_a,10
3,k_b,12


Split the step column

In [13]:
df[['step', 'metric']] = df.step.str.split('_', expand=True)
df

Unnamed: 0,step,count,metric
0,s,6,a
1,s,8,b
2,k,10,a
3,k,12,b


Unstack

In [6]:
df = df.set_index(['step', 'metric']).unstack(1)
df

Unnamed: 0_level_0,count,count
metric,a,b
step,Unnamed: 1_level_2,Unnamed: 2_level_2
k,10,12
s,6,8


Flatten

In [7]:
df.columns = df.columns.droplevel(0)
df

metric,a,b
step,Unnamed: 1_level_1,Unnamed: 2_level_1
k,10,12
s,6,8


In [126]:
df.columns.name = None

In [127]:
df

Unnamed: 0_level_0,a,b
step,Unnamed: 1_level_1,Unnamed: 2_level_1
k,10,12
s,6,8


In [128]:
df.reset_index()

Unnamed: 0,step,a,b
0,k,10,12
1,s,6,8


## Main sources

- [Fluent Python](https://www.oreilly.com/library/view/fluent-python/9781491946237/)
- [Python Cookbook](https://www.oreilly.com/library/view/python-cookbook-3rd/9781449357337/)
- [Learning Python](https://www.oreilly.com/library/view/learning-python-5th/9781449355722/)