In [None]:
# default_exp core

# mydemo

> A super demo for nbdev.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
# export 

import pandas.util.testing as tm
import pandas as pd
from pandas.api.types import is_numeric_dtype as isnum
#from matplotlib.pyplot import rcParams

  """


# Generalized Discretization

Discretize a whole dataframe into at most $N$ categories:
* Bin numerics into $≤N$ bins.
* Use only the Top $N$ categories, and "Other".

For QuickLooks, BN learning, and other household uses.


**TODO:** Try Maya Gilad's [approach](https://medium.com/gett-engineering/handling-rare-categorical-values-in-pandas-d1e3f17475f0) -- move the _bottom x%_ into 'Other':
```python
field = df[FILENAME]
field.mask(field.map(
    field.value_counts(normalize=True)) < 0.01, 'Other')
```



### is_numeric

In [None]:
# export

def is_numeric(col: pd.Series) -> bool:
    """Return True iff `col` is already numeric, or every value can be coerced.

    Usage: df.apply(is_numeric)       # boolean Series, one entry per column
    Usage: is_numeric(df['colname'])  # a single bool

    A column counts as numeric when either
      * pandas' `is_numeric_dtype` accepts its dtype, or
      * `pd.to_numeric(col, errors='coerce')` produces no nulls, i.e. every
        single value converts. A missing value in a non-numeric column
        therefore makes the whole column non-numeric.

    Parameters
    ----------
    col: Column (Series) to inspect.

    Returns
    -------
    Plain Python bool.

    From:
    https://stackoverflow.com/questions/54426845/how-to-check-if-a-pandas-dataframe-contains-only-numeric-column-wise

    """
    if isnum(col):
        return True
    coerced = pd.to_numeric(col, errors='coerce')
    # Wrap in bool() so both branches return the same plain Python type.
    return bool(coerced.notnull().all())

Always define tests!  

_(Usually do not export them!)_

In [None]:
df = tm.makeMixedDataFrame()
# Compare plain lists: a wrong number of columns then fails the assert
# directly instead of raising a length-mismatch error inside `==`.
assert df.apply(is_numeric).tolist() == [True, True, False, True]

### drop_singletons

In [None]:
# export

def drop_singletons(df, verbose=1) -> None:
    """Drop columns with < 2 unique values. Inplace.

    Missing values are counted by `Series.unique()` like any other value, so
    a column holding only NA has one unique value and is dropped, while a
    column holding NA plus one other value has two and is kept.

    Parameters
    ----------
    df: Dataframe to prune; modified in place.
    verbose: If truthy, print the list of dropped columns (also when empty).

    Returns
    -------
    None. `df` itself is modified.
    """
    # Decide first, then drop once: avoids mutating the frame while iterating
    # over its columns, and does a single drop instead of one per column.
    dropcols = [col for col in df.columns if len(df[col].unique()) < 2]
    if dropcols:
        df.drop(columns=dropcols, inplace=True)
    if verbose:
        print(f"  DROPPED {dropcols} because < 2 vals each.")

Note that `pd.NA` and `np.nan` count as values, so columns containing *only* NA will be dropped, but columns with NA and one other value remain.

In [None]:
# Build a fixture: the mixed demo frame plus one extra column per case.
df = tm.makeMixedDataFrame()
df['E'] = [1, 1, 1, 1, 1]  # one distinct value -> should be dropped
df['F'] = pd.Series([1, 1, 1, None, None]).astype('UInt8')  # nullable int: 1 and NA -> kept
df['G'] = pd.Series([1, 1, 1, None, None]).astype('float')  # float: 1.0 and NaN -> kept
df['H'] = pd.Series([None]*5).astype('UInt8')  # only NA -> should be dropped

drop_singletons(df)

  DROPPED ['E', 'H'] because < 2 vals each.


In [None]:
# Compare plain lists so a wrong number of columns fails the assert itself
# rather than raising a length-mismatch error inside `==`.
assert list(df.columns) == ['A', 'B', 'C', 'D', 'F', 'G']

### discretize

Woo-hoo!  It's all been leading to this.

Seriously, these headers are redundant -- nbdev will _generate_ nice-looking docs using the function names and docstrings.  

But that requires GitHub or setting up Jekyll, and I broke my env. 

In [None]:
# export

def discretize(df, nbins=10, cut=pd.qcut,
               verbose=2, drop_useless=True):
    """Discretize the columns of {df} into a small number of categories.
      * Categorical columns: keep the {nbins} most frequent values and
        replace everything else with "Other".
      * Continuous columns: cut into at most {nbins} bins using {cut}.

    Returns a new discretized dataframe with the same column names
    (minus any dropped by `drop_useless`). The input is not modified.
    Promotes discrete columns to categories.

    Parameters
    -----------
    df: Dataframe to discretize
    nbins: Max number of bins to use. May return fewer.
    cut: Cutting method. Default `pd.qcut`. Consider pd.cut, or write your own.
         Called as `cut(values, nbins, duplicates='drop')`.
    verbose: 0: silent, 1: colnames, 2: (Default) top N for each column
    drop_useless: If truthy, removes columns that have < 2 unique values.

    Notes
    -----
    * A categorical column with more than {nbins} distinct values ends up with
      {nbins} kept values *plus* "Other", i.e. up to nbins + 1 categories.
    * Which values are kept depends on the order `value_counts` returns them
      in, so it is not pinned down when counts tie.
    * Missing values in numeric columns are not replaced here; they are
      passed to {cut} as they are.

    """
    out = pd.DataFrame(index=df.index)
    # Named so it does not shadow the module-level `isnum` import alias.
    numeric_cols = df.apply(is_numeric)
    for col in df:
        if verbose > 0:
            print(col, end=':\n\t')
        if numeric_cols[col]:
            values = df[col]
            if values.dtype == object:
                # is_numeric accepted this column because every value
                # coerces; convert so `cut` receives numbers, not objects.
                values = pd.to_numeric(values, errors='coerce')
            out[col] = cut(values, nbins, duplicates='drop')
        else:
            topN = df[col].value_counts(dropna=False).head(nbins).keys()
            # Promote to Category, add Other, Drop all but TopN
            out[col] = df[col].astype('category')
            try:
                out[col] = out[col].cat.add_categories(['Other'])
            except ValueError:
                pass  # Already had 'Other'
            out[col] = out[col].where(out[col].isin(topN), 'Other')
            out[col] = out[col].cat.remove_unused_categories()
        if verbose > 1:
            counts = out[col].value_counts(dropna=False, sort=False)
            print('\n\t'.join(counts.to_string().split('\n')))
        elif verbose > 0:
            print()
    if drop_useless:
        # Pass verbose through so that verbose=0 really is silent.
        drop_singletons(out, verbose=verbose)
    return out

This should drop 'B' as a singleton, bin the two continuous cols, and convert 'C' into 'foo3', 'foo4', and 'Other'.

In [None]:
# Discretize the mixed demo frame into (at most) two bins per column.
df = discretize(tm.makeMixedDataFrame(), nbins=2)

A:
	(-0.001, 2.0]    3
	(2.0, 4.0]       2
B:
	(-0.001, 1.0]    5
C:
	foo3     1
	foo4     1
	Other    3
D:
	(2008-12-31 23:59:59.999999999, 2009-01-05]    3
	(2009-01-05, 2009-01-07]                       2
  DROPPED ['B'] because < 2 vals each.


In [None]:
# Compare plain lists so a wrong number of columns fails the assert itself
# rather than raising a length-mismatch error inside `==`.
assert list(df.columns) == ['A', 'C', 'D']

This is more of a "visual" test - no `assert` statement to fail.

In [None]:
# Visual check: the distinct bins produced for column 'A'.
df['A'].unique()

[(-0.001, 2.0], (2.0, 4.0]]
Categories (2, interval[float64]): [(-0.001, 2.0] < (2.0, 4.0]]

In [None]:
# Hang on, this is not a stable test -- the ordering is undefined when values are grouped into 'Other'.

# assert all(df.C.unique() == ['Other', 'foo2', 'foo4'])

## Plotting helpers...