In [1]:
from fastai.imports import *
from fastai.structured import *
from fastai.column_data import *

In [2]:
def mapper_check(mapper):
    for feat_tup in mapper.features:
        print('col: {0}, mean: {1}, std_dev: {2}'.format(feat_tup[0][0], feat_tup[1].mean_, feat_tup[1].var_))

In [3]:
# generic setup with actual data I encountered
# Two 3 numerical columns are drawn from standard normal distribution.
# num_0 and num_1 data has been collected since inception. num_2 has only 
# started being collected recently, so it is null for most entries.
# categorical represents some new levels being added later, maybe as 
# a company expands into states/zip codes. Levels exist in the validation set
# that are not in the training set.

# num_0 and 1 have data since inception
train = pd.DataFrame(np.zeros([1000000,2]))
train[[0,1]] = np.random.normal(size=(1000000,2))
# num_2 has only started being collected recently
train[2] = np.nan
train[2][-100000:] = np.random.normal(size=(100000))
# categorical has only had 3 levels historically but valid has two more levels
train[3] = np.nan
train[3][:-50000] = np.random.choice([np.nan, 'a', 'b', 'c'], size=950000)
train.columns = ['num_0','num_1','num_2','categorical']

# taking the last 50000 records as valid, first 950000 as train
valid = train[-50000:].copy()
train = train[:-50000]
valid['categorical'][-50000:] = np.random.choice([np.nan, 'a', 'b', 'c', 'd', 'e'], size=50000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
print('unique vals in col "categorical": {0}'.format(len(train['categorical'].unique())))
train.describe()

unique vals in col "categorical": 4


Unnamed: 0,num_0,num_1,num_2
count,950000.0,950000.0,50000.0
mean,0.000417,0.000682,-0.003441
std,0.999406,1.000015,0.999647
min,-4.551844,-4.858162,-3.879602
25%,-0.672878,-0.674253,-0.678242
50%,0.00072,0.000916,-0.002547
75%,0.675054,0.675148,0.670807
max,4.818942,5.11252,4.052374


In [5]:
print('unique vals in col "categorical": {0}'.format(len(valid['categorical'].unique())))
valid.describe()

unique vals in col "categorical": 6


Unnamed: 0,num_0,num_1,num_2
count,50000.0,50000.0,50000.0
mean,0.000764,0.005412,0.001741
std,0.998002,0.99902,1.002388
min,-4.391096,-4.378382,-4.79224
25%,-0.6718,-0.671017,-0.676408
50%,3.3e-05,0.004079,0.005734
75%,0.679714,0.680993,0.673335
max,4.222815,3.917901,4.210958


In [6]:
train_cats(train)
X_train, y_train, nas_train, mapper_train = proc_df(train, do_scale=True)
mapper_check(mapper_train)

col: num_0, mean: [ 0.00042], std_dev: [ 0.99881]
col: num_1, mean: [ 0.00068], std_dev: [ 1.00003]
col: num_2, mean: [-0.00259], std_dev: [ 0.05259]
col: num_2_na, mean: [ 0.94737], std_dev: [ 0.04986]


# Standard deviation for col num_2 has dropped a lot due to filling na values and running sklearn's StandardScaler. We know the real distribution is standard normal (std_dev should be 1)

# I propose fitting a StandardScaler before filling nas so the distribution isn't changed.

# Having proc_df spit out the expected number of levels for each categorical was helpful when troubleshooting index errors with embeddings on different data sets.

In [7]:
train_cats(valid)
X_valid, y_valid, nas_valid, _ = proc_df(valid, do_scale=True, na_dict=nas_train)
assert(nas_train == nas_valid)

# proposed changes proc_df

In [8]:
from sklearn.base import TransformerMixin, BaseEstimator

class StandardScalerPandas(TransformerMixin, BaseEstimator):
    def __init__(self, copy=True, with_mean=True, with_std=True):
        self.with_mean = with_mean
        self.with_std = with_std
        self.copy = copy
    
    def fit(self, X, y=None):
        # fits without nas via pandas library
        if type(X) == np.ndarray:
            X = pd.Series(X.reshape(-1))
        self.mean_ = X.dropna().mean()
        self.var_ = X.dropna().var()
        return self

    def transform(self, X):
        mean = self.mean_
        std_dev = np.sqrt(self.var_)
        if std_dev == 0:
            return X
        return (X-mean)/std_dev
    
def fit_scalers(df, mapper):
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        map_f = [([n],StandardScalerPandas()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    return mapper

def count_emb_lvls(c, n):
    return (n, len(c.cat.categories)+1)

def proc_df_new(df, y_fld=None, skip_flds=None, do_scale=False, na_dict=None, preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    if not skip_flds: skip_flds=[]
    if subset: df = get_sample(df,subset)
    df = df.copy()
    if preproc_fn: preproc_fn(df)
    if y_fld is None: y = None
    else:
        if not is_numeric_dtype(df[y_fld]): df[y_fld] = df[y_fld].cat.codes
        y = df[y_fld].values
        skip_flds += [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    # fit the scalers
    if do_scale: mapper = fit_scalers(df, mapper)    
    if na_dict is None: na_dict = {}
    # then fillna
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    # finally transform
    df[mapper.transformed_names_] = mapper.transform(df)
    # numericalize cats and count emb_lvls
    emb_lvls=[]
    for n,c in df.items():
        numericalize(df, c, n, max_n_cat)
        if not is_numeric_dtype(c):
            emb_lvls.append(count_emb_lvls(c, n))
    df = pd.get_dummies(df, dummy_na=True)
    # return the result
    res = [pd.get_dummies(df, dummy_na=True), y, na_dict, emb_lvls]
    if do_scale: res = res + [mapper]
    return res

In [9]:
X_train_n, y_train_n, nas_train_n, emb_lvls_train_n, mapper_train_n = proc_df_new(train, do_scale=True)
print(emb_lvls_train_n)
mapper_check(mapper_train_n)

[('categorical', 5)]
col: num_0, mean: 0.000417164075680511, std_dev: 0.9988131849472348
col: num_1, mean: 0.0006815402437186164, std_dev: 1.0000304658662287
col: num_2, mean: -0.0034411691905263434, std_dev: 0.999295076599445


In [10]:
X_valid_n, y_valid_n, nas_valid_n, emb_lvls_valid_n, mapper_valid_n = proc_df_new(valid, do_scale=True, na_dict=nas_train)
print(emb_lvls_valid_n)
assert(nas_train == nas_valid)

[('categorical', 7)]


In [19]:
# check numbers on train and valid sets, old proc_df on left, proc_df_new on the right

In [11]:
pd.concat([X_train.join(X_train_n, how='outer', rsuffix='_n').head(2),X_train.join(X_train_n, how='outer', rsuffix='_n').tail(2)])

Unnamed: 0,num_0,num_1,num_2,categorical,num_2_na,num_0_n,num_1_n,num_2_n,categorical_n,num_2_na_n
0,-2.160973,1.146524,0.000205,4,0.235702,-2.160972,1.146524,0.000895,4,True
1,-1.040743,0.110703,0.000205,2,0.235702,-1.040743,0.110703,0.000895,2,True
949998,-2.247012,-0.727538,-1.739665,2,-4.242641,-2.247011,-0.727538,-0.398255,2,False
949999,0.825047,1.078543,2.989957,4,-4.242641,0.825047,1.078543,0.686784,4,False


In [12]:
pd.concat([X_valid.join(X_valid_n, how='outer', rsuffix='_n').head(2),X_valid.join(X_valid_n, how='outer', rsuffix='_n').tail(2)])

Unnamed: 0,num_0,num_1,num_2,categorical,num_2_na,num_0_n,num_1_n,num_2_n,categorical_n,num_2_na_n
950000,0.339303,-1.855631,0.197993,6,0.0,0.3393,-1.855612,0.197991,6,False
950001,-0.056827,0.231256,1.283059,5,0.0,-0.056826,0.231254,1.283046,5,False
999998,-0.220468,1.357868,-0.833487,2,0.0,-0.220466,1.357854,-0.833479,2,False
999999,0.190175,-1.344253,-0.212788,3,0.0,0.190173,-1.344239,-0.212785,3,False


# checking module

In [13]:
import fastai.structured_new as s_new

In [18]:
X_train_n, y_train_n, nas_train_n, emb_lvls_train_n, mapper_train_n = s_new.proc_df(train, do_scale=True)
print(emb_lvls_train_n)
mapper_check(mapper_train_n)

[('categorical', 5)]
col: num_0, mean: 0.000417164075680511, std_dev: 0.9988131849472348
col: num_1, mean: 0.0006815402437186164, std_dev: 1.0000304658662287
col: num_2, mean: -0.0034411691905263434, std_dev: 0.999295076599445


# to write out

In [15]:
%%writefile structured_new.py
from .imports import *

from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelEncoder, Imputer
from pandas.api.types import is_string_dtype, is_numeric_dtype
from sklearn.ensemble import forest
from sklearn.tree import export_graphviz
from sklearn.base import TransformerMixin, BaseEstimator

class StandardScalerPandas(TransformerMixin, BaseEstimator):
    def __init__(self, copy=True, with_mean=True, with_std=True):
        self.with_mean = with_mean
        self.with_std = with_std
        self.copy = copy
    
    def fit(self, X, y=None):
        # fits without nas via pandas library
        if type(X) == np.ndarray:
            X = pd.Series(X.reshape(-1))
        self.mean_ = X.dropna().mean()
        self.var_ = X.dropna().var()
        return self

    def transform(self, X):
        mean = self.mean_
        std_dev = np.sqrt(self.var_)
        if std_dev == 0:
            return X
        return (X-mean)/std_dev


def set_plot_sizes(sml, med, big):
    plt.rc('font', size=sml)          # controls default text sizes
    plt.rc('axes', titlesize=sml)     # fontsize of the axes title
    plt.rc('axes', labelsize=med)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=sml)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=sml)    # fontsize of the tick labels
    plt.rc('legend', fontsize=sml)    # legend fontsize
    plt.rc('figure', titlesize=big)  # fontsize of the figure title

def parallel_trees(m, fn, n_jobs=8):
        return list(ProcessPoolExecutor(n_jobs).map(fn, m.estimators_))

def draw_tree(t, df, size=10, ratio=0.6, precision=0):
    """ Draws a representation of a random forest in IPython.

    Parameters:
    -----------
    t: The tree you wish to draw
    df: The data used to train the tree. This is used to get the names of the features.
    """
    s=export_graphviz(t, out_file=None, feature_names=df.columns, filled=True,
                      special_characters=True, rotate=True, precision=precision)
    IPython.display.display(graphviz.Source(re.sub('Tree {',
       f'Tree {{ size={size}; ratio={ratio}', s)))

def combine_date(years, months=1, days=1, weeks=None, hours=None, minutes=None,
              seconds=None, milliseconds=None, microseconds=None, nanoseconds=None):
    years = np.asarray(years) - 1970
    months = np.asarray(months) - 1
    days = np.asarray(days) - 1
    types = ('<M8[Y]', '<m8[M]', '<m8[D]', '<m8[W]', '<m8[h]',
             '<m8[m]', '<m8[s]', '<m8[ms]', '<m8[us]', '<m8[ns]')
    vals = (years, months, days, weeks, hours, minutes, seconds,
            milliseconds, microseconds, nanoseconds)
    return sum(np.asarray(v, dtype=t) for t, v in zip(types, vals)
               if v is not None)

def get_sample(df,n):
    """ Gets a random sample of n rows from df, without replacement.

    Parameters:
    -----------
    df: A pandas data frame, that you wish to sample from.
    n: The number of rows you wish to sample.

    Returns:
    --------
    return value: A random sample of n rows of df.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    >>> get_sample(df, 2)
       col1 col2
    1     2    b
    2     3    a
    """
    idxs = sorted(np.random.permutation(len(df))[:n])
    return df.iloc[idxs].copy()

def add_datepart(df, fldname, drop=True):
    """add_datepart converts a column of df from a datetime64 to many columns containing
    the information from the date. This applies changes inplace.

    Parameters:
    -----------
    df: A pandas data frame. df gain several new columns.
    fldname: A string that is the name of the date column you wish to expand.
        If it is not a datetime64 series, it will be converted to one with pd.to_datetime.
    drop: If true then the original date column will be removed.

    Examples:
    ---------

    >>> df = pd.DataFrame({ 'A' : pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000'], infer_datetime_format=False) })
    >>> df

        A
    0   2000-03-11
    1   2000-03-12
    2   2000-03-13

    >>> add_datepart(df, 'A')
    >>> df

        AYear AMonth AWeek ADay ADayofweek ADayofyear AIs_month_end AIs_month_start AIs_quarter_end AIs_quarter_start AIs_year_end AIs_year_start AElapsed
    0   2000  3      10    11   5          71         False         False           False           False             False        False          952732800
    1   2000  3      10    12   6          72         False         False           False           False             False        False          952819200
    2   2000  3      11    13   0          73         False         False           False           False             False        False          952905600
    """
    fld = df[fldname]
    if not np.issubdtype(fld.dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start'):
        df[targ_pre+n] = getattr(fld.dt,n.lower())
    df[targ_pre+'Elapsed'] = fld.astype(np.int64) // 10**9
    if drop: df.drop(fldname, axis=1, inplace=True)

def is_date(x): return np.issubdtype(x.dtype, np.datetime64)

def train_cats(df):
    """Change any columns of strings in a panda's dataframe to a column of
    catagorical values. This applies the changes inplace.

    Parameters:
    -----------
    df: A pandas dataframe. Any columns of strings will be changed to
        categorical values.

    Examples:
    ---------

    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df

       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category
    """
    for n,c in df.items():
        if is_string_dtype(c): df[n] = c.astype('category').cat.as_ordered()

def apply_cats(df, trn):
    """Changes any columns of strings in df into categorical variables using trn as
    a template for the category codes.

    Parameters:
    -----------
    df: A pandas dataframe. Any columns of strings will be changed to
        categorical values. The category codes are determined by trn.

    trn: A pandas dataframe. When creating a category for df, it looks up the
        what the category's code were in trn and makes those the category codes
        for df.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df

       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category {a : 1, b : 2}

    >>> df2 = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['b', 'a', 'a']})
    >>> apply_cats(df2, df)

           col1 col2
        0     1    b
        1     2    a
        2     3    a

    now the type of col is category {a : 1, b : 2}
    """
    for n,c in df.items():
        if (n in trn.columns) and (trn[n].dtype.name=='category'):
            df[n] = pd.Categorical(c, categories=trn[n].cat.categories, ordered=True)

def fix_missing(df, col, name, na_dict):
    """ Fill missing data in a column of df with the median, and add a {name}_na column
    which specifies if the data was missing.

    Parameters:
    -----------
    df: The data frame that will be changed.

    col: The column of data to fix by filling in missing data.

    name: The name of the new filled column in df.

    na_dict: A dictionary of values to create na's of and the value to insert. If
        name is not a key of na_dict the median will fill any missing data. Also
        if name is not a key of na_dict and there is no missing data in col, then
        no {name}_na column is not created.


    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})
    >>> df
       col1 col2
    0     1    5
    1   nan    2
    2     3    2

    >>> fix_missing(df, df['col1'], 'col1', {})
    >>> df
       col1 col2 col1_na
    0     1    5   False
    1     2    2    True
    2     3    2   False


    >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})
    >>> df
       col1 col2
    0     1    5
    1   nan    2
    2     3    2

    >>> fix_missing(df, df['col2'], 'col2', {})
    >>> df
       col1 col2
    0     1    5
    1   nan    2
    2     3    2


    >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})
    >>> df
       col1 col2
    0     1    5
    1   nan    2
    2     3    2

    >>> fix_missing(df, df['col1'], 'col1', {'col1' : 500})
    >>> df
       col1 col2 col1_na
    0     1    5   False
    1   500    2    True
    2     3    2   False
    """
    if is_numeric_dtype(col):
        if pd.isnull(col).sum() or (name in na_dict):
            df[name+'_na'] = pd.isnull(col)
            filler = na_dict[name] if name in na_dict else col.median()
            df[name] = col.fillna(filler)
            na_dict[name] = filler
    return na_dict

def numericalize(df, col, name, max_n_cat):
    """ Changes the column col from a categorical type to it's integer codes.

    Parameters:
    -----------
    df: A pandas dataframe. df[name] will be filled with the integer codes from
        col.

    col: The column you wish to change into the categories.
    name: The column name you wish to insert into df. This column will hold the
        integer codes.

    max_n_cat: If col has more categories than max_n_cat it will not change the
        it to its integer codes. If max_n_cat is None, then col will always be
        converted.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df

       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category { a : 1, b : 2}

    >>> numericalize(df, df['col2'], 'col3', None)

       col1 col2 col3
    0     1    a    1
    1     2    b    2
    2     3    a    1
    """
    if not is_numeric_dtype(col) and ( max_n_cat is None or col.nunique()>max_n_cat):
        df[name] = col.cat.codes+1

def fit_scalers(df, mapper):
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        map_f = [([n],StandardScalerPandas()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    return mapper

def proc_df(df, y_fld=None, skip_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):

    """ proc_df takes a data frame df and splits off the response variable, and
    changes the df into an entirely numeric dataframe.

    Parameters:
    -----------
    df: The data frame you wish to process.

    y_fld: The name of the response variable

    skip_flds: A list of fields that dropped from df.

    do_scale: Standardizes each column in df,Takes Boolean Values(True,False)

    na_dict: a dictionary of na columns to add. Na columns are also added if there
        are any missing values.

    preproc_fn: A function that gets applied to df.

    max_n_cat: The maximum number of categories to break into dummy values, instead
        of integer codes.

    subset: Takes a random subset of size subset from df.

    mapper: If do_scale is set as True, the mapper variable
        calculates the values used for scaling of variables during training time(mean and standard deviation).

    Returns:
    --------
    [x, y, nas, mapper(optional)]:

        x: x is the transformed version of df. x will not have the response variable
            and is entirely numeric.

        y: y is the response variable

        nas: returns a dictionary of which nas it created, and the associated median.
        
        emb_lvls: returns list of tuples where each tuple is categorical name and number of levels+1

        mapper: A DataFrameMapper which stores the mean and standard deviation of the corresponding continous
        variables which is then used for scaling of during test-time.

    Examples:
    ---------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> df
       col1 col2
    0     1    a
    1     2    b
    2     3    a

    note the type of col2 is string

    >>> train_cats(df)
    >>> df

       col1 col2
    0     1    a
    1     2    b
    2     3    a

    now the type of col2 is category { a : 1, b : 2}

    >>> x, y, nas = proc_df(df, 'col1')
    >>> x

       col2
    0     1
    1     2
    2     1

    >>> data = DataFrame(pet=["cat", "dog", "dog", "fish", "cat", "dog", "cat", "fish"],
                 children=[4., 6, 3, 3, 2, 3, 5, 4],
                 salary=[90, 24, 44, 27, 32, 59, 36, 27])

    >>> mapper = DataFrameMapper([(:pet, LabelBinarizer()),
                          ([:children], StandardScaler())])

    >>>round(fit_transform!(mapper, copy(data)), 2)

    8x4 Array{Float64,2}:
    1.0  0.0  0.0   0.21
    0.0  1.0  0.0   1.88
    0.0  1.0  0.0  -0.63
    0.0  0.0  1.0  -0.63
    1.0  0.0  0.0  -1.46
    0.0  1.0  0.0  -0.63
    1.0  0.0  0.0   1.04
    0.0  0.0  1.0   0.21
    """
    if not skip_flds: skip_flds=[]
    if subset: df = get_sample(df,subset)
    df = df.copy()
    if preproc_fn: preproc_fn(df)
    if y_fld is None: y = None
    else:
        if not is_numeric_dtype(df[y_fld]): df[y_fld] = df[y_fld].cat.codes
        y = df[y_fld].values
        skip_flds += [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    # fit the scalers
    if do_scale: mapper = fit_scalers(df, mapper)    
    if na_dict is None: na_dict = {}
    # then fillna
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    # finally transform
    df[mapper.transformed_names_] = mapper.transform(df)
    # numericalize cats and count emb_lvls
    emb_lvls=[]
    for n,c in df.items():
        numericalize(df, c, n, max_n_cat)
        if not is_numeric_dtype(c):
            emb_lvls.append(count_emb_lvls(c, n))
    df = pd.get_dummies(df, dummy_na=True)
    # return the result
    res = [pd.get_dummies(df, dummy_na=True), y, na_dict, emb_lvls]
    if do_scale: res = res + [mapper]
    return res

def rf_feat_importance(m, df):
    return pd.DataFrame({'cols':df.columns, 'imp':m.feature_importances_}
                       ).sort_values('imp', ascending=False)

def set_rf_samples(n):
    """ Changes Scikit learn's random forests to give each tree a random sample of
    n random rows.
    """
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n))

def reset_rf_samples():
    """ Undoes the changes produced by set_rf_samples.
    """
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n_samples))

def get_nn_mappers(df, cat_vars, contin_vars):
    # Replace nulls with 0 for continuous, "" for categorical.
    for v in contin_vars: df[v] = df[v].fillna(df[v].max()+100,)
    for v in cat_vars: df[v].fillna('#NA#', inplace=True)

    # list of tuples, containing variable and instance of a transformer for that variable
    # for categoricals, use LabelEncoder to map to integers. For continuous, standardize
    cat_maps = [(o, LabelEncoder()) for o in cat_vars]
    contin_maps = [([o], StandardScaler()) for o in contin_vars]
    return DataFrameMapper(cat_maps).fit(df), DataFrameMapper(contin_maps).fit(df)

def count_emb_lvls(c, n):
    return (n, len(c.cat.categories)+1)

Overwriting structured_new.py
