In [1]:
from fastai.imports import *
from fastai.structured import *
from fastai.column_data import *

In [2]:
def mapper_check(mapper):
    for feat_tup in mapper.features:
        print('col: {0}, mean: {1}, std_dev: {2}'.format(feat_tup[0][0], feat_tup[1].mean_, feat_tup[1].var_))

In [3]:
train = pd.DataFrame(np.zeros([1000000,2]))
train[[0,1]] = np.random.normal(size=(1000000,2))
train[2] = np.nan
train[2][-100000:] = np.random.normal(size=(100000))
train[3] = np.nan
train[3][:-50000] = np.random.choice([np.nan, 'a', 'b', 'c'], size=950000)
train.columns = ['num_0','num_1','num_2','categorical']
valid = train[-50000:].copy()
train = train[:-50000]
valid['categorical'][-50000:] = np.random.choice([np.nan, 'a', 'b', 'c', 'd', 'e'], size=50000)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [4]:
print('unique vals in col "categorical": {0}'.format(len(train['categorical'].unique())))
train.describe()

unique vals in col "categorical": 4


Unnamed: 0,num_0,num_1,num_2
count,950000.0,950000.0,50000.0
mean,0.001484,0.000419,0.007199
std,1.000304,1.001274,0.996604
min,-4.736283,-4.927042,-4.207608
25%,-0.674277,-0.674927,-0.662512
50%,0.001677,-0.000116,0.007616
75%,0.677395,0.675339,0.676824
max,4.940119,4.869177,3.940715


In [5]:
print('unique vals in col "categorical": {0}'.format(len(valid['categorical'].unique())))
valid.describe()

unique vals in col "categorical": 6


Unnamed: 0,num_0,num_1,num_2
count,50000.0,50000.0,50000.0
mean,0.000935,-0.004427,-0.008899
std,0.995222,1.002839,0.997322
min,-4.158127,-3.929174,-3.866122
25%,-0.671504,-0.67685,-0.684508
50%,0.006233,-0.005086,-0.01474
75%,0.670213,0.675853,0.663687
max,3.811489,4.477588,4.019393


In [6]:
train_cats(train)
X_train, y_train, nas_train, mapper_train = proc_df(train, do_scale=True)
mapper_check(mapper_train)

col: num_0, mean: [ 0.00148], std_dev: [ 1.00061]
col: num_1, mean: [ 0.00042], std_dev: [ 1.00255]
col: num_2, mean: [ 0.00759], std_dev: [ 0.05227]
col: num_2_na, mean: [ 0.94737], std_dev: [ 0.04986]


In [7]:
train_cats(valid)
X_valid, y_valid, nas_valid, _ = proc_df(valid, do_scale=True, na_dict=nas_train)
assert(nas_train == nas_valid)

# proposed changes proc_df

In [8]:
from sklearn.base import TransformerMixin, BaseEstimator

class StandardScalerNew(TransformerMixin, BaseEstimator):
    def __init__(self, copy=True, with_mean=True, with_std=True):
        self.with_mean = with_mean
        self.with_std = with_std
        self.copy = copy
    
    def fit(self, X, y=None):
        # fits without nas via pandas library
        if type(X) == np.ndarray:
            X = pd.Series(X.reshape(-1))
        self.mean_ = X.dropna().mean()
        self.var_ = X.dropna().var()
        return self

    def transform(self, X):
        mean = self.mean_
        std_dev = np.sqrt(self.var_)
        if std_dev == 0:
            return X
        return (X-mean)/std_dev
    
def fit_scalers(df, mapper):
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        map_f = [([n],StandardScalerNew()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    return mapper

def count_emb_lvls(c, n):
    return (n, len(c.cat.categories)+1)

def proc_df_new(df, y_fld=None, skip_flds=None, do_scale=False, na_dict=None, preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    if not skip_flds: skip_flds=[]
    if subset: df = get_sample(df,subset)
    df = df.copy()
    if preproc_fn: preproc_fn(df)
    if y_fld is None: y = None
    else:
        if not is_numeric_dtype(df[y_fld]): df[y_fld] = df[y_fld].cat.codes
        y = df[y_fld].values
        skip_flds += [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    # fit the scalers
    if do_scale: mapper = fit_scalers(df, mapper)    
    if na_dict is None: na_dict = {}
    # then fillna
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    # finally transform
    df[mapper.transformed_names_] = mapper.transform(df)
    # numericalize cats and count emb_lvls
    emb_lvls=[]
    for n,c in df.items():
        numericalize(df, c, n, max_n_cat)
        if not is_numeric_dtype(c):
            emb_lvls.append(count_emb_lvls(c, n))
    df = pd.get_dummies(df, dummy_na=True)
    # return the result
    res = [pd.get_dummies(df, dummy_na=True), y, na_dict, emb_lvls]
    if do_scale: res = res + [mapper]
    return res

In [9]:
X_train_n, y_train_n, nas_train_n, emb_lvls_train_n, mapper_train_n = proc_df_new(train, do_scale=True)
print(emb_lvls_train_n)
mapper_check(mapper_train_n)

[('categorical', 5)]
col: num_0, mean: 0.0014843667751694922, std_dev: 1.0006075935637342
col: num_1, mean: 0.00041876392460569605, std_dev: 1.0025497097410379
col: num_2, mean: 0.007199023519144798, std_dev: 0.9932198460835925


In [10]:
X_valid_n, y_valid_n, nas_valid_n, emb_lvls_valid_n, mapper_valid_n = proc_df_new(valid, do_scale=True, na_dict=nas_train)
print(emb_lvls_valid_n)
assert(nas_train == nas_valid)

[('categorical', 7)]


In [16]:
pd.concat([X_train.join(X_train_n, how='outer', rsuffix='_n').head(2),X_train.join(X_train_n, how='outer', rsuffix='_n').tail(2)])

Unnamed: 0,num_0,num_1,num_2,categorical,num_2_na,num_0_n,num_1_n,num_2_n,categorical_n,num_2_na_n
0,0.898383,-0.093202,9.6e-05,4,0.235702,0.898382,-0.093202,0.000418,4,True
1,-1.160443,-0.924158,9.6e-05,3,0.235702,-1.160442,-0.924157,0.000418,3,True
949998,-0.896605,-0.1744,-7.596479,3,-4.242641,-0.896604,-0.1744,-1.742338,3,False
949999,-0.594665,-1.370013,-6.794065,3,-4.242641,-0.594665,-1.370012,-1.558254,3,False


In [17]:
pd.concat([X_valid.join(X_valid_n, how='outer', rsuffix='_n').head(2),X_valid.join(X_valid_n, how='outer', rsuffix='_n').tail(2)])

Unnamed: 0,num_0,num_1,num_2,categorical,num_2_na,num_0_n,num_1_n,num_2_n,categorical_n,num_2_na_n
950000,0.469347,-0.293232,-0.44511,1,0.0,0.469342,-0.293229,-0.445105,1,False
950001,1.142744,-1.798259,0.536568,4,0.0,1.142733,-1.798241,0.536562,4,False
999998,-0.533348,-0.551918,-0.562895,6,0.0,-0.533343,-0.551913,-0.562889,6,False
999999,2.912771,0.267068,-0.583958,5,0.0,2.912742,0.267066,-0.583952,5,False
