<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Introduction-to-pandas" data-toc-modified-id="Introduction-to-pandas-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Introduction to pandas</a></span></li><li><span><a href="#Add-new-columns-saving-Nan-values-get_dummies_nan" data-toc-modified-id="Add-new-columns-saving-Nan-values-get_dummies_nan-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Add new columns saving Nan values <code>get_dummies_nan</code></a></span></li><li><span><a href="#Compute-nan-features-for-rows:-nan_features" data-toc-modified-id="Compute-nan-features-for-rows:-nan_features-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Compute nan features for rows: <code>nan_features</code></a></span></li><li><span><a href="#Substitute-np.NaN-by-other-values:-MissingImputer-class" data-toc-modified-id="Substitute-np.NaN-by-other-values:-MissingImputer-class-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Substitute <code>np.NaN</code> by other values: <code>MissingImputer</code> class</a></span></li><li><span><a href="#Make-new-columns-from-a-series-containing-lists" data-toc-modified-id="Make-new-columns-from-a-series-containing-lists-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Make new columns from a series containing lists</a></span></li></ul></div>

# Introduction to pandas

In [13]:
import pandas as pd
import numpy as np
import scipy
from scipy import stats

from collections.abc import Iterable 

# Add new columns saving Nan values `get_dummies_nan` 

In [14]:
df_cars = pd.DataFrame([[2,"mercedes","middleclass"], 
                        [np.NaN,"mercedes","middleclass"],
                        [3,"Audi",np.NaN]],
                        columns= ["members","vehicles","status"])


In [15]:
print(df_cars)

   members  vehicles       status
0      2.0  mercedes  middleclass
1      NaN  mercedes  middleclass
2      3.0      Audi          NaN


In [16]:

def contains_nan(df_col):
    '''
    `contains_nan` checks if a certain column has nans
    '''
    return df_col.isna().any()


def get_dummies_nan(df, return_nancols=False,inplace=False):
    '''
    `get_dummies_nan` creates a new dataframe with binary columns stating wheather variables contain NaNs.
    
    
    
    Examples:
    --------
    
    
    >>> df = pd.DataFrame([[2,"mercedes","middleclass"], 
                           [np.NaN,"mercedes","middleclass"],
                           [3,"Audi",np.NaN]],
                           columns= ["members","vehicles","status"])
    
    >>> df 

           members  vehicles       status
    0      2.0  mercedes  middleclass
    1      NaN  mercedes  middleclass
    2      3.0      Audi          NaN
    
    >>> df_ = get_dummies_nan(df)
    
    >>> df_
    
           members  vehicles       status  members_nan  status_nan
    0      2.0  mercedes  middleclass        False       False
    1      NaN  mercedes  middleclass         True       False
    2      3.0      Audi          NaN        False        True

    >>> df_, nancols = get_dummies_nan(df, return_nancols=True)
    
    >>> nancols
    
    ['members', 'status']

    '''
    def add_nan_columns(df):
        cols_with_nan = []
        for c in df.columns:
            if contains_nan(df[c]):
                cols_with_nan.append(c)
                df[c + "_nan"] = df[c].isna().values    
        return df, cols_with_nan


    if inplace:
        df, cols_with_nan = add_nan_columns(df) 
        if return_nancols:
            return cols_with_nan
    else:
        df_copy = df.copy(deep=True)
        df_copy, cols_with_nan = add_nan_columns(df_copy) 
        
        if return_nancols:
            return df_copy, cols_with_nan
        else:
            return df_copy

In [17]:
df_= get_dummies_nan(df_cars)

In [18]:
df_

Unnamed: 0,members,vehicles,status,members_nan,status_nan
0,2.0,mercedes,middleclass,False,False
1,,mercedes,middleclass,True,False
2,3.0,Audi,,False,True


In [19]:
df_, nancols = get_dummies_nan(df_cars, return_nancols=True)

In [20]:
nancols

['members', 'status']

# Compute nan features for rows: `nan_features` 

In [21]:

def _nan_rowfeatures(df, reduce_methods=[np.mean, np.std], distances=False, inplace=False):
    
    """
    `nan_features` generates new features containing for a given row k the "statistic" returned by a reduce
    operation on row k.
    
    t also computes the difference between the most value of the transformed nans per row and the found value.

    Examples:
    ---------
    >>> df = pd.DataFrame([[2,["p","b",None]], 
                   [3,["a","c",None]],
                  [3,["d","w","a"]]],columns= ["first","second"])
                  
    >>> df
    
       first        second
    0      2  [p, b, None]
    1      3  [a, c, None]
    2      3     [d, w, a]
    
    >>> df_ = nan_rowfeatures(df)
    
    >>> df_

        members  vehicles       status  mean_nans  std_nans
    0      2.0  mercedes  middleclass   0.000000  0.000000
    1      NaN  mercedes  middleclass   0.333333  0.471405
    2      3.0      Audi          NaN   0.333333  0.471405

    """
    def create_col(df, reducer):
        colname     = reducer.__name__ + '_rownans'
        df[colname] = df_cars.apply(lambda x: reducer(pd.isna(x)), axis=1)
        if distances:
            mode        = stats.mode(df[colname])[0]  
            df[colname + '_l1_to_mode' ] = np.abs(df[colname] - mode)
            df[colname + '_l2_to_mode']  = (df[colname] - mode)**2
            
    if inplace:
        for reducer in reduce_methods:
            create_col(df, reducer)
    else:
        df_copy = df.copy(deep=True)
        
        for reducer in reduce_methods:
            create_col(df_copy, reducer)
        return df_copy
    
    

def nan_rowfeatures(df, reduce_methods=[np.mean, np.std], distances=False, inplace=False):
    
    """
    `nan_features` generates new features containing for a given row k the "statistic" returned by a reduce
    operation on row k.
    
    It also computes the difference between the most value of the transformed nans per row and the found value.
    
    Examples:
    ---------
    >>> df = pd.DataFrame([[2,["p","b",None]], 
                   [3,["a","c",None]],
                  [3,["d","w","a"]]],columns= ["first","second"])
                  
    >>> df
    
       first        second
    0      2  [p, b, None]
    1      3  [a, c, None]
    2      3     [d, w, a]
    
    >>> df_ = nan_rowfeatures(df)
    
    >>> df_

       members  vehicles       status  mean_rownans  std_rownans
    0      2.0  mercedes  middleclass      0.000000     0.000000
    1      NaN  mercedes  middleclass      0.333333     0.433013
    2      3.0      Audi          NaN      0.333333     0.433013
    
    >>> df_ = nan_rowfeatures(df, distances=True)

           members  vehicles       status  mean_rownans  mean_rownans_l1_to_mode  \
    0      2.0  mercedes  middleclass      0.000000                 0.333333   
    1      NaN  mercedes  middleclass      0.333333                 0.000000   
    2      3.0      Audi          NaN      0.333333                 0.000000   

       mean_rownans_l2_to_mode  std_rownans  std_rownans_l1_to_mode  \
    0                 0.111111     0.000000                0.372678   
    1                 0.000000     0.372678                0.000000   
    2                 0.000000     0.372678                0.000000   

       std_rownans_l2_to_mode  
    0                0.138889  
    1                0.000000  
    2                0.000000  

    """
    
    def create_col(df, reducer):
        colname                       = reducer.__name__ + '_rownans'
        df[colname]                   = reducer(pd.isna(df),axis=1)
        if distances:
            mode                          = stats.mode(df[colname])[0]  
            df[colname + '_l1_to_mode']   = np.abs(mode - df[colname])
            df[colname + '_l2_to_mode']   = (mode - df[colname])**2
            
    if inplace:
        for reducer in reduce_methods:
            create_col(df, reducer)
    else:
        df_copy = df.copy(deep=True)
        
        for reducer in reduce_methods:
            create_col(df_copy, reducer)

        return df_copy

In [22]:
%%time
_nan_rowfeatures(df_cars)

CPU times: user 8 ms, sys: 8 ms, total: 16 ms
Wall time: 96.8 ms


Unnamed: 0,members,vehicles,status,mean_rownans,std_rownans
0,2.0,mercedes,middleclass,0.0,0.0
1,,mercedes,middleclass,0.333333,0.471405
2,3.0,Audi,,0.333333,0.471405


In [23]:
%%time
nan_rowfeatures(df_cars)

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 10.2 ms


Unnamed: 0,members,vehicles,status,mean_rownans,std_rownans
0,2.0,mercedes,middleclass,0.0,0.0
1,,mercedes,middleclass,0.333333,0.433013
2,3.0,Audi,,0.333333,0.433013


In [24]:
nan_rowfeatures(df_cars, distances=True)

Unnamed: 0,members,vehicles,status,mean_rownans,mean_rownans_l1_to_mode,mean_rownans_l2_to_mode,std_rownans,std_rownans_l1_to_mode,std_rownans_l2_to_mode
0,2.0,mercedes,middleclass,0.0,0.333333,0.111111,0.0,0.372678,0.138889
1,,mercedes,middleclass,0.333333,0.0,0.0,0.372678,0.0,0.0
2,3.0,Audi,,0.333333,0.0,0.0,0.372678,0.0,0.0


# Substitute `np.NaN` by other values: `MissingImputer` class

In [26]:
from sklearn.base import BaseEstimator, TransformerMixin


In [81]:

class _MissingImputer(BaseEstimator, TransformerMixin):
    '''
    `MissingImputer` implements a `fit` and `transform` methods that enable replacing np.NaN values by numerical values.

    This class uses an insane amount of RAM.
    '''
    def __init__(self, treatment="mean"):
        self._allowed_treatments = ["fixed_value", "mean",'median','mode','None',"most_frequent"]     
        assert treatment in self._allowed_treatments or isinstance(treatment,(int,float)),  "the treatment introduced {} is not valid. Please use one in {}".format(treatment, self._allowed_treatments)
        self.treatment = treatment
    
    def fit(self, X, y):
        """
        Learns statistics to impute nans.
        """
        
        if self.treatment == "mean" or self.treatment==None:
            self.treatment_method = sklearn.impute.SimpleImputer(missing_values=np.nan, strategy='mean')
        elif self.treatment == "median":
            self.treatment_method = sklearn.impute.SimpleImputer(missing_values=np.nan, strategy='median')
        elif self.treatment == "most_frequent":
            self.treatment_method = sklearn.impute.SimpleImputer(missing_values=np.nan, strategy='most_frequent')
        elif isinstance(self.treatment, (int,float)):
            self.treatment_method = sklearn.impute.SimpleImputer(missing_values=np.nan,
                                                                 strategy="constant",fill_value=self.treatment)       
        
        if isinstance(X, np.ndarray):   
            self.treatment_method.fit(X)
            
        if isinstance(X, pd.DataFrame):       
            self.treatment_method.fit(X.values)
        
        return self

    def transform(self, X):
        if self.treatment==None:
            return X
        return self.treatment_method.transform(X)



In [383]:
import sklearn
from sklearn import *
d = sklearn.datasets.load_iris()
X, y = d["data"], d["target"]

X[0,0] = np.NaN
X[2,3] = np.NaN
X[0,2] = np.NaN

In [384]:
X[0:3]

array([[nan, 3.5, nan, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, nan]])

In [385]:
imputer = _MissingImputer(treatment="mean")
imputer.fit(X,y)
imputer.transform(X)[0:3]

array([[5.84832215, 3.5       , 3.7738255 , 0.2       ],
       [4.9       , 3.        , 1.4       , 0.2       ],
       [4.7       , 3.2       , 1.3       , 1.20604027]])

In [386]:
imputer = _MissingImputer(treatment="median")
imputer.fit(X,y)
imputer.transform(X)[0:3]

array([[5.8, 3.5, 4.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 1.3]])

In [387]:
imputer = _MissingImputer(treatment="most_frequent")
imputer.fit(X,y)
imputer.transform(X)[0:3]

array([[5. , 3.5, 1.5, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2]])

In [388]:
%%timeit
imputer = _MissingImputer(treatment="most_frequent")
imputer.fit(X,y)

778 µs ± 39.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


##### Improving MissingImputer

In [391]:

class NaNTransformer():
    def __init__(self, reducer=stats.mode):
        self.reducer = reducer
        self.col_id  = None 
        
    def fit(self, X_col, col_id):
        self.col_id = col_id
        self.learned_value = self.reducer(X_col)[0]
        
    def transform(self, X_col):
        X_col[np.isnan(X_col)] = self.learned_value
        return X_col

def itercols(X):
    
    if isinstance(X, np.ndarray):
        for i in range(X.shape[1]):
            yield i, X[:,i]
            
    if isinstance(X, pd.DataFrame):
        for colname in X:
            yield colname, X[colname]  
        
        
class MissingImputer(BaseEstimator, TransformerMixin):
    '''
    `MissingImputer` implements a `fit` and `transform` methods that enable replacing np.NaN values by numerical values.

    This class less RAM han _MissingImputer
    '''
    def __init__(self, treatment=stats.mode):
        self._allowed_treatments = [np.mean, np.median, stats.mode]     
        assert treatment in self._allowed_treatments or isinstance(treatment,(int,float)),  "the treatment introduced {} is not valid. Please use one in {}".format(treatment, self._allowed_treatments)
        self.treatment = treatment
    
    def fit(self, X, y):
        """
        Learns statistics to impute nans.
        """
        
        col_transformers = {}
        
        for col_id, X_col in itercols(X):
            nan_transformer = NaNTransformer(reducer=self.treatment) 
            nan_transformer.fit(X_col, col_id = col_id)
            col_transformers[col_id] = nan_transformer            
                
        self.col_transformers = col_transformers
        return self

    def transform(self, X):
        
        if self.treatment==None:
            return X
        
        for col_id, X_col in itercols(X):
            X_col = self.col_transformers[col_id].transform(X_col)
            
        return X



In [392]:
import sklearn
from sklearn import *
d = sklearn.datasets.load_iris()
X, y = d["data"], d["target"]

X[0,0] = np.NaN
X[2,3] = np.NaN
X[0,2] = np.NaN

In [393]:
X[0:4]

array([[nan, 3.5, nan, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, nan],
       [4.6, 3.1, 1.5, 0.2]])

In [394]:
imputer = MissingImputer()

In [395]:
imputer.fit(X,y)

MissingImputer(treatment=<function mode at 0x7f13601ca598>)

In [396]:
imputer.col_transformers

{0: <__main__.NaNTransformer at 0x7f135a6abf28>,
 1: <__main__.NaNTransformer at 0x7f135a6abc50>,
 2: <__main__.NaNTransformer at 0x7f135a6abb38>,
 3: <__main__.NaNTransformer at 0x7f135a6a5668>}

In [397]:
%%timeit
imputer = MissingImputer(treatment=stats.mode)
imputer.fit(X,y)

a = imputer.transform(X)

489 µs ± 25.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


# Make new columns from a series containing lists

Column `vehicles` contains lists with different vehicle names. Let us assume we consider this feature to be a list or the ordered vehicles a family has.

For example: Family 0 has 2 vehicles, and the most used one is a Mercedes, then a Toyota.

Now we want to create 3 features from this column: `vehicle_1`, `vehicle_2`, `vehicle_3` and write the different
models in the corresponding columns

In [None]:
df_cars = pd.DataFrame([[2,["mercedes","toyota",None],"middleclass"], 
                        [3,["Renault","Mercedes",None],"middleclass"],
                        [3,["Audi","Mercedes","Tesla"],"uppermiddleclass"]],
                        columns= ["members","vehicles","status"])

df_cars

In [None]:
def proc_df_collist(df: pd.DataFrame, colname: str, inplace=False):
    """
    
    `proc_df_collist` takes a dataframe and a column made of lists and generates new columns containing
    values from the lists. For each position in the list it generates a new column. The nimber of generated
    columns equals the length of the largest list in `df[colname]`. Each new collumn  k is filled
    with the values of the lists at position k. If the value does not exist (because the position does not exist)
    the position is filled with `NaN`. 
    
    Given `df`  and `colname`, create as many new columns as `len(df[colname].iloc[0])`
    Write in column `colname_k[j]` the value found `df[colname].iloc[j][k]`.
    
    
    Examples:
    ---------
    >>> df = pd.DataFrame([[2,["p","b",None]], 
                   [3,["a","c",None]],
                  [3,["d","w","a"]]],columns= ["first","second"])
                  
    >>> df
    
       first        second
    0      2  [p, b, None]
    1      3  [a, c, None]
    2      3     [d, w, a]

    >>> newcols = proc_df_collist(df, "second")

    >>> newcols
          second_0 second_1 second_2
    0        p        b     None
    1        a        c     None
    2        d        w        a

    
    >>> df2 = pd.DataFrame([[2,["p"]], 
                   [3,["a",2,3]],
                   [3,[4]]],columns= ["A","B"])
                   
    >>> df2
       A          B
    0  2        [p]
    1  3  [a, 2, 3]
    2  3        [4]

    >>> proc_df_collist(df2, "B")

      B_0  B_1  B_2
    0   p  NaN  NaN
    1   a  2.0  3.0
    2   4  NaN  NaN

    """
    assert isinstance(df, pd.DataFrame), "type(df)={} but it should be pd.DataFrame".format(type(df))
    assert isinstance(colname, str), "type(columname)={} but it should be str".format(type(str))
    assert isinstance(df[colname].iloc[0],(list,set, np.ndarray)), "type(df[colname].iloc[0])={} but it, should be in [list, set, np.ndarray]".format(type(df[colname].iloc[0]))
    
    
    #n_new_cols = len(df[colname].iloc[0])
    n_new_cols = df[colname].apply(len).max()
    colnames   = [colname + "_" + str(i) for i in range(n_new_cols)]   
    
    return pd.DataFrame(df[colname].tolist(), columns=colnames)

In [None]:
df = pd.DataFrame([[2,["p","b",None]], 
                   [3,["a","c",None]],
                   [3,["d","w","a"]]],columns= ["first","second"])


In [None]:
df

In [None]:
proc_df_collist(df, "second")

If the column passed to `proc_df_collist` contains iterables with different sizes then it will generate as many colums as the longest iterable. Then it will fill with `NaN`  positions where we don't have information

In [None]:
df2 = pd.DataFrame([[2,["p"]], 
                   [3,["a",2,3]],
                   [3,[4]]],columns= ["A","B"])

In [None]:
proc_df_collist(df2, "B")