In [64]:
from itertools import groupby

import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import Imputer

**1. Build dataframe with missing data**

In [65]:
# 5 x 3 frame holding the integers 0..14; the row labels skip 'b', 'd' and 'g'
df_init = pd.DataFrame(
    data=np.arange(15).reshape(5, 3),
    index=list('acefh'),
    columns=['one', 'two', 'three'],
)

In [66]:
df_init

Unnamed: 0,one,two,three
a,0,1,2
c,3,4,5
e,6,7,8
f,9,10,11
h,12,13,14


In [67]:
# Reindexing on the full a..h range adds all-NaN rows for the labels that
# df_init does not have ('b', 'd', 'g').
df = df_init.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
# Knock out a few extra cells. Label-based assignment through .loc replaces
# DataFrame.set_value, which was deprecated in pandas 0.21 and removed in 1.0.
df.loc[['a', 'f', 'h'], 'one'] = np.nan
df.loc['b', 'three'] = 0
# From here on: one row per variable, one column per observation.
df = df.T
# DataFrame.shape is (rows, columns); unpack it after the transpose so the
# names describe the frame that the rest of the notebook works with.
n_rows, n_cols = df.shape
df

Unnamed: 0,a,b,c,d,e,f,g,h
one,,,3.0,,6.0,,,
two,1.0,,4.0,,7.0,10.0,,13.0
three,2.0,0.0,5.0,,8.0,11.0,,14.0


**2. Find missing data**

In [68]:
# True where a value is missing, False otherwise (df.notnull() gives the opposite mask)
df.isnull()

Unnamed: 0,a,b,c,d,e,f,g,h
one,True,True,False,True,False,True,True,True
two,False,True,False,True,False,False,True,False
three,False,False,False,True,False,False,True,False


In [69]:
# for each column, True if and only if it has at least one missing value
# (the default axis=0 reduces over the rows; pass axis=1 to get one flag per row)
df.isnull().any()

a     True
b     True
c    False
d     True
e    False
f     True
g     True
h     True
dtype: bool

In [70]:
# for each column, True if and only if every value is missing
# (the default axis=0 reduces over the rows; pass axis=1 to get one flag per row)
df.isnull().all()

a    False
b    False
c    False
d     True
e    False
f    False
g     True
h    False
dtype: bool

**3. Find indicator of missing data**

In [71]:
df

Unnamed: 0,a,b,c,d,e,f,g,h
one,,,3.0,,6.0,,,
two,1.0,,4.0,,7.0,10.0,,13.0
three,2.0,0.0,5.0,,8.0,11.0,,14.0


In [72]:
# number of missing values per row (axis=1 sums the boolean mask across the columns)
df.isnull().sum(axis=1)

one      6
two      3
three    2
dtype: int64

In [73]:
# proportion of missing data per row: the mean of a boolean mask is the
# fraction of True values, so no separately stored column count is needed
df.isnull().mean(axis=1)

one      0.750
two      0.375
three    0.250
dtype: float64

In [74]:
# a gap is a run of consecutive missing values
def count_length_gaps(series):
    """Return the length of every gap in ``series``, in order of appearance.

    ``series`` must provide ``isnull()`` yielding an iterable of booleans
    (e.g. a pandas Series). Consecutive equal flags are grouped into runs and
    only the runs of missing values are measured. The result is an empty list
    when nothing is missing.
    """
    gap_lengths = []
    for is_missing, run in groupby(series.isnull()):
        if is_missing:
            # count the run's items without materialising them
            gap_lengths.append(sum(1 for _ in run))
    return gap_lengths

In [75]:
# list of gap lengths for each row (axis=1 hands every row to count_length_gaps)
length_gap = df.apply(count_length_gaps, axis=1)
length_gap

one      [2, 1, 3]
two      [1, 1, 1]
three       [1, 1]
dtype: object

In [76]:
# number of gaps per row, i.e. how many entries each row's list of gap lengths has
length_gap.apply(len)

one      3
two      3
three    2
dtype: int64

In [77]:
# mean gap length per row
length_gap.apply(np.mean)

one      2.0
two      1.0
three    1.0
dtype: float64

In [78]:
# total number of missing values per row, obtained by summing the row's gap lengths
length_gap.apply(np.sum)

one      6
two      3
three    2
dtype: int64

In [79]:
# longest gap per row; a row with no missing values has an empty list of gaps,
# on which np.max would raise ValueError, so such rows are reported as 0
length_gap.apply(lambda gaps: np.max(gaps) if len(gaps) else 0)

one      3
two      1
three    1
dtype: int64

In [80]:
# shortest gap per row; a row with no missing values has an empty list of gaps,
# on which np.min would raise ValueError, so such rows are reported as 0
length_gap.apply(lambda gaps: np.min(gaps) if len(gaps) else 0)

one      1
two      1
three    1
dtype: int64

**4. Delete missing data**

In [81]:
df

Unnamed: 0,a,b,c,d,e,f,g,h
one,,,3.0,,6.0,,,
two,1.0,,4.0,,7.0,10.0,,13.0
three,2.0,0.0,5.0,,8.0,11.0,,14.0


In [82]:
# drop every column that contains at least one missing value
df.dropna(axis=1)

Unnamed: 0,c,e
one,3.0,6.0
two,4.0,7.0
three,5.0,8.0


In [83]:
# drop only the columns in which every value is missing
df.dropna(axis=1, how='all')

Unnamed: 0,a,b,c,e,f,h
one,,,3.0,6.0,,
two,1.0,,4.0,7.0,10.0,13.0
three,2.0,0.0,5.0,8.0,11.0,14.0


**5. Replace missing data (Imputation)**

In [84]:
df

Unnamed: 0,a,b,c,d,e,f,g,h
one,,,3.0,,6.0,,,
two,1.0,,4.0,,7.0,10.0,,13.0
three,2.0,0.0,5.0,,8.0,11.0,,14.0


> See http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html  
> Example: 
http://scikit-learn.org/stable/auto_examples/missing_values.html#sphx-glr-auto-examples-missing-values-py

In [85]:
# row-wise mean imputation: with axis=1 each NaN is replaced by the mean of its own row
# NOTE(review): Imputer was deprecated in scikit-learn 0.20 and removed in 0.22; its
# replacement, sklearn.impute.SimpleImputer, has no axis argument. Confirm which
# scikit-learn version this notebook targets before re-running.
imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
imp.fit_transform(df.values)

array([[  4.5       ,   4.5       ,   3.        ,   4.5       ,
          6.        ,   4.5       ,   4.5       ,   4.5       ],
       [  1.        ,   7.        ,   4.        ,   7.        ,
          7.        ,  10.        ,   7.        ,  13.        ],
       [  2.        ,   0.        ,   5.        ,   6.66666667,
          8.        ,  11.        ,   6.66666667,  14.        ]])