## Missing Data

In [2]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [3]:
# Small Series of strings with one missing entry to practise on
data = Series(
    ['One', 'Two', np.nan, 'Three'],  # np.nan marks the missing value
)

In [4]:
# Boolean mask with the same index as `data`: True where the entry is null
data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# Dropping null values: the remaining entries keep their original labels (0, 1, 3)
data.dropna()

0      One
1      Two
3    Three
dtype: object

In [6]:
# 4x3 frame with nulls scattered through it; the last row is entirely null
df1 = DataFrame([
    [1, 2, 3],
    [np.nan, 5, 6],
    [7, np.nan, 9],
    [np.nan, np.nan, np.nan],
])

In [7]:
# Show the full frame so the positions of the missing values are visible
df1

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0
3,,,


In [8]:
# Defaults spelled out (axis=0, how='any'): a row is removed as soon as
# it contains a single null
clean_df1 = df1.dropna(axis=0, how='any')

In [9]:
# Only row 0 of df1 had no nulls, so it is the only row left
clean_df1

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In [10]:
# how='all' relaxes the rule: a row is dropped only when every value in it is null
df1.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0


In [12]:
# axis='columns' (same as axis=1) applies the test per column instead of per row:
# any column that contains a null is dropped.
# NOTE(review): every column of df1 holds a null in row 3, so this should leave
# no columns; the saved output below appears to show the unmodified frame --
# re-run the cell to confirm.
df1.dropna(axis='columns')

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0
3,,,


In [14]:
# A second frame whose rows contain 1, 1, 2 and 3 nulls respectively,
# used to demonstrate threshold-based dropping and filling
npn = np.nan
df2 = DataFrame([
    [1, 2, 3, npn],
    [2, npn, 5, 6],
    [npn, 7, npn, 9],
    [1, npn, npn, npn],
])

In [15]:
# Show the frame: each row is missing a different number of values
df2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


In [18]:
# thresh=2 keeps only the rows that have at least two non-null values
df2.dropna(axis=0, thresh=2)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0


In [19]:
# df2 itself still has all four rows: dropna returned a new frame
df2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


In [20]:
# Replace every null in the frame with the same constant
df2.fillna(value=1)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,1.0
1,2.0,1.0,5.0,6.0
2,1.0,7.0,1.0,9.0
3,1.0,1.0,1.0,1.0


In [24]:
# A dict maps each column label to its own fill value.
# fillna returns a new frame and leaves df2 untouched; assign the result
# (or pass inplace=True) to keep the filled values.
df2.fillna(value={0: 0, 1: 1, 2: 2, 3: 3})

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,3.0
1,2.0,1.0,5.0,6.0
2,0.0,7.0,2.0,9.0
3,1.0,1.0,2.0,3.0


## Index Hierarchy

In [25]:
from numpy.random import randn

In [26]:
# Series with a two-level (hierarchical) index:
# outer level 1/2, inner level a/b/c.
# Seed NumPy's global RNG first so the random values -- and every output
# derived from `ser` -- are identical on each Restart & Run All.
# (Outputs saved before the seed was added will differ from a fresh run.)
np.random.seed(0)

ser = Series(randn(6), index=[[1, 1, 1, 2, 2, 2], ['a', 'b', 'c', 'a', 'b', 'c']])

In [27]:
# Display the Series; the outer index label is printed once per group
ser

1  a    0.747131
   b   -2.658663
   c    0.007770
2  a    0.533293
   b    0.385740
   c    0.131862
dtype: float64

In [28]:
# The index is a MultiIndex built from the two lists passed at construction
ser.index

MultiIndex(levels=[[1, 2], ['a', 'b', 'c']],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [29]:
# Label lookup on the outer level: returns the sub-series (a, b, c) for outer label 1
ser.loc[1]

a    0.747131
b   -2.658663
c    0.007770
dtype: float64

In [30]:
# Same lookup for outer label 2
ser.loc[2]

a    0.533293
b    0.385740
c    0.131862
dtype: float64

In [31]:
# Cross-section on the inner level: the 'a' entry of every outer label,
# returned with the inner level dropped
ser.xs('a', level=1)

1    0.747131
2    0.533293
dtype: float64

In [32]:
# Pivot the innermost index level (level=-1, the default) into columns,
# turning the two-level Series into a DataFrame.
# Note: this rebinds df1, replacing the frame from the Missing Data section.
df1 = ser.unstack(level=-1)

In [33]:
# Rows are the outer labels (1, 2); columns are the former inner labels (a, b, c)
df1

Unnamed: 0,a,b,c
1,0.747131,-2.658663,0.00777
2,0.533293,0.38574,0.131862


In [37]:
# Frame with hierarchical labels on both axes:
# rows are (letter, number) pairs, columns are (city, temperature) pairs
df2 = DataFrame(
    np.arange(16).reshape(4, 4),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['NY', 'NY', 'LA', 'SF'], ['cold', 'hot', 'hot', 'cold']],
)

In [38]:
# Display the frame; neither the row nor the column levels have names yet
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,NY,NY,LA,SF
Unnamed: 0_level_1,Unnamed: 1_level_1,cold,hot,hot,cold
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [44]:
# naming index levels
# (assigning to .names labels the levels of df2's existing row and column indexes)

df2.index.names = ['ID1','ID2']  # row levels: outer, inner

df2.columns.names=['Cities','Temp']  # column levels: outer, inner

In [45]:
# The level names (ID1/ID2 and Cities/Temp) now appear in the display
df2

Unnamed: 0_level_0,Cities,NY,NY,LA,SF
Unnamed: 0_level_1,Temp,cold,hot,hot,cold
ID1,ID2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [46]:
# Swap the two column levels so 'Temp' becomes the outer level and
# 'Cities' the inner one; the data and column order are unchanged
df2.swaplevel(i='Cities', j='Temp', axis=1)

Unnamed: 0_level_0,Temp,cold,hot,hot,cold
Unnamed: 0_level_1,Cities,NY,NY,LA,SF
ID1,ID2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


In [51]:
# Sort the columns by their (Cities, Temp) labels.
# axis is passed by keyword: sort_index takes keyword-only arguments in
# pandas 2.x, so the positional form sort_index(1) raises TypeError there.
df2.sort_index(axis=1)

Unnamed: 0_level_0,Cities,LA,NY,NY,SF
Unnamed: 0_level_1,Temp,hot,cold,hot,cold
ID1,ID2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,1,2,0,1,3
a,2,6,4,5,7
b,1,10,8,9,11
b,2,14,12,13,15


In [52]:
# Sum the columns that share the same 'Temp' label (cold vs. hot) within each row.
# DataFrame.sum(level=...) was removed in pandas 2.0, so group the transposed
# frame by the 'Temp' level, sum, and transpose back to the original orientation.

df2.T.groupby(level='Temp').sum().T

Unnamed: 0_level_0,Temp,cold,hot
ID1,ID2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,3,3
a,2,11,11
b,1,19,19
b,2,27,27
