In [1]:
import pandas as pd
import numpy as np

In [2]:
from numpy.random import randn

In [3]:
# Seed NumPy's legacy global RNG so the randn() draws used to build the
# sample frame are the same on every Restart & Run All.
np.random.seed(101)

In [4]:
# Build a 5x4 sample frame of random draws:
# rows are labelled A-E, columns are labelled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [5]:
# Display the sample frame.
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [6]:
# Conditional selection
# Comparison operators work element-wise on a pandas DataFrame, just as they
# do on a NumPy array. Comparing the whole frame does not filter anything by
# itself: it returns a same-shaped DataFrame of booleans, True where the
# condition holds and False where it does not.
# E.g.
df > 0 # Returns a DataFrame of booleans.

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [7]:
# Indexing the frame with a boolean DataFrame such as `df > 0` keeps the
# values where the mask is True and puts NaN wherever the mask is False.
df[df > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [8]:
# A condition on a single column gives one True/False value per row.
# Indexing with it filters rows: only the rows where the condition is True
# are returned, with all of their columns and no NaN filling.
# E.g.
df[df['W']>0]  # Row 'C', where the condition is False, is absent from the result

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [10]:
# The row filter can be combined with a column selection to grab a subset of
# the frame. Passing the boolean mask and the column list to a single .loc
# call returns the same rows and columns as chaining df[df['W']>0][['X','Y']],
# without building the intermediate filtered frame first.
# E.g.
df.loc[df['W'] > 0, ['X', 'Y']]

Unnamed: 0,X,Y
A,0.628133,0.907969
B,-0.319318,-0.848077
D,-0.758872,-0.933237
E,1.978757,2.605967


In [11]:
# A second example, this time picking rows: filter on W > 0 first, then
# select rows 'B' and 'D' by label from the filtered frame.
df.loc[df['W'] > 0].loc[['B', 'D']]

Unnamed: 0,W,X,Y,Z
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057


In [12]:
# Multiple conditions can be combined when filtering a pandas DataFrame.
# Use the element-wise '&' operator between the conditions instead of
# Python's 'and'. Each condition is wrapped in parentheses because '&' binds
# more tightly than the comparison operators.
# E.g.
df[(df['W'] > 0) & (df['Y']>1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [13]:
# Use '|' for an element-wise OR between conditions: a row is kept when at
# least one of the conditions is True.
# E.g.
df[(df['W'] > 0) | (df['Y']>1)]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [14]:
# Reset the index
# The current index (the row labels) is moved into a new column named 'index'
# and a sequence number (0, 1, 2, ...) becomes the new index.
# NOTE: This change is not in place - a new frame is returned and df itself
# is left untouched.
# E.g.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [15]:
# df still has its original A-E index: reset_index() returned a new frame.
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [16]:
# Values for a new column, one entry per row of df.
States = ['CA', 'FL', 'NY', 'WA', 'PA']

In [17]:
# Assigning to a new column label adds that column to df itself (in place).
df['states'] = States

In [18]:
# Display df to confirm the new 'states' column was added.
df

Unnamed: 0,W,X,Y,Z,states
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,FL
C,-2.018168,0.740122,0.528813,-0.589001,NY
D,0.188695,-0.758872,-0.933237,0.955057,WA
E,0.190794,1.978757,2.605967,0.683509,PA


In [19]:
# set_index() uses the 'states' column as the index of the returned frame.
# The previous index (A-E) is replaced and does not appear in the result.
# NOTE: This change is not in place either - a new frame is returned and df
# itself is left untouched.
df.set_index('states')

Unnamed: 0_level_0,W,X,Y,Z
states,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
FL,0.651118,-0.319318,-0.848077,0.605965
NY,-2.018168,0.740122,0.528813,-0.589001
WA,0.188695,-0.758872,-0.933237,0.955057
PA,0.190794,1.978757,2.605967,0.683509


In [20]:
# df is unchanged: it still has the A-E index and the 'states' column.
df

Unnamed: 0,W,X,Y,Z,states
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,FL
C,-2.018168,0.740122,0.528813,-0.589001,NY
D,0.188695,-0.758872,-0.933237,0.955057,WA
E,0.190794,1.978757,2.605967,0.683509,PA
