In [1]:
import numpy as np

In [2]:
import pandas as pd

In [4]:
from numpy.random import randn

In [6]:
np.random.seed(101)

In [7]:
#Build a 5x4 DataFrame of random draws: rows are labelled A-E and columns W-Z
df = pd.DataFrame(data=randn(5, 4), index=list('ABCDE'), columns=list('WXYZ'))

In [8]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [None]:
#Let us see conditional select with dataframes

In [10]:
df[df>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [None]:
#Here we see that wherever the condition is satisfied the original value is kept. Everywhere else, NaN (null) is returned

In [11]:
#Now, we don't usually use conditional selection in this way. Instead, what we commonly do is select only those rows
#for which a condition on some column is True, i.e.,
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [None]:
#The above returns only rows A,B,D,E as the condition df['W']>0 is true only for these

In [14]:

df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [15]:
#Similarly, we could try
df[df['Y']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
C,-2.018168,0.740122,0.528813,-0.589001
E,0.190794,1.978757,2.605967,0.683509


In [16]:
#The result is again a DataFrame, from which we could select a specific column, like,
df[df['Y']>0]['X']

A    0.628133
C    0.740122
E    1.978757
Name: X, dtype: float64

In [18]:
#or we could even select several columns from the result by passing a list of column names, i.e.,
df[df['Y']>0][['X','Z']]

Unnamed: 0,X,Z
A,0.628133,0.503826
C,0.740122,-0.589001
E,1.978757,0.683509


In [19]:
#If the above one liner is confusing we could always break this down into simpler steps, like,
boolser=df['Y']>0

In [21]:
result = df[boolser]

In [25]:
columns = result[['X','Z']]

In [26]:
columns

Unnamed: 0,X,Z
A,0.628133,0.503826
C,0.740122,-0.589001
E,1.978757,0.683509


In [None]:
#The only problem with the second approach is that every named intermediate (boolser, result, ...) stays in memory
#for as long as the variable exists, so a lot of intermediate steps/variables consumes more memory.
#So throughout we are going to use a lot of one-liners

In [29]:
#Combining two boolean Series with Python's 'and' raises ValueError, because 'and' needs a single True/False
#value from each operand. The error is the point of this cell, so we catch it and print the message;
#that way "Restart & Run All" can continue past this demonstration instead of stopping here.
try:
    df[(df['Y']>0) and (df['X']>0)]
except ValueError as err:
    print("ValueError: " + str(err))

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
#In the above example, where we try to combine two conditions with 'and', it fails because each condition
#returns a boolean Series, and Python's 'and' needs a single True/False value, which a Series cannot provide.
#So, we need to use the element-wise operators '&' (and) or '|' (or), with each condition wrapped in parentheses,
#to combine multiple conditions in a df, i.e.,

In [30]:
df[(df['Y']>0) & (df['X']>0)]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
C,-2.018168,0.740122,0.528813,-0.589001
E,0.190794,1.978757,2.605967,0.683509


In [None]:
#DataFrame Indexes
#We can reset the index of a df using df.reset_index()

In [31]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [32]:
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [None]:
#Now the index is reset to numbers and the original index labels (A,B,C,D,E) have become a new column named 'index'

In [None]:
#Similarly, we have set_index() to use the values of a column as the index
#Let us first create a new column

In [36]:
df['States']=['KA','AN','MH','BR','RJ']
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,KA
B,0.651118,-0.319318,-0.848077,0.605965,AN
C,-2.018168,0.740122,0.528813,-0.589001,MH
D,0.188695,-0.758872,-0.933237,0.955057,BR
E,0.190794,1.978757,2.605967,0.683509,RJ


In [35]:
#Use the States values as the row index; drop=False keeps 'States' as a regular column as well
df.set_index('States', drop=False)

Unnamed: 0_level_0,W,X,Y,Z,States
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KA,2.70685,0.628133,0.907969,0.503826,KA
AN,0.651118,-0.319318,-0.848077,0.605965,AN
MH,-2.018168,0.740122,0.528813,-0.589001,MH
BR,0.188695,-0.758872,-0.933237,0.955057,BR
RJ,0.190794,1.978757,2.605967,0.683509,RJ


In [None]:
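#We can see that the States values are now used as the index (and States is still present as a column).
#Note that set_index() returns a new DataFrame; df itself is unchanged unless we assign the result back to it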