In [1]:
import numpy as np
import pandas as pd

In [4]:
# create a simple dataframe
df=pd.DataFrame({
    'name':['Jane','John','Ashley','Mike','Emily','Jack','Catlin'],
    'ctg':['A','A','C','B','B','C','B'],
    'val':np.random.random(7).round(2),
    'val2':np.random.randint(1,10,size=7)
})
df

Unnamed: 0,name,ctg,val,val2
0,Jane,A,0.97,7
1,John,A,0.43,9
2,Ashley,C,0.99,2
3,Mike,B,0.36,7
4,Emily,B,0.91,1
5,Jack,C,0.47,1
6,Catlin,B,0.79,8


# 1 Logical Operators

In [5]:
# Select rows in which val columns is greater than 0.5
df[df['val']>0.5]

Unnamed: 0,name,ctg,val,val2
0,Jane,A,0.97,7
2,Ashley,C,0.99,2
4,Emily,B,0.91,1
6,Catlin,B,0.79,8


In [6]:
# Logical operators work on strings as well
df[df['name']>'J']

Unnamed: 0,name,ctg,val,val2
0,Jane,A,0.97,7
1,John,A,0.43,9
3,Mike,B,0.36,7
5,Jack,C,0.47,1


# 2 Multiple Logical Operators

In [9]:
# & for 'and' and | for 'or'
df[(df['val']>0.5) & (df['val2']==1)]

Unnamed: 0,name,ctg,val,val2
4,Emily,B,0.91,1


# 3 Isin

In [13]:
# The isin method is another way of applying multiple condition for filtering. 
# For instance, we can filter the names 
# that exist in a given list.
names=['John','Catlin','Mike']
df[df['name'].isin(names)]

Unnamed: 0,name,ctg,val,val2
1,John,A,0.43,9
3,Mike,B,0.36,7
6,Catlin,B,0.79,8


# 4 Str accessor

In [14]:
# str allows us to filter rows based on strings
df[df['name'].str.startswith('A')]

Unnamed: 0,name,ctg,val,val2
2,Ashley,C,0.99,2


In [15]:
df[df['name'].str.contains('t')]

Unnamed: 0,name,ctg,val,val2
6,Catlin,B,0.79,8


# 5 Tile(~)

In [18]:
# It is used for NOT filtering 
# IF we ~ before the filter expression we get rows that 
# donot follow the condition
# in below case we get names that DON'T start with 'J'
df[~df['name'].str.startswith('J')]

Unnamed: 0,name,ctg,val,val2
2,Ashley,C,0.99,2
3,Mike,B,0.36,7
4,Emily,B,0.91,1
6,Catlin,B,0.79,8


# 6 Query

In [19]:
# This one offers great flexibility at filtering
df.query('ctg=="B" and val>0.5')

Unnamed: 0,name,ctg,val,val2
4,Emily,B,0.91,1
6,Catlin,B,0.79,8


# 7 Nlargest or nsmallest

In [21]:
# gives data having top 3 largest val value
df.nlargest(3,'val')

Unnamed: 0,name,ctg,val,val2
2,Ashley,C,0.99,2
0,Jane,A,0.97,7
4,Emily,B,0.91,1


In [22]:
df.nsmallest(2,'val')

Unnamed: 0,name,ctg,val,val2
3,Mike,B,0.36,7
1,John,A,0.43,9


# 8 Loc and iloc

In [23]:
# loc: select rows or columns using labels
# iloc : select rows or columns using indices
# gets row 3 and 4 with all columns
df.iloc[3:5, :]

Unnamed: 0,name,ctg,val,val2
3,Mike,B,0.36,7
4,Emily,B,0.91,1


In [None]:
# if dataframe has integer index then loc and iloc accomplish 
# same thing