In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small demo frame reused by the cells below: row 1 has a missing value in
# col2, and rows 2 and 3 are exact duplicates of each other.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df1 = pd.DataFrame({
    'col1': [1, 2, 3, 3],
    'col2': ['a', np.nan, 'a', 'a'],
    'col3': ['A', 'B', 'C', 'C'],
})
df1

Unnamed: 0,col1,col2,col3
0,1,a,A
1,2,,B
2,3,a,C
3,3,a,C


## Select columns of interest

In [3]:
# Keep every row (the `:` selector) but only the listed columns.
columns_of_interest = ['col1', 'col3']
df1_interests = df1.loc[:, columns_of_interest]
df1_interests

Unnamed: 0,col1,col3
0,1,A
1,2,B
2,3,C
3,3,C


## Select rows where `col2` starts with `a`

In [4]:
# Build the boolean mask first; na=False makes a missing col2 count as
# "does not start with 'a'", so that row is filtered out instead of
# leaving a NaN in the mask.
col2_starts_with_a = df1['col2'].str.startswith('a', na=False)
df1_starts_with_a = df1.loc[col2_starts_with_a]
df1_starts_with_a

Unnamed: 0,col1,col2,col3
0,1,a,A
2,3,a,C
3,3,a,C


## Select rows where `col3` does not contain `A`

In [5]:
# Negate the mask with `~` rather than comparing it to False.
# regex=False treats 'A' as a literal substring rather than a pattern, and
# na=False states the missing-value policy explicitly: a missing col3 is
# treated as "does not contain 'A'", so such a row would be kept.
col3_contains_A = df1['col3'].str.contains('A', regex=False, na=False)
df1_not_contain_A = df1[~col3_contains_A]
df1_not_contain_A

Unnamed: 0,col1,col2,col3
1,2,,B
2,3,a,C
3,3,a,C


## Drop duplicate rows

In [6]:
# Rows are compared across all columns; the first occurrence of each
# duplicated row is kept. The result is only displayed, df1 is not modified.
df1.drop_duplicates(keep='first')

Unnamed: 0,col1,col2,col3
0,1,a,A
1,2,,B
2,3,a,C


## Drop rows with `NaN` in `col2`

In [7]:
# True where col2 holds a value, False where it is missing.
col2_is_present = pd.notnull(df1['col2'])
df1_drop_nan = df1.loc[col2_is_present]
df1_drop_nan

Unnamed: 0,col1,col2,col3
0,1,a,A
2,3,a,C
3,3,a,C


## Exclude `col2` groups that have fewer than 2 rows

In [8]:
# Group the rows by their col2 value and keep only the groups that have at
# least two rows. Despite its name, the result holds the rows that SURVIVE
# the filter, i.e. groups of size >= 2. As the output shows, the row whose
# col2 is NaN is not part of the result.
groups_by_col2 = df1.groupby(by=['col2'])
df1_group_number_less_than_2 = groups_by_col2.filter(lambda group: len(group) > 1)
df1_group_number_less_than_2

Unnamed: 0,col1,col2,col3
0,1,a,A
2,3,a,C
3,3,a,C


## Apply function to each row

In [9]:
def build_col4(row):
    """Combine ``col2`` and ``col3`` of one row into the value for ``col4``.

    Args:
        row: A mapping-like row (for example the Series that
            ``DataFrame.apply`` passes when ``axis=1``) providing the keys
            ``'col2'`` and ``'col3'``.

    Returns:
        ``row['col2'] + row['col3']``, or ``np.nan`` when ``col2`` is missing.
    """
    # pd.isna() recognises every missing-value marker (np.nan, None, pd.NA,
    # float('nan')). An identity check against one particular NaN object does
    # not, and the np.NAN alias no longer exists in NumPy 2.0.
    if pd.isna(row['col2']):
        return np.nan
    return row['col2'] + row['col3']

# axis='columns' (the same as axis=1) hands build_col4 one row at a time;
# the per-row results are stored on df1 as the new column col4.
col4_values = df1.apply(build_col4, axis='columns')
df1['col4'] = col4_values
df1

Unnamed: 0,col1,col2,col3,col4
0,1,a,A,aA
1,2,,B,
2,3,a,C,aC
3,3,a,C,aC


## Concatenate two DataFrames with `axis=0` (stack rows)

In [10]:
# Note: this cell rebinds df1, replacing the frame used by the cells above.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df1 = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', np.nan, 'a'], 'col3': ['A', 'B', 'C']})
df2 = pd.DataFrame({'col1': [100, 200, 300], 'col2': ['x', 'y', 'z'], 'col3': ['X', 'Y', 'Z']})
# axis=0 stacks the rows of df2 underneath those of df1. Each frame keeps its
# own row labels, so the index of the result repeats 0, 1, 2; pass
# ignore_index=True to get a fresh 0..n-1 index instead.
df3 = pd.concat([df1, df2], axis=0)
df3

Unnamed: 0,col1,col2,col3
0,1,a,A
1,2,,B
2,3,a,C
0,100,x,X
1,200,y,Y
2,300,z,Z


## Merge two DataFrames to add columns

In [11]:
# Note: this cell rebinds df1, df2 and df3 from the previous cell.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df1 = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', np.nan, 'a'], 'col4': ['A1', 'B2', 'C2']})
df2 = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', np.nan, 'a'], 'col5': ['A2', 'B2', 'C2']})
# Name the join keys explicitly instead of relying on merge() to pick up the
# columns the two frames happen to share (col1 and col2). As the output
# shows, the row whose col2 is NaN in both frames is matched as well.
df3 = pd.merge(df1, df2, on=['col1', 'col2'])
df3

Unnamed: 0,col1,col2,col4,col5
0,1,a,A1,A2
1,2,,B2,B2
2,3,a,C2,C2
