In [1]:
import pandas as pd
import numpy as np

# Load the sample dataset from data.csv and display the resulting DataFrame.
df = pd.read_csv('data.csv')
df

Unnamed: 0,A,B,C,D
0,0.187497,1.12215,-0.988277,-1.985934
1,0.360803,-0.562243,-0.340693,-0.986988
2,-0.040627,0.067333,-0.452978,0.686223
3,-0.279572,-0.702492,0.252265,0.958977
4,0.537438,-1.737568,0.714727,-0.939288
5,0.070011,-0.516443,-1.655689,0.246721
6,0.001268,0.951517,2.10736,-0.108726
7,-0.185258,0.85652,-0.686285,1.104195
8,0.387023,1.706336,-2.452653,0.260466
9,-1.054974,0.556775,-0.945219,-0.030295


# Filtering the data

In [2]:
# Boolean-mask filtering: keep only the rows where column A is greater than 0.2.
df[df.A > 0.2] # or df[df['A'] > 0.2]

Unnamed: 0,A,B,C,D
1,0.360803,-0.562243,-0.340693,-0.986988
4,0.537438,-1.737568,0.714727,-0.939288
8,0.387023,1.706336,-2.452653,0.260466


In [3]:
# .loc takes a row mask and a column list: rows where A > 0.2, columns A and D only.
df.loc[df.A > 0.2, ['A', 'D']] # specify the selected columns

Unnamed: 0,A,D
1,0.360803,-0.986988
4,0.537438,-0.939288
8,0.387023,0.260466


In [4]:
# Multiple conditions: wrap each comparison in parentheses, because the
# element-wise operators (&, |, ~) bind tighter than comparison operators.
df[(df.A > 0) & (df.A < 0.5)]
# &  -> element-wise AND (both conditions must hold)

# df[(df.A>0.5) | (df.A < 0)]
# |  -> element-wise OR (either condition may hold)

# df[~(df.A > 0)]
# ~  -> element-wise NOT (inverts the mask)

Unnamed: 0,A,B,C,D
0,0.187497,1.12215,-0.988277,-1.985934
1,0.360803,-0.562243,-0.340693,-0.986988
5,0.070011,-0.516443,-1.655689,0.246721
6,0.001268,0.951517,2.10736,-0.108726
8,0.387023,1.706336,-2.452653,0.260466


### Filtering: String & Regex

`df[columns].str.contains(str_or_regex, case=True)`  
`df[columns].str.startswith()`  
`df[columns].isin(options)`

### Query Function
`.query`

In [5]:
# .query() filters rows using a condition written as a string over the column names.
df.query('A>0.5')

Unnamed: 0,A,B,C,D
4,0.537438,-1.737568,0.714727,-0.939288


In [6]:
# Conditions inside the query string can be combined with `and`.
df.query('A>0 and B<0')

Unnamed: 0,A,B,C,D
1,0.360803,-0.562243,-0.340693,-0.986988
4,0.537438,-1.737568,0.714727,-0.939288
5,0.070011,-0.516443,-1.655689,0.246721


# `.apply()` 
The function passed to `.apply()` on a `DataFrame` receives a `Series` as input.  
- `axis=0` (default) -> apply to the columns  
  -> with a `DataFrame` -> applied column by column (each column is a `Series` that is passed to the function)  
  -> with a `Series` -> applied to each element
- `axis=1` -> apply to the rows (each row is a `Series` that is passed to the function)  
  -> `axis=1` only works with a `DataFrame`

In [7]:
df[['A', 'B']].apply(np.sum) # sum(A), sum(B) -> each selected column in turn (only A and B are selected here)
# apply on a DataFrame with axis=0 -> each column is passed to the function in turn

A   -0.016391
B    1.741885
dtype: float64

In [8]:
df.A.apply(np.sum) # apply to column A
# df.A -> Series
# each element of column A is passed to the function one at a time,
# so np.sum receives a single value and returns it unchanged (see the output)

0    0.187497
1    0.360803
2   -0.040627
3   -0.279572
4    0.537438
5    0.070011
6    0.001268
7   -0.185258
8    0.387023
9   -1.054974
Name: A, dtype: float64

In [9]:
df.apply(np.sum, axis=1) # sum(row 0), sum(row 1), sum(row 2), ...
# axis=1 -> each row is passed to the function in turn

0   -1.664564
1   -1.529121
2    0.259951
3    0.229178
4   -1.424691
5   -1.855400
6    2.951419
7    1.089172
8   -0.098828
9   -1.473713
dtype: float64

In [10]:

def sqrt(x):
    """Return the square root of a positive scalar; return x unchanged otherwise.

    Works on a single element only: `x > 0` must evaluate to one boolean
    so that it can be used in the conditional expression.
    """
    return x**(1/2) if x > 0 else x

# df.apply(sqrt) # Error -> x would be a whole column, which sqrt cannot handle
# -> apply column by column instead: Series.apply passes each element
#    (rather than the whole column) to the function.
# NOTE: this overwrites df in place, so re-running the cell takes the square
# root of the already-transformed values again.
for col in df.columns:
    df[col] = df[col].apply(sqrt)
df

Unnamed: 0,A,B,C,D
0,0.433009,1.059316,-0.988277,-1.985934
1,0.600669,-0.562243,-0.340693,-0.986988
2,-0.040627,0.259486,-0.452978,0.828386
3,-0.279572,-0.702492,0.50226,0.979274
4,0.733102,-1.737568,0.845415,-0.939288
5,0.264596,-0.516443,-1.655689,0.49671
6,0.035609,0.975457,1.451675,-0.108726
7,-0.185258,0.925484,-0.686285,1.050807
8,0.622112,1.306268,-2.452653,0.510359
9,-1.054974,0.746174,-0.945219,-0.030295


In [11]:
# Apply several functions at once? The lambda returns a dict of results for each column,
# so the result is an object-dtype Series whose values are dicts (see the output).
# NOTE(review): df.agg(['sum', 'mean']) should give the same numbers as a DataFrame — confirm before switching.
df.apply(lambda x: {
    "sum": np.sum(x),
    "mean": np.mean(x)
})

A    {'sum': 1.1286652940333695, 'mean': 0.11286652...
B    {'sum': 1.753438372047186, 'mean': 0.175343837...
C    {'sum': -4.722443929813905, 'mean': -0.4722443...
D    {'sum': -0.18569580252408316, 'mean': -0.01856...
dtype: object

In [12]:
# With axis=1 each row is passed to the lambda, so the result holds one
# dict of sum/mean per row.
df.apply(lambda x: {
    "sum": np.sum(x),
    "mean": np.mean(x)
}, axis=1)

0    {'sum': -1.4818859452923432, 'mean': -0.370471...
1    {'sum': -1.2892552060710996, 'mean': -0.322313...
2    {'sum': 0.5942668101469691, 'mean': 0.14856670...
3    {'sum': 0.49946960334512436, 'mean': 0.1248674...
4    {'sum': -1.0983390873694239, 'mean': -0.274584...
5    {'sum': -1.4108259045990779, 'mean': -0.352706...
6    {'sum': 2.354015211004485, 'mean': 0.588503802...
7    {'sum': 1.1047474902051626, 'mean': 0.27618687...
8    {'sum': -0.013914609927937138, 'mean': -0.0034...
9    {'sum': -1.284314427699292, 'mean': -0.3210786...
dtype: object