In [49]:
import numpy as np
import pandas as pd

# Small demo frame: column "a" holds 0..9 and column "b" holds 5..14.
df = pd.DataFrame({"a": list(range(10)), "b": list(range(5, 15))})

# Comparing a column with a scalar yields a boolean Series (a row mask).
print(df['a'] > 5)
print()
# Indexing the frame with that mask keeps only the rows where it is True.
print(df[df['a'] > 5])
print()
# The tilde operator inverts the mask, i.e. it selects the complement set.
print(df[~(df['a'] > 5)])
print()

# A plain Python list of booleans can also be used as a row mask, but `~` is
# not defined for lists. Prefer an np.array (or a Series) whenever the mask
# has to be negated. The loop below tries the negation on both forms so the
# failure of the list and the success of the array are both visible.
bool_lst = np.array([False, False, False, False, False, False, True, True, True, True])
for mask in (bool_lst.tolist(), bool_lst):
    try:
        print()
        print(df[~mask])
    except TypeError as e:
        # Reached for the list only: unary `~` raises TypeError on a list.
        print()
        print("error occurred, error message is {}".format(e))

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8     True
9     True
Name: a, dtype: bool

   a   b
6  6  11
7  7  12
8  8  13
9  9  14

   a   b
0  0   5
1  1   6
2  2   7
3  3   8
4  4   9
5  5  10


erorr occurred, error message is bad operand type for unary ~: 'list'

   a   b
0  0   5
1  1   6
2  2   7
3  3   8
4  4   9
5  5  10


In [35]:
import numpy as np
import pandas as pd

sample_size = 100
# Fixed seed so that "Restart Kernel -> Run All" reproduces the same sample.
random_seed = 0

# Draw `sample_size` random values per distribution from one seeded generator.
# Note: "uniform" holds integers drawn from [1, 100) (the upper bound is
# exclusive for randint).
rng = np.random.RandomState(random_seed)
test_dist_dict = {
    "uniform": rng.randint(1, 100, sample_size),
    "norm_10_5": rng.normal(10, 5, sample_size),
    "norm_0_1": rng.normal(0, 1, sample_size),
    "exp": rng.exponential(1, sample_size),
    "poisson": rng.poisson(10, sample_size),
}

# Append one artificial outlier to every column: twice the column maximum.
# Every column therefore ends up with sample_size + 1 values.
test_dist_dict = {
    name: np.append(values, values.max() * 2)
    for name, values in test_dist_dict.items()
}
test_dist_df = pd.DataFrame(test_dist_dict)

"""각 칼럼이 norm(0, 1)을 따른다고 가정하고 standardization해줍니다. 
"""
# Standardize every column of test_dist_df column-wise (axis=0) and wrap the
# result in a DataFrame that keeps the original index and column labels.
from sklearn import preprocessing

new_X = pd.DataFrame(
    data=preprocessing.scale(test_dist_df, axis=0),
    columns=test_dist_df.columns,
    index=test_dist_df.index,
)
print(new_X.head())
# All columns are now on the standardized scale, i.e. they are treated as if
# they followed norm(0, 1). (Scaling alone does not change the shape of a
# distribution.)
# Cell result: per-column check of whether *every* value is >= 2.
np.all(new_X >= 2, axis=0)

        exp  norm_0_1  norm_10_5   poisson   uniform
0 -0.577388  0.786163  -0.018351 -0.254162  0.211285
1 -0.268417 -0.614713  -0.446385 -0.036617  0.305293
2 -0.440451 -0.279165  -0.746177 -1.341887  0.900676
3 -0.503357 -1.318059   0.378131  1.051109 -0.164747
4 -0.562727 -0.256175   0.139560  2.138835  0.869340


array([False, False, False, False, False], dtype=bool)

In [26]:
# Split the standardized rows into two complementary groups using one mask.
# A row is flagged when at least one of its values lies 2 or more standard
# deviations away from the mean (|value| >= 2). Negating the same mask with
# `~` guarantees that every row appears in exactly one of the two groups,
# including rows with a value exactly at +/-2.
# NOTE(review): a row containing NaN would not be flagged and would therefore
# be listed as "inside"; new_X is not expected to contain NaN here.
is_outlier_row = (new_X.abs() >= 2).any(axis=1)

print("out of 2 sigma")
print(new_X[is_outlier_row])
print()
print("inside of 2 sigma")
print(new_X[~is_outlier_row].head())

        exp  norm_0_1  norm_10_5   poisson   uniform
0  0.467419  0.072519  -0.255446  0.106474  1.382747
1  0.297859 -0.214551   0.102819 -0.137932  0.925175
2  0.134533  1.333550  -0.648849 -0.871147 -0.186071
3  2.337668  0.491843  -0.124735 -0.137932  0.532971
4 -0.658539  0.181512   0.576872  0.595284 -1.264632
out of 2 sigma
          exp  norm_0_1  norm_10_5   poisson   uniform
3    2.337668  0.491843  -0.124735 -0.137932  0.532971
5    2.134955  0.940126  -1.307098  1.572905  1.252012
30  -0.271442 -2.222816   0.469963  1.084094  0.336869
72  -0.606971 -1.328234  -2.177983  0.106474 -1.133898
84   3.390007  0.167918   0.620680  0.106474  0.140766
99   0.329987  2.472778   0.184584  1.328500 -1.199265
100  7.474420  5.043572   5.189969  6.216604  4.945269

inside of 2 sigma
        exp  norm_0_1  norm_10_5   poisson   uniform
0  0.467419  0.072519  -0.255446  0.106474  1.382747
1  0.297859 -0.214551   0.102819 -0.137932  0.925175
2  0.134533  1.333550  -0.648849 -0.871147 -0.186