## Logic in Python (and pandas)
* https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf

In [1]:
import pandas as pd
import numpy as np
# Demo frame: three numeric columns with one NaN each, indexed by a
# two-level MultiIndex whose levels are named n and v.
row_labels = pd.MultiIndex.from_tuples(
    [("d", 1), ("d", 2), ("e", 2), ("e", 3), ("e", 4)],
    names=["n", "v"],
)
values = {
    "a": [4, 5, 6, 6, np.nan],
    "b": [7, 8, np.nan, 9, 9],
    "c": [10, 11, 12, np.nan, 12],
}
df = pd.DataFrame(values, index=row_labels)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4.0,7.0,10.0
d,2,5.0,8.0,11.0
e,2,6.0,,12.0
e,3,6.0,9.0,
e,4,,9.0,12.0


In [2]:
# Preview the top of the frame; 5 rows is head()'s default, spelled out here.
df.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4.0,7.0,10.0
d,2,5.0,8.0,11.0
e,2,6.0,,12.0
e,3,6.0,9.0,
e,4,,9.0,12.0


In [3]:
# Keep the rows whose b is not 7. NaN compares as "not equal", so the row
# with a missing b is kept as well.
df.loc[df["b"].ne(7)]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,2,5.0,8.0,11.0
e,2,6.0,,12.0
e,3,6.0,9.0,
e,4,,9.0,12.0


In [4]:
# Element-wise membership test: True where a is one of the listed values.
df.a.isin([5])

n  v
d  1    False
   2     True
e  2    False
   3    False
   4    False
Name: a, dtype: bool

In [5]:
# Boolean mask of missing values (True = NaN); method form of pd.isnull(df).
df.isna()

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,False,False,False
d,2,False,False,False
e,2,False,True,False
e,3,False,False,True
e,4,True,False,False


In [6]:
# Count the missing values in column a (each True in the mask counts as 1).
df["a"].isna().sum()

1

In [7]:
# Boolean mask of present values (True = not NaN); method form of pd.notnull(df).
df.notna()

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,True,True,True
d,2,True,True,True
e,2,True,False,True
e,3,True,True,False
e,4,False,True,True


In [8]:
# Number of non-missing values in each column.
df.notna().sum()

a    4
b    4
c    4
dtype: int64

In [9]:
# Per-row flag for column a: True where a value is present.
df["a"].notna()

n  v
d  1     True
   2     True
e  2     True
   3     True
   4    False
Name: a, dtype: bool

* pandas (element-wise): `&`, `|`, `~`, `^`, `df.any()`, `df.all()`
* Python equivalents: `and`, `or`, `not`, xor, `any`, `all`

In [10]:
# Rows where b equals 7 OR a equals 5; `|` is the element-wise "or".
df.loc[df["b"].eq(7) | df["a"].eq(5)]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4.0,7.0,10.0
d,2,5.0,8.0,11.0


In [11]:
# First two rows, selected by position.
df.iloc[:2]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4.0,7.0,10.0
d,2,5.0,8.0,11.0


In [12]:
# Draw a random 30% of the rows (2 of the 5 rows in the recorded output).
# No random_state is passed, so the selected rows differ between runs.
df.sample(frac=0.3)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
e,3,6.0,9.0,
d,1,4.0,7.0,10.0


In [13]:
# Sample 5 rows from the 5-row frame: every row comes back, in shuffled
# order. No random_state is passed, so the order differs between runs.
df.sample(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
e,4,,9.0,12.0
d,1,4.0,7.0,10.0
d,2,5.0,8.0,11.0
e,3,6.0,9.0,
e,2,6.0,,12.0


In [14]:
# Display df again: the sample() calls above returned new frames, and df
# itself is still in its original row order.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4.0,7.0,10.0
d,2,5.0,8.0,11.0
e,2,6.0,,12.0
e,3,6.0,9.0,
e,4,,9.0,12.0


In [15]:
# Last two rows of the frame.
df.tail(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
e,3,6.0,9.0,
e,4,,9.0,12.0


In [16]:
# Replace the demo frame: integers in a, single-letter strings in b,
# floats with one NaN in c, on the default 0..4 integer index.
df = pd.DataFrame(
    {
        "a": [1, 10, 8, 11, -1],
        "b": ["a", "b", "d", "c", "e"],
        "c": [1.0, 2.0, np.nan, 3.0, 4.0],
    }
)
df

Unnamed: 0,a,b,c
0,1,a,1.0
1,10,b,2.0
2,8,d,
3,11,c,3.0
4,-1,e,4.0


In [17]:
# The single row with the largest value in column a.
df.nlargest(n=1, columns="a")

Unnamed: 0,a,b,c
3,11,c,3.0


In [18]:
# The three rows with the smallest values in column a, smallest first.
df.nsmallest(n=3, columns="a")

Unnamed: 0,a,b,c
4,-1,e,4.0
0,1,a,1.0
2,8,d,


In [19]:
# Display df again: nlargest/nsmallest above returned new frames, and df
# still holds all five rows in their original order.
df

Unnamed: 0,a,b,c
0,1,a,1.0
1,10,b,2.0
2,8,d,
3,11,c,3.0
4,-1,e,4.0


In [20]:
# Prepare boolean columns for trying out all() and any():
#   d -> every value True
#   e -> every value False
#   f -> True everywhere except a single False at index label 1
df["d"], df["e"], df["f"] = True, False, True
df.at[1, "f"] = False
df

Unnamed: 0,a,b,c,d,e,f
0,1,a,1.0,True,False,True
1,10,b,2.0,True,False,False
2,8,d,,True,False,True
3,11,c,3.0,True,False,True
4,-1,e,4.0,True,False,True


In [21]:
# all() reports, per column, whether every element is True.
# Among the boolean columns only d passes: e is all False and f contains
# one False, so both come out False.
df.all(axis=0)

a     True
b     True
c     True
d     True
e    False
f    False
dtype: bool

In [22]:
# any() reports, per column, whether at least one element is True.
# f still passes despite its single False; only e (all False) fails.
df.any(axis=0)

a     True
b     True
c     True
d     True
e    False
f     True
dtype: bool

In [23]:
# df.any() is True for a column when at least one value is True.
# A False value does not make the result True: f comes out True because its
# other values are True, and e comes out False because it has no True at all.
df.any()

a     True
b     True
c     True
d     True
e    False
f     True
dtype: bool

In [24]:
# Application: build a missing-value mask where True means "value present"
# and False means "value missing".
df.notna()

Unnamed: 0,a,b,c,d,e,f
0,True,True,True,True,True,True
1,True,True,True,True,True,True
2,True,True,False,True,True,True
3,True,True,True,True,True,True
4,True,True,True,True,True,True


In [25]:
# Use the not-null mask to find columns with missing data: a column that has
# at least one missing value comes out False (here only c).
df.notna().all()

a     True
b     True
c    False
d     True
e     True
f     True
dtype: bool

In [26]:
# True for every column that holds at least one non-missing value.
df.notna().any()

a    True
b    True
c    True
d    True
e    True
f    True
dtype: bool

In [27]:
# Use the is-null mask to find columns with missing data: a column that has
# at least one missing value comes out True (here only c).
df.isna().any()

a    False
b    False
c     True
d    False
e    False
f    False
dtype: bool