## 여러가지 선언 방법

In [1]:
import pandas as pd

df = pd.DataFrame(
    {"a" : [4 ,5, 6],
    "b" : [7, 8, 9],
    "c" : [10, 11, 12]}, index = [1, 2, 3])

df

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12


In [2]:
df.loc[2]

a     5
b     8
c    11
Name: 2, dtype: int64

In [3]:
df.loc[2, 'a']

5

In [4]:
df.loc[[1,3], ['a', 'c']]

Unnamed: 0,a,c
1,4,10
3,6,12


In [5]:
df = pd.DataFrame(
    [
        [4, 7, 10], 
         [5, 8, 11],
         [6, 9, 12]
    ],
    index=[1, 2, 3], 
    columns=['a', 'b', 'c'])

df

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12


In [6]:
df = pd.DataFrame(
    { 
        "a" : [4,5,6,6],
        "b" : [7,8,9,9],
        "c" : [10,11,12,12]
    },
    index=pd.MultiIndex.from_tuples(
        [
            ('d', 1),
            ('d', 2),
            ('e', 2),
            ('e', 3)
        ],
        names=['n', 'v']
    )
)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,2,6,9,12
e,3,6,9,12


## 조건으로 색인

In [7]:
df[df.b > 7]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,2,5,8,11
e,2,6,9,12
e,3,6,9,12


In [8]:
df.b > 7

n  v
d  1    False
   2     True
e  2     True
   3     True
Name: b, dtype: bool

## 중복 제거

In [9]:
df.drop_duplicates()
df
# 이후 df를 다시 확인해보면 중복 제거 전 데이터가 그대로 있음.
# df.drop_duplicates(inplace=True) # 권장하지 않는 방법.
# => df = df.drop_duplicates 로 변수에 담자.

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,2,6,9,12
e,3,6,9,12


In [10]:
df = df.drop_duplicates(keep='first')  # 중복된 값 중, 첫번째 항목을 남긴다 (last : 마지막 항목을 남긴다)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
d,2,5,8,11
e,2,6,9,12


## 논리 연산

In [11]:
df[df.b != 8]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
e,2,6,9,12


In [12]:
df[df['a'].isin([5,6])]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,2,5,8,11
e,2,6,9,12


In [13]:
import numpy as np

df = pd.DataFrame(
    { 
        "a" : [4,5,6,6,np.nan],
        "b" : [7,np.nan,8,9,9],
        "c" : [10,11,12,np.nan,12]
    },
    index=pd.MultiIndex.from_tuples(
        [
            ('d', 1),
            ('d', 2),
            ('e', 2),
            ('e', 3),
            ('e', 4)
        ],
        names=['n', 'v']
    )
)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4.0,7.0,10.0
d,2,5.0,,11.0
e,2,6.0,8.0,12.0
e,3,6.0,9.0,
e,4,,9.0,12.0


In [14]:
pd.isnull(df)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,False,False,False
d,2,False,True,False
e,2,False,False,False
e,3,False,False,True
e,4,True,False,False


In [15]:
df[df['b'].isnull()]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,2,5.0,,11.0


In [16]:
pd.notnull(df)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,True,True,True
d,2,True,False,True
e,2,True,True,True
e,3,True,True,False
e,4,False,True,True


In [17]:
df[df['a'].notnull()]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4.0,7.0,10.0
d,2,5.0,,11.0
e,2,6.0,8.0,12.0
e,3,6.0,9.0,


In [18]:
df[df['a'].notnull()].sum()

a    21.0
b    24.0
c    33.0
dtype: float64

In [19]:
df[df['a'] == 5] & df[df['b'] == 7]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,,,
d,2,,,


## Head, Tail

In [20]:
df.tail(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
e,2,6.0,8.0,12.0
e,3,6.0,9.0,
e,4,,9.0,12.0


## Sampling

In [21]:
df.sample(frac=0.5)    # 임의의 순서로 특정 비율로 샘플링하는 방법, frac=1 이면 전부 가져옴

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,2,5.0,,11.0
e,2,6.0,8.0,12.0


In [22]:
df.sample(n=3)    # 임의의 순서로 특정 갯수(n)만큼 샘플링

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4.0,7.0,10.0
e,4,,9.0,12.0
e,2,6.0,8.0,12.0


In [23]:
df.iloc[:4]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4.0,7.0,10.0
d,2,5.0,,11.0
e,2,6.0,8.0,12.0
e,3,6.0,9.0,


In [24]:
df.iloc[2:4]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
e,2,6.0,8.0,12.0
e,3,6.0,9.0,


In [25]:
df.iloc[-1:]

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
e,4,,9.0,12.0


In [26]:
df.iloc?

In [27]:
df.nlargest?

## nlargest, nsmallest

In [28]:
df.nlargest(3, 'c', keep='last')

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
e,4,,9.0,12.0
e,2,6.0,8.0,12.0
d,2,5.0,,11.0


In [29]:
df.nsmallest(2, 'b')

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4.0,7.0,10.0
e,2,6.0,8.0,12.0
