# 인덱스를 지정하여 데이터 프레임 만들기 & 확인하기

In [17]:
import pandas as pd

# Build a 4x3 frame from a column-name -> values mapping, with explicit row labels.
df = pd.DataFrame(
    data={
        'a': [4, 5, 6, 7],
        'b': [8, 9, 10, 11],
        'c': [12, 13, 14, 15],
    },
    index=[1, 2, 3, 4],  # when omitted, rows are labelled 0, 1, 2, 3
)
df  # bare last expression: the notebook renders the frame

Unnamed: 0,a,b,c
1,4,8,12
2,5,9,13
3,6,10,14
4,7,11,15


# 열 이름을 지정하여 데이터 프레임 만들기 & 확인하기

In [18]:
# Build the same 4x3 frame row by row; row and column labels are supplied separately.
df = pd.DataFrame(
    data=[[4, 8, 12], [5, 9, 13], [6, 10, 14], [7, 11, 15]],
    columns=['a', 'b', 'c'],  # when omitted, columns are labelled 0, 1, 2
    index=[1, 2, 3, 4],  # when omitted, rows are labelled 0, 1, 2, 3
)
df  # bare last expression: the notebook renders the frame

Unnamed: 0,a,b,c
1,4,8,12
2,5,9,13
3,6,10,14
4,7,11,15


# 인덱스가 여러 개로 구성된 데이터 프레임 만들기

In [19]:
# Build a 6x3 frame whose rows are labelled by a three-level MultiIndex.
df = pd.DataFrame(
    data={
        'z': [4, 8, 7, 5, 8, 9],
        'bc': [8, 8, 10, 11, 12, 13],
        'c': [12, 13, 14, 15, 16, 17],
    },
    index=pd.MultiIndex.from_tuples(
        [
            ('e', 3, 'g'),
            ('e', 1, 'z'),
            ('d', 3, 3),
            ('d', 2, 'b'),
            ('e', 1, 'f'),
            ('d', 4, 'b'),
        ],
        names=['n', 'v', '?'],  # one name per index level
    ),
)
df  # bare last expression: the notebook renders the frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,z,bc,c
n,v,?,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
e,3,g,4,8,12
e,1,z,8,8,13
d,3,3,7,10,14
d,2,b,5,11,15
e,1,f,8,12,16
d,4,b,9,13,17


# 데이터 정렬

### 1. 특정 열 값을 기준으로 정렬

In [20]:
# Sort the rows by column 'z' in descending order (largest z first).
# The result is only displayed, not assigned, so df keeps its original row order.
df.sort_values('z', ascending = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,z,bc,c
n,v,?,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
d,4,b,9,13,17
e,1,z,8,8,13
e,1,f,8,12,16
d,3,3,7,10,14
d,2,b,5,11,15
e,3,g,4,8,12


### 2. 열 이름 변경

In [21]:
# Show the frame with column 'c' renamed to 'd'.
# The result is not assigned back, so later cells still see the column as 'c'.
df.rename(columns={'c':'d'})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,z,bc,d
n,v,?,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
e,3,g,4,8,12
e,1,z,8,8,13
d,3,3,7,10,14
d,2,b,5,11,15
e,1,f,8,12,16
d,4,b,9,13,17


### 3. 인덱스 값 초기화하기

In [22]:
# Turn the index levels n, v and ? into ordinary columns and label the rows 0..5.
# The result is only displayed; df keeps its MultiIndex.
df.reset_index()

Unnamed: 0,n,v,?,z,bc,c
0,e,3,g,4,8,12
1,e,1,z,8,8,13
2,d,3,3,7,10,14
3,d,2,b,5,11,15
4,e,1,f,8,12,16
5,d,4,b,9,13,17


### 4. 인덱스 순서대로 정렬하기

In [23]:
# Sort the rows by index label, level by level (n first, then v, then ?).
# The result is only displayed; df itself is left unsorted.
df.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,z,bc,c
n,v,?,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
d,2,b,5,11,15
d,3,3,7,10,14
d,4,b,9,13,17
e,1,f,8,12,16
e,1,z,8,8,13
e,3,g,4,8,12


### 5. 특정 행, 열 제거하기

In [24]:
# Drop by row label: removes every row whose first index level (n) is 'e'.
print(df.sort_index().drop('e'))
# Drop by column label: axis = 1 makes 'z' refer to a column instead of a row.
print(df.sort_index().drop('z', axis = 1))

       z  bc   c
n v ?           
d 2 b  5  11  15
  3 3  7  10  14
  4 b  9  13  17
       bc   c
n v ?        
d 2 b  11  15
  3 3  10  14
  4 b  13  17
e 1 f  12  16
    z   8  13
  3 g   8  12


# 행 추출하기

### 1. 맨 위 행 출력

In [25]:
# Show the first 4 rows.
df.head(4)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,z,bc,c
n,v,?,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
e,3,g,4,8,12
e,1,z,8,8,13
d,3,3,7,10,14
d,2,b,5,11,15


### 2. 맨 아래 행 출력

In [26]:
# Show the last rows; with no argument tail() returns the last 5.
df.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,z,bc,c
n,v,?,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
e,1,z,8,8,13
d,3,3,7,10,14
d,2,b,5,11,15
e,1,f,8,12,16
d,4,b,9,13,17


### 3. 특정 열 값 추출

In [27]:
# Boolean-mask row selection: keep only the rows where the condition on column 'z' holds.
print(df[df['z'] > 7])
print()
print(df[df['z'] == 8])
print()
print(df[df['z'] != 8])
print()
# isin([8, 9]) keeps rows where z is 8 OR 9. It is the concise form of
# (df['z'] == 8) | (df['z'] == 9): list the wanted values and pass the list to isin.
print(df[df['z'].isin([8, 9])])
print()
# NOTE: `&` is NOT equivalent to the isin call above. It requires both conditions
# to hold for the same row, and no value is both 8 and 9, so this prints an
# empty DataFrame. Use `|` to reproduce the isin result.
print(df[(df['z'] == 8) & (df['z'] == 9)])

       z  bc   c
n v ?           
e 1 z  8   8  13
    f  8  12  16
d 4 b  9  13  17

       z  bc   c
n v ?           
e 1 z  8   8  13
    f  8  12  16

       z  bc   c
n v ?           
e 3 g  4   8  12
d 3 3  7  10  14
  2 b  5  11  15
  4 b  9  13  17

       z  bc   c
n v ?           
e 1 z  8   8  13
    f  8  12  16
d 4 b  9  13  17

Empty DataFrame
Columns: [z, bc, c]
Index: []


### 4. 데이터 샘플링(랜덤 행 추출)

In [28]:
# Draw 3 rows at random. No random_state is given, so the selected rows
# can differ on every run.
df.sample(n=3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,z,bc,c
n,v,?,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
e,1,z,8,8,13
d,4,b,9,13,17
e,1,f,8,12,16


### 5. 특정 열에서 큰 순서, 작은 순서대로 불러오기

In [29]:
# The 3 rows with the largest values in column 'z', largest first.
print(df.nlargest(3, 'z'))
# The 3 rows with the smallest values in column 'z', smallest first.
print(df.nsmallest(3, 'z'))

       z  bc   c
n v ?           
d 4 b  9  13  17
e 1 z  8   8  13
    f  8  12  16
       z  bc   c
n v ?           
e 3 g  4   8  12
d 2 b  5  11  15
  3 3  7  10  14


In [30]:
# Print the frame for reference, then count the distinct values in each row.
print(df)

# axis=1 counts across the columns of each row; row (e, 1, z) yields 2
# because its z and bc values are both 8.
df.nunique(axis=1)

       z  bc   c
n v ?           
e 3 g  4   8  12
  1 z  8   8  13
d 3 3  7  10  14
  2 b  5  11  15
e 1 f  8  12  16
d 4 b  9  13  17


n  v  ?
e  3  g    3
   1  z    2
d  3  3    3
   2  b    3
e  1  f    3
d  4  b    3
dtype: int64

### 6. 특정 행, 열에서 추출

In [32]:
# Select several columns by name; passing a list returns a DataFrame.
df[['z', 'bc']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,z,bc
n,v,?,Unnamed: 3_level_1,Unnamed: 4_level_1
e,3,g,4,8
e,1,z,8,8
d,3,3,7,10
d,2,b,5,11
e,1,f,8,12
d,4,b,9,13


In [43]:
# Keep the columns whose name matches the regular expression 'c'
# (here 'bc' and 'c', since both contain the letter c).
df.filter(regex='c')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,bc,c
n,v,?,Unnamed: 3_level_1,Unnamed: 4_level_1
e,3,g,8,12
e,1,z,8,13
d,3,3,10,14
d,2,b,11,15
e,1,f,12,16
d,4,b,13,17


In [54]:
# axis=0 applies the regular expression to the row index instead of the column names.
# With this data the pattern '1' keeps the two rows whose v level is 1.
df.filter(regex='1', axis=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,z,bc,c
n,v,?,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
e,1,z,8,8,13
e,1,f,8,12,16


In [55]:
# Negative lookahead: '^(?!c$).*' matches every name that is not exactly 'c',
# so all columns except 'c' are kept ('bc' still matches).
df.filter(regex='^(?!c$).*', axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,z,bc
n,v,?,Unnamed: 3_level_1,Unnamed: 4_level_1
e,3,g,4,8
e,1,z,8,8
d,3,3,7,10
d,2,b,5,11
e,1,f,8,12
d,4,b,9,13


In [48]:
# like= is a plain substring test (no regex); with axis=0 it is applied to the row index.
# With this data it keeps the two rows whose v level is 1.
df.filter(like='1', axis=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,z,bc,c
n,v,?,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
e,1,z,8,8,13
e,1,f,8,12,16


In [52]:
# Select columns by exact name; the result follows the order of the items list
# ('bc' before 'z'), not the order in df.
df.filter(items = ['bc', 'z'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,bc,z
n,v,?,Unnamed: 3_level_1,Unnamed: 4_level_1
e,3,g,8,4
e,1,z,8,8
d,3,3,10,7
d,2,b,11,5
e,1,f,12,8
d,4,b,13,9


In [59]:
# Row condition and column selection in one step: rows where z > 5, column 'bc' only.
# Passing ['bc'] as a list keeps the result a one-column DataFrame.
df.loc[df['z'] > 5, ['bc']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,bc
n,v,?,Unnamed: 3_level_1
e,1,z,8
d,3,3,10
e,1,f,12
d,4,b,13


In [63]:
# Print the frame for reference, then read a single cell by position.
print(df)
# iat takes zero-based integer positions: row 0, column 2 (the third column, 'c') -> 12.
df.iat[0, 2]

       z  bc   c
n v ?           
e 3 g  4   8  12
  1 z  8   8  13
d 3 3  7  10  14
  2 b  5  11  15
e 1 f  8  12  16
d 4 b  9  13  17


12

### 7. 중복 데이터

In [67]:
# Rebind df to a 5x3 frame with a two-level MultiIndex; rows (e, 2) and (e, 3)
# hold identical values so the duplicate-handling cells have something to find.
df = pd.DataFrame(
    data={
        'a': [4, 5, 6, 7, 7],
        'b': [8, 9, 10, 11, 11],
        'c': [12, 15, 14, 15, 15],
    },
    index=pd.MultiIndex.from_tuples(
        [
            ('d', 1),
            ('d', 2),
            ('e', 1),
            ('e', 2),
            ('e', 3),
        ],
        names=['n', 'v'],  # one name per index level
    ),
)
df  # bare last expression: the notebook renders the frame

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,8,12
d,2,5,9,15
e,1,6,10,14
e,2,7,11,15
e,3,7,11,15


In [69]:
# Count how many times each value occurs in column 'a' (most frequent value first).
df['a'].value_counts()

7    2
4    1
5    1
6    1
Name: a, dtype: int64

In [71]:
# Number of rows in the DataFrame.
len(df)

5

In [72]:
# (number of rows, number of columns) of the DataFrame.
df.shape

(5, 3)

In [79]:
# Number of distinct values in column 'a'.
print(df['a'].nunique())
# Number of distinct values in each column.
print(df.nunique())
# Number of distinct values in each row (axis = 1 counts across the columns).
print(df.nunique(axis = 1))

4
a    4
b    4
c    3
dtype: int64
n  v
d  1    3
   2    3
e  1    3
   2    3
   3    3
dtype: int64


In [81]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,a,b,c
count,5.0,5.0,5.0
mean,5.8,9.8,14.2
std,1.30384,1.30384,1.30384
min,4.0,8.0,12.0
25%,5.0,9.0,14.0
50%,6.0,10.0,15.0
75%,7.0,11.0,15.0
max,7.0,11.0,15.0


In [85]:
# Print the frame for reference, then show it with duplicated rows removed.
print(df)

# Rows are compared on their column values only: (e, 3) is dropped because its
# a/b/c values repeat those of (e, 2), even though the index labels differ.
# The result is only displayed; df itself still has 5 rows.
df.drop_duplicates()

     a   b   c
n v           
d 1  4   8  12
  2  5   9  15
e 1  6  10  14
  2  7  11  15
  3  7  11  15


Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,8,12
d,2,5,9,15
e,1,6,10,14
e,2,7,11,15


# 데이터 파악하기