# pandas의 데이터 구조

1. Series 데이터
2. 데이터 프레임

![image.png](attachment:image.png)

In [2]:
import numpy as np
import pandas as pd

## 데이터 프레임 생성

### 리스트로 데이터 프레임 만들기

In [3]:
# Build a DataFrame from a 2-D (nested) list: each inner list becomes one row.
# The last row is one element short, so its final cell is left as a missing value.
df = pd.DataFrame(
    [
        ['a', 'b', 'c'],
        ['a', 'a', 'g'],
        ['a', 'a'],
    ]
)
df

Unnamed: 0,0,1,2
0,a,b,c
1,a,a,g
2,a,a,


In [4]:
# Build a DataFrame whose column labels are strings (the mapping keys)
# and whose row labels are the default integers 0..n-1.
df1 = pd.DataFrame(
    dict(
        A=[90, 80, 70],
        B=[85, 98, 75],
        C=[88, 99, 77],
        D=[87, 89, 86],
    )
)
df1

Unnamed: 0,A,B,C,D
0,90,85,88,87
1,80,98,99,89
2,70,75,77,86


In [5]:
# Source dictionary for a DataFrame: each key is a column label and each
# list holds that column's values (one entry per row).
data = {
    '2015': [9904312, 3448737, 2890451, 2466052],
    '2010': [9631482, 3393191, 2632035, 2000002],
    '2005': [9762546, 3512547, 2517680, 2456016],
    '2000': [9853972, 3655437, 2466338, 2473990],
    '지역': ['수도권', '경상권', '수도권', '경상권'],
    '2010-2015 증가율': [0.0283, 0.0163, 0.0982, 0.0141],
}
data

{'2015': [9904312, 3448737, 2890451, 2466052],
 '2010': [9631482, 3393191, 2632035, 2000002],
 '2005': [9762546, 3512547, 2517680, 2456016],
 '2000': [9853972, 3655437, 2466338, 2473990],
 '지역': ['수도권', '경상권', '수도권', '경상권'],
 '2010-2015 증가율': [0.0283, 0.0163, 0.0982, 0.0141]}

In [6]:
# Row labels for the new frame (passed as index=).
index = ['서울', '부산', '인천', '대구']
# Column labels (passed as columns=); they pick the keys of `data` and fix their display order.
columns = ['지역', '2015', '2010', '2005', '2000', '2010-2015 증가율']
# Build the frame from the `data` dict with explicit row and column labels.
df2 = pd.DataFrame(data=data, columns=columns, index=index)
df2

Unnamed: 0,지역,2015,2010,2005,2000,2010-2015 증가율
서울,수도권,9904312,9631482,9762546,9853972,0.0283
부산,경상권,3448737,3393191,3512547,3655437,0.0163
인천,수도권,2890451,2632035,2517680,2466338,0.0982
대구,경상권,2466052,2000002,2456016,2473990,0.0141


### 시리즈로 데이터 프레임 만들기

In [7]:
# Three Series whose index labels only partly overlap ('a' and 'b' are shared).
a = pd.Series(data=[100, 200, 300], index=['a', 'b', 'd'])
b = pd.Series(data=[101, 201, 301], index=['a', 'b', 'k'])
c = pd.Series(data=[110, 210, 310], index=['a', 'b', 'c'])
# Each Series becomes one row; the columns are the union of the Series labels,
# and a label missing from a given Series shows up as NaN in that row.
pd.DataFrame(data=[a, b, c], index=[100, 101, 102])

Unnamed: 0,a,b,d,k,c
100,100.0,200.0,300.0,,
101,101.0,201.0,,301.0,
102,110.0,210.0,,,310.0


**csv 데이터로부터 DataFrame 생성**

In [9]:
# Data source: not recorded in this notebook.
# NOTE(review): the columns shown in the output look like the Kaggle Titanic training set — confirm and add a link.
trainData = pd.read_csv('./data/train.csv')
# Show only the first five rows.
trainData.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


read_csv 함수 파라미터

In [15]:
# Re-read the CSV, this time loading only the listed columns (usecols)
# and using 'PassengerId' as the row index (index_col).
trainData = pd.read_csv(
    './data/train.csv',
    usecols=['PassengerId', 'Survived', 'Pclass', 'Name'],
    index_col='PassengerId',
)
trainData

Unnamed: 0_level_0,Survived,Pclass,Name
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,3,"Braund, Mr. Owen Harris"
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
3,1,3,"Heikkinen, Miss. Laina"
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
5,0,3,"Allen, Mr. William Henry"
...,...,...,...
887,0,2,"Montvila, Rev. Juozas"
888,1,1,"Graham, Miss. Margaret Edith"
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie"""
890,1,1,"Behr, Mr. Karl Howell"


**인덱스와 컬럼의 이해**

![5_dataframe2.png](attachment:5_dataframe2.png)

In [16]:
# Display df2 to inspect its row and column labels.
df2

Unnamed: 0,지역,2015,2010,2005,2000,2010-2015 증가율
서울,수도권,9904312,9631482,9762546,9853972,0.0283
부산,경상권,3448737,3393191,3512547,3655437,0.0163
인천,수도권,2890451,2632035,2517680,2466338,0.0982
대구,경상권,2466052,2000002,2456016,2473990,0.0141


In [17]:
# Show the column labels (the column-direction index) of df2.
df2.columns

Index(['지역', '2015', '2010', '2005', '2000', '2010-2015 증가율'], dtype='object')

In [18]:
# Show the row labels (the row-direction index) of df2.
df2.index

Index(['서울', '부산', '인천', '대구'], dtype='object')

**행/열 인덱스 이름 설정**

* index.name
* columns.name

In [19]:
# Give the row index and the column index a name.
# Both names are set on df2 itself and appear in the header when the frame is displayed.
df2.index.name = '도시'
df2.columns.name = '특성'
df2

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,0.0283
부산,경상권,3448737,3393191,3512547,3655437,0.0163
인천,수도권,2890451,2632035,2517680,2466338,0.0982
대구,경상권,2466052,2000002,2456016,2473990,0.0141


In [20]:
# Use the `values` attribute to access only the data, without the row/column labels.

df2.values
# The result is a NumPy array (dtype=object here, because the columns mix strings and numbers).

array([['수도권', 9904312, 9631482, 9762546, 9853972, 0.0283],
       ['경상권', 3448737, 3393191, 3512547, 3655437, 0.0163],
       ['수도권', 2890451, 2632035, 2517680, 2466338, 0.0982],
       ['경상권', 2466052, 2000002, 2456016, 2473990, 0.0141]], dtype=object)

In [21]:
# Print a summary of the DataFrame: index range, per-column non-null counts and dtypes, memory usage.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 서울 to 대구
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   지역             4 non-null      object 
 1   2015           4 non-null      int64  
 2   2010           4 non-null      int64  
 3   2005           4 non-null      int64  
 4   2000           4 non-null      int64  
 5   2010-2015 증가율  4 non-null      float64
dtypes: float64(1), int64(4), object(1)
memory usage: 224.0+ bytes


In [22]:
# Descriptive statistics of df2, commonly used during data exploration.
# Only the numeric columns are summarised; the string column '지역' is left out.
df2.describe()

특성,2015,2010,2005,2000,2010-2015 증가율
count,4.0,4.0,4.0,4.0,4.0
mean,4677388.0,4414178.0,4562197.0,4612434.0,0.039225
std,3507776.0,3524531.0,3500545.0,3538749.0,0.039809
min,2466052.0,2000002.0,2456016.0,2466338.0,0.0141
25%,2784351.0,2474027.0,2502264.0,2472077.0,0.01575
50%,3169594.0,3012613.0,3015114.0,3064714.0,0.0223
75%,5062631.0,4952764.0,5075047.0,5205071.0,0.045775
max,9904312.0,9631482.0,9762546.0,9853972.0,0.0982


In [23]:
# Number of (rows, columns) as a tuple.
df2.shape

(4, 6)

**데이터 프레임 전치**

In [24]:
# Check the data before transposing it.
df2

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,0.0283
부산,경상권,3448737,3393191,3512547,3655437,0.0163
인천,수도권,2890451,2632035,2517680,2466338,0.0982
대구,경상권,2466052,2000002,2456016,2473990,0.0141


In [25]:
# Transpose df2 with the .T attribute: rows become columns and columns become rows.
dfT = df2.T
dfT
# The original frame (df2) is not modified; the transposed result is bound to dfT.

도시,서울,부산,인천,대구
특성,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
지역,수도권,경상권,수도권,경상권
2015,9904312,3448737,2890451,2466052
2010,9631482,3393191,2632035,2000002
2005,9762546,3512547,2517680,2456016
2000,9853972,3655437,2466338,2473990
2010-2015 증가율,0.0283,0.0163,0.0982,0.0141


데이터 프레임 내용 변경

In [26]:
# Example frame used to demonstrate updating, adding and deleting columns.
df2

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,0.0283
부산,경상권,3448737,3393191,3512547,3655437,0.0163
인천,수도권,2890451,2632035,2517680,2466338,0.0982
대구,경상권,2466052,2000002,2456016,2473990,0.0141


해당열이 있으면 내용 갱신, 열이 없으면 추가

In [29]:
# Update an existing column: express the 2010-2015 growth rate as a percentage.
# The values are rebuilt from the original `data` dict instead of from the column
# itself, so re-running this cell cannot multiply by 100 a second time.
df2['2010-2015 증가율'] = pd.Series(data['2010-2015 증가율'], index=df2.index) * 100
df2

특성,지역,2015,2010,2005,2000,2010-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,2.83
부산,경상권,3448737,3393191,3512547,3655437,1.63
인천,수도권,2890451,2632035,2517680,2466338,9.82
대구,경상권,2466052,2000002,2456016,2473990,1.41


In [32]:
# Add a column: assigning to a label that does not exist yet appends a new column.
# Value = (2015 - 2005) / 2005 * 100, rounded to two decimals.
df2['2005-2015 증가율'] = (
    df2['2015']
    .sub(df2['2005'])
    .div(df2['2005'])
    .mul(100)
    .round(2)
)
df2

특성,지역,2015,2010,2005,2000,2010-2015 증가율,2005-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
서울,수도권,9904312,9631482,9762546,9853972,1.45,1.45
부산,경상권,3448737,3393191,3512547,3655437,-1.82,-1.82
인천,수도권,2890451,2632035,2517680,2466338,14.81,14.81
대구,경상권,2466052,2000002,2456016,2473990,0.41,0.41


In [33]:
# Delete a column with `del df[column]`; the column is removed from df2 in place.
# NOTE: running this cell a second time raises KeyError because the column is already gone.
del df2['2010-2015 증가율']
df2

특성,지역,2015,2010,2005,2000,2005-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,1.45
부산,경상권,3448737,3393191,3512547,3655437,-1.82
인천,수도권,2890451,2632035,2517680,2466338,14.81
대구,경상권,2466052,2000002,2456016,2473990,0.41


## 데이터 프레임 인덱싱

1. 열인덱싱
2. 인덱서를 사용하지 않는 행 인덱싱

* 위 두 가지는 `[ ]` 기호를 이용해서 인덱싱할 때의 방식이다.

**1. 열 인덱싱**
* 열 라벨(컬럼명)을 키값으로 생각하고 인덱싱한다.

In [34]:
# Index with a single column label: the column is returned as a Series.
df2['지역']

도시
서울    수도권
부산    경상권
인천    수도권
대구    경상권
Name: 지역, dtype: object

In [35]:
# A single column can also be accessed with the dot (attribute) operator: df.column_name
# This only works when the label is a valid Python identifier (so not for labels such as '2015').
df2.지역

도시
서울    수도권
부산    경상권
인천    수도권
대구    경상권
Name: 지역, dtype: object

In [36]:
# Check the type of the object returned for the '지역' column.
type(df2['지역'])
# pandas.core.series.Series

pandas.core.series.Series

In [37]:
# To get a DataFrame back instead of a Series, index with a list of labels
# (note the double brackets), even when the list holds a single label.
df2[['지역']]

특성,지역
도시,Unnamed: 1_level_1
서울,수도권
부산,경상권
인천,수도권
대구,경상권


In [38]:
# Indexing with a list of labels returns a DataFrame, not a Series.
type(df2[['지역']])

pandas.core.frame.DataFrame

In [39]:
# Extract several columns at once by passing a list of labels.
# The result is a DataFrame whose columns follow the order of the list.
df2[['2010','2015']]

특성,2010,2015
도시,Unnamed: 1_level_1,Unnamed: 2_level_1
서울,9631482,9904312
부산,3393191,3448737
인천,2632035,2890451
대구,2000002,2466052


In [40]:
# Try to fetch the first column by position with [].
# df2's column labels are strings, so the integer 0 is looked up as a label,
# is not found, and pandas raises KeyError. The error is the point of this cell,
# so it is caught and its type is shown.
# Only KeyError is caught: any other exception would indicate a real problem
# and should surface instead of being printed as if it were expected.
try:
    df2[0]
except KeyError as e:
    print(type(e))

<class 'KeyError'>


In [42]:
# A 3x4 frame holding 0..11 with default integer labels on both axes,
# used to show [] indexing when the column labels are integers.
df3 = pd.DataFrame(data=np.arange(12).reshape((3, 4)))
df3

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [43]:
# When the column labels are integers, an integer key selects the column with that label.
df3[0]

0    0
1    4
2    8
Name: 0, dtype: int32

In [44]:
# A list of integer labels selects several columns and returns a DataFrame.
df3[[1,2]]

Unnamed: 0,1,2
0,1,2
1,5,6
2,9,10


In [45]:
# Display df2 again as the reference for the row-slicing cells.
df2

특성,지역,2015,2010,2005,2000,2005-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,1.45
부산,경상권,3448737,3393191,3512547,3655437,-1.82
인천,수도권,2890451,2632035,2517680,2466338,14.81
대구,경상권,2466052,2000002,2456016,2473990,0.41


In [46]:
# Extract the first row with a slice: inside [], a slice selects rows, not columns.
df2[:1]

특성,지역,2015,2010,2005,2000,2005-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,1.45


In [47]:
# Extract the first row again, this time with an explicit start position (same result as [:1]).
df2[0:1]

특성,지역,2015,2010,2005,2000,2005-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,1.45


In [48]:
# Positional slice start:stop — the stop position is excluded, so this returns the rows at positions 1 and 2.
df2[1:3]

특성,지역,2015,2010,2005,2000,2005-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
부산,경상권,3448737,3393191,3512547,3655437,-1.82
인천,수도권,2890451,2632035,2517680,2466338,14.81


In [49]:
# Slice by row label from '서울' to '부산'; unlike a positional slice, the end label is included.
df2['서울':'부산']

특성,지역,2015,2010,2005,2000,2005-2015 증가율
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
서울,수도권,9904312,9631482,9762546,9853972,1.45
부산,경상권,3448737,3393191,3512547,3655437,-1.82


In [50]:
# Select the '2015' column (a Series) first, then the element with row label '서울'.
df2['2015']['서울']

9904312

In [51]:
# Type of a single element: an integer scalar (numpy.int64), not a Series.
type(df2['2015']['서울'])

numpy.int64

In [52]:
# Select the '2015' column, then slice it by row label; the result is a Series with the end label included.
df2['2015']['서울':'부산']

도시
서울    9904312
부산    3448737
Name: 2015, dtype: int64

* 데이터 프레임은 열 기준 접근이 원칙 : [열이름]
* 행 기준 접근을 위해서는 슬라이싱을 적용 : [행이름:행이름]
* 행과 열을 같이 적용(개별요소) : [열이름][행이름]