# Pandas 기초
---


In [2]:
import numpy as np
import pandas as pd 

## 1. 데이터 불러오기

### (1) excel 데이터 불러오기

In [3]:
# Read the first worksheet of sample_1.xlsx into a DataFrame.
sample_1 = pd.read_excel(
    './files/sample_1.xlsx',
    header=1,       # row index 1 (the second row) supplies the column labels
    skipfooter=2,   # ignore the last two rows of the sheet
    usecols='A:C',  # keep only Excel columns A through C
)
sample_1

Unnamed: 0,국적코드,성별,입국객수
0,A01,남성,106320
1,A01,여성,191436
2,A31,남성,319
3,A31,여성,42
4,A18,남성,158912
5,A18,여성,232943


In [7]:
# Same workbook and range as the previous load, but `names` replaces the
# column labels found in the header row with 'A', 'B', 'C'.
sample_2 = pd.read_excel(
    './files/sample_1.xlsx',
    header=1,
    skipfooter=2,
    usecols='A:C',
    names=['A', 'B', 'C'],
)
sample_2

Unnamed: 0,A,B,C
0,A01,남성,106320
1,A01,여성,191436
2,A31,남성,319
3,A31,여성,42
4,A18,남성,158912
5,A18,여성,232943


In [10]:
# Show the dtype of each column of sample_1.
sample_1.dtypes

국적코드    object
성별      object
입국객수     int64
dtype: object

In [13]:
# Convert int to float at load time: `dtype` forces the 입국객수 column to
# float64 while the file is being read.
# The result is stored under its own name so that the integer-typed
# `sample_1` used by the following cells is not overwritten.
sample_1_float = pd.read_excel(
    './files/sample_1.xlsx',
    header=1,
    skipfooter=2,
    usecols='A:C',
    dtype={'입국객수': np.float64},
)
sample_1_float

Unnamed: 0,국적코드,성별,입국객수
0,A01,남성,106320.0
1,A01,여성,191436.0
2,A31,남성,319.0
3,A31,여성,42.0
4,A18,남성,158912.0
5,A18,여성,232943.0


### (2) 데이터 구성 살펴보기

In [5]:
# Preview the first rows of the frame (head() shows 5 rows by default).
sample_1.head()

Unnamed: 0,국적코드,성별,입국객수
0,A01,남성,106320
1,A01,여성,191436
2,A31,남성,319
3,A31,여성,42
4,A18,남성,158912


In [6]:
# Preview the last rows of the frame (tail() shows 5 rows by default).
sample_1.tail()

Unnamed: 0,국적코드,성별,입국객수
1,A01,여성,191436
2,A31,남성,319
3,A31,여성,42
4,A18,남성,158912
5,A18,여성,232943


In [7]:
# Print a structural summary: index range, column names, non-null counts,
# dtypes and memory usage.
sample_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   국적코드    6 non-null      object
 1   성별      6 non-null      object
 2   입국객수    6 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 272.0+ bytes


In [10]:
# List the column labels of the frame.
sample_1.columns

Index(['국적코드', '성별', '입국객수'], dtype='object')

In [8]:
# Continuous (numeric) data: summary statistics (count, mean, std, min,
# quartiles, max) for the numeric columns.
sample_1.describe()

Unnamed: 0,입국객수
count,6.0
mean,114995.333333
std,98105.752006
min,42.0
25%,26819.25
50%,132616.0
75%,183305.0
max,232943.0


In [12]:
# Categorical (discrete) data: count how many rows carry each distinct value.
sample_1['성별'].value_counts()

남성    3
여성    3
Name: 성별, dtype: int64

### (3) csv 데이터 불러오기

In [16]:
# Read a CSV straight from a URL (requires network access).
# If a file was saved in a Korean legacy encoding, pass encoding='euc-kr'
# instead of 'utf-8'.
fish = pd.read_csv(
    'https://bit.ly/fish_csv',
    encoding='utf-8',
)
fish

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.5200,4.0200
1,Bream,290.0,26.3,31.2,12.4800,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.7300,4.4555
4,Bream,430.0,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...,...
154,Smelt,12.2,12.2,13.4,2.0904,1.3936
155,Smelt,13.4,12.4,13.5,2.4300,1.2690
156,Smelt,12.2,13.0,13.8,2.2770,1.2558
157,Smelt,19.7,14.3,15.2,2.8728,2.0672


---
## 2. 데이터 선택하기

### (1) column 기준

In [13]:
# Indexing with a single column label returns that column as a Series.
sample_1['입국객수']

0    106320
1    191436
2       319
3        42
4    158912
5    232943
Name: 입국객수, dtype: int64

In [14]:
# Indexing with a list of labels returns a DataFrame with just those columns.
sample_1[['국적코드', '입국객수']]

Unnamed: 0,국적코드,입국객수
0,A01,106320
1,A01,191436
2,A31,319
3,A31,42
4,A18,158912
5,A18,232943


In [15]:
# Assigning a scalar to a new column label adds the column in place and
# fills every row with that value. Later cells rely on this column.
sample_1['기준년월'] = '2019-11'
sample_1

Unnamed: 0,국적코드,성별,입국객수,기준년월
0,A01,남성,106320,2019-11
1,A01,여성,191436,2019-11
2,A31,남성,319,2019-11
3,A31,여성,42,2019-11
4,A18,남성,158912,2019-11
5,A18,여성,232943,2019-11


### (2) row 기준

In [25]:
# Build a boolean mask (True where 성별 equals '남성'), show it, then use it
# to keep only the matching rows.
condition = sample_1['성별'] == '남성'
print(condition)
sample_1.loc[condition]

0     True
1    False
2     True
3    False
4     True
5    False
Name: 성별, dtype: bool


Unnamed: 0,국적코드,성별,입국객수,기준년월
0,A01,남성,106320,2019-11
2,A31,남성,319,2019-11
4,A18,남성,158912,2019-11


In [28]:
# `~` inverts a boolean mask, so this keeps the rows where 성별 is NOT '남성'.
sample_1.loc[~(sample_1['성별'] == '남성')]

Unnamed: 0,국적코드,성별,입국객수,기준년월
1,A01,여성,191436,2019-11
3,A31,여성,42,2019-11
5,A18,여성,232943,2019-11


In [21]:
# Numeric comparison mask: rows with at least 150000 arrivals.
condition = sample_1['입국객수'] >= 150000
sample_1.loc[condition]

Unnamed: 0,국적코드,성별,입국객수,기준년월
1,A01,여성,191436,2019-11
4,A18,남성,158912,2019-11
5,A18,여성,232943,2019-11


In [23]:
# Combine two masks with `&` (element-wise AND). Each comparison must be
# wrapped in its own parentheses because `&` binds tighter than `==` / `>=`.
conditions = (
    (sample_1['성별'] == '남성')
    & (sample_1['입국객수'] >= 150000)
)
print(conditions)
sample_1.loc[conditions]

0    False
1    False
2    False
3    False
4     True
5    False
dtype: bool


Unnamed: 0,국적코드,성별,입국객수,기준년월
4,A18,남성,158912,2019-11


In [29]:
# Same AND filter split over several lines; enclosing parentheses allow the
# line break without a trailing backslash.
conditions = (
    (sample_1['성별'] == '남성')
    & (sample_1['입국객수'] >= 150000)
)

sample_1.loc[conditions]

Unnamed: 0,국적코드,성별,입국객수,기준년월
4,A18,남성,158912,2019-11


In [30]:
# `|` is the element-wise OR: keep rows whose 국적코드 is 'A01' or 'A18'.
conditions = (
    (sample_1['국적코드'] == 'A01')
    | (sample_1['국적코드'] == 'A18')
)
sample_1.loc[conditions]

Unnamed: 0,국적코드,성별,입국객수,기준년월
0,A01,남성,106320,2019-11
1,A01,여성,191436,2019-11
4,A18,남성,158912,2019-11
5,A18,여성,232943,2019-11


### (3) isin method

In [31]:
# isin() tests membership in a list of values, replacing a chain of
# `==` comparisons joined with `|`.
conditions = sample_1['국적코드'].isin(['A01', 'A18'])
print(conditions)
sample_1.loc[conditions]

0     True
1     True
2    False
3    False
4     True
5     True
Name: 국적코드, dtype: bool


Unnamed: 0,국적코드,성별,입국객수,기준년월
0,A01,남성,106320,2019-11
1,A01,여성,191436,2019-11
4,A18,남성,158912,2019-11
5,A18,여성,232943,2019-11


In [32]:
# Rows whose 국적코드 is NOT in the list: invert the boolean mask with `~`
# instead of comparing it to the literal False.
conditions = sample_1['국적코드'].isin(['A01', 'A18'])
sample_1[~conditions]

Unnamed: 0,국적코드,성별,입국객수,기준년월
2,A31,남성,319,2019-11
3,A31,여성,42,2019-11


---
## 3. 데이터를 좌우로 통합 ( pd.merge )

In [34]:
# Load the code master sheet (국적코드 / 국적명 columns) used as the
# right-hand table in the merges below.
code_master = pd.read_excel('./files/sample_codemaster.xlsx')
code_master

Unnamed: 0,국적코드,국적명
0,A01,일본
1,A02,대만
2,A03,홍콩
3,A18,중국
4,A19,이란
5,A22,우즈베키스탄
6,A23,카자흐스탄
7,A99,아시아 기타


### (1) left 조건

In [35]:
# Left join: keep every row of sample_1 and attach 국적명 where the code is
# found in code_master; codes without a match get a missing 국적명.
# Both tables use the same key column name, so a single `on=` is enough.
sample_1_code = pd.merge(
    left=sample_1,
    right=code_master,
    how='left',
    on='국적코드',
)
sample_1_code

Unnamed: 0,국적코드,성별,입국객수,기준년월,국적명
0,A01,남성,106320,2019-11,일본
1,A01,여성,191436,2019-11,일본
2,A31,남성,319,2019-11,
3,A31,여성,42,2019-11,
4,A18,남성,158912,2019-11,중국
5,A18,여성,232943,2019-11,중국


### (2) inner 조건

In [36]:
# Inner join: keep only the rows whose 국적코드 exists in both tables.
sample_1_code_inner = pd.merge(
    left=sample_1,
    right=code_master,
    how='inner',
    on='국적코드',
)
sample_1_code_inner

Unnamed: 0,국적코드,성별,입국객수,기준년월,국적명
0,A01,남성,106320,2019-11,일본
1,A01,여성,191436,2019-11,일본
2,A18,남성,158912,2019-11,중국
3,A18,여성,232943,2019-11,중국


---
## 4. 데이터를 상하로 통합 ( pd.concat, 구 DataFrame.append )

In [37]:
# Load sample_2.xlsx with the same layout options as sample_1, tag every
# row with 기준년월 '2019-12', then attach 국적명 via a left join.
sample_2 = pd.read_excel(
    './files/sample_2.xlsx',
    header=1,
    skipfooter=2,
    usecols='A:C',
)
sample_2['기준년월'] = '2019-12'

sample_2_code = pd.merge(
    left=sample_2,
    right=code_master,
    how='left',
    on='국적코드',
)
sample_2_code

Unnamed: 0,국적코드,성별,입국객수,기준년월,국적명
0,A01,남성,92556,2019-12,일본
1,A01,여성,163737,2019-12,일본
2,A18,남성,155540,2019-12,중국
3,A18,여성,249023,2019-12,중국


In [38]:
# Stack the two monthly frames vertically. DataFrame.append was removed in
# pandas 2.0, so pd.concat is used instead. ignore_index=True discards the
# original row labels and renumbers the result 0..n-1.
sample = pd.concat([sample_1_code, sample_2_code], ignore_index=True)
sample

Unnamed: 0,국적코드,성별,입국객수,기준년월,국적명
0,A01,남성,106320,2019-11,일본
1,A01,여성,191436,2019-11,일본
2,A31,남성,319,2019-11,
3,A31,여성,42,2019-11,
4,A18,남성,158912,2019-11,중국
5,A18,여성,232943,2019-11,중국
6,A01,남성,92556,2019-12,일본
7,A01,여성,163737,2019-12,일본
8,A18,남성,155540,2019-12,중국
9,A18,여성,249023,2019-12,중국


In [39]:
# Same vertical stack without ignore_index: each frame keeps its own row
# labels, so the labels repeat in the result. DataFrame.append was removed
# in pandas 2.0, so pd.concat is used instead.
pd.concat([sample_1_code, sample_2_code])

Unnamed: 0,국적코드,성별,입국객수,기준년월,국적명
0,A01,남성,106320,2019-11,일본
1,A01,여성,191436,2019-11,일본
2,A31,남성,319,2019-11,
3,A31,여성,42,2019-11,
4,A18,남성,158912,2019-11,중국
5,A18,여성,232943,2019-11,중국
0,A01,남성,92556,2019-12,일본
1,A01,여성,163737,2019-12,일본
2,A18,남성,155540,2019-12,중국
3,A18,여성,249023,2019-12,중국


---
## 5. 데이터 저장하기 ( to_excel )

In [40]:
# Write the combined frame to an Excel file; by default the row index is
# written as the first column.
sample.to_excel('./files/sample.xlsx')

In [41]:
# index=False leaves the row index out of the written sheet.
sample.to_excel('./files/sample_index_false.xlsx', index=False)

---
## 6. 데이터 집계하기 ( pivot_table )

In [42]:
# Mean 입국객수 for each 국적명 (rows) and 기준년월 (columns).
# Rows with a missing 국적명 do not appear in the result.
sample_pivot = sample.pivot_table(
    values='입국객수',
    index='국적명',
    columns='기준년월',
    aggfunc='mean',
)
sample_pivot

기준년월,2019-11,2019-12
국적명,Unnamed: 1_level_1,Unnamed: 2_level_1
일본,148878.0,128146.5
중국,195927.5,202281.5


In [43]:
# Largest 입국객수 observed for each 국적명, across all months.
sample_pivot_2 = sample.pivot_table(
    values='입국객수',
    index='국적명',
    aggfunc='max',
)
sample_pivot_2

Unnamed: 0_level_0,입국객수
국적명,Unnamed: 1_level_1
일본,191436
중국,249023
