In [1]:
import numpy as np
import pandas as pd

# pandas
* 행에 '열 레이블index'을 부착한 n차원 행렬
* 자료구조를 제공하는 파이썬 라이브러리
* 지원하는 자료구조는 Series, DataFrame, Panel임
* 단, 0.20이후로 Panel은 deprecated 됨
* numpy 기반으로 구현되어 처리속도가 빠름
* pandas의 창시자 중 한 명은 해지펀드 애널리스트로 일하며 파이썬에서 금융 시계열을 다루기 위한 
  목적으로 개발함
* 공식사이트 : pandas.pydata.org
* 설치방법 : pip install pandas

# pandas 자료구조 1 : series
* R의 벡터와 유사한 자료구조 : 1차원 배열
* series 생성
  + pd.Series(데이터, 인덱스, 자료형)

In [2]:
# 빈 series 객체 생성
a = pd.Series()
a

  a = pd.Series()


Series([], dtype: float64)

In [4]:
# numpy 배열로 Series 객체 생성
b = pd.Series([1,2,3,4,5])
b

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [8]:
# Series 객체 생성시 인덱스 지정
c =  pd.Series([1,2,3,4,5], index = ['a','b','c','d','e'])
c

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [13]:
# Series 객체 다루기
print(c.values)
print(c.index)
print(c[2])
print(c[2:4])

[1 2 3 4 5]
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
3
c    3
d    4
dtype: int64


# pandas 인덱서
* pandas 자료구조에서 정수형 인덱스를 사용하는 경우
  파이썬의 slice 연산과 혼동될 위험이 존재
* 따라서, pandas 만의 특별한 인덱서indexer 제공
  + iloc : 정수형 인덱스로 요소를 조회
  + loc : 문자형 인덱스로 요소를 조회
  + ix : 정수/문자형 인덱스로 요소를 조회 (dataframe)

In [14]:
d = pd.Series([9,8,7,6,5], index=['가','나','다','라','마'])
d

가    9
나    8
다    7
라    6
마    5
dtype: int64

In [18]:
print(d[1])
print(d[2:4])

8
다    7
라    6
dtype: int64


In [19]:
print(d.iloc[1])  # pandas 인덱스(숫자)
print(d.iloc[2:4])

8
다    7
라    6
dtype: int64


In [21]:
print(d.loc['나'])   # pandas 인덱스(문자)
print(d.loc['다':'라'])

8
다    7
라    6
dtype: int64


In [22]:
# dict로 series 객체 생성 (추천)
data = {'a':1, 'b':2, 'c':3, 'd':4, 'e':5}
e = pd.Series(data)
e

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [23]:
# 객체 생성후 인덱스 재설정
# 객체명.index = 리스트
e.index = ['z','y','w','x','v']
e

z    1
y    2
w    3
x    4
v    5
dtype: int64

In [33]:
# 신생아 월별 몸무게 추이 데이터
age = pd.Series([1,3,5,2,11,9,3,9,12,3])
weight = pd.Series([4.4, 5.3, 7.2, 5.2, 5.5, 7.3, 6.0, 10.4, 10.2, 6.1])
age

0     1
1     3
2     5
3     2
4    11
5     9
6     3
7     9
8    12
9     3
dtype: int64

In [34]:
weight

0     4.4
1     5.3
2     7.2
3     5.2
4     5.5
5     7.3
6     6.0
7    10.4
8    10.2
9     6.1
dtype: float64

# pandas 자료구조 2 : dataframe
* R의 데이터프레임과 유사한 자료구조 : 2차원 테이블
* pd.DataFrame(데이터, 인덱스, 컬럼레이블, 자료형)

In [35]:
# 빈 데이터프레임 생성
f = pd.DataFrame()
f

In [36]:
data = [1,2,3,4,5]
g = pd.DataFrame(data)
g    # 인덱스 자동생성, 컬럼명 없음

Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5


In [37]:
h = pd.DataFrame(data,columns=['가'])
h

Unnamed: 0,가
0,1
1,2
2,3
3,4
4,5


In [39]:
# 3행 2열 배열로 데이터프레임 생성
data = [['혜교',54],['지현',77],['수지',89]]
i = pd.DataFrame(data, columns=('이름', '국어'))
i

Unnamed: 0,이름,국어
0,혜교,54
1,지현,77
2,수지,89


In [40]:
# dict 객체로 dataframe 생성
data = {'이름':['혜교','지현','수지'], '국어':['54','79','89']}
j = pd.DataFrame(data)
j

Unnamed: 0,이름,국어
0,혜교,54
1,지현,79
2,수지,89


In [41]:
# dict 객체로 dataframe 생성
data = [{'이름':'혜교', '국어':54},
        {'이름':'지현', '국어':79},
        {'이름':'수지', '국어':89}]
j = pd.DataFrame(data)
j

Unnamed: 0,이름,국어
0,혜교,54
1,지현,79
2,수지,89


In [42]:
# series 객체로 dataframe 생성
name = pd.Series(['혜교','지현','수지'])
kor = pd.Series(['54','79','89'])
data = {'이름':name, '국어':kor}
k = pd.DataFrame(data)
k

Unnamed: 0,이름,국어
0,혜교,54
1,지현,79
2,수지,89


In [43]:
name = ['혜교','지현','수지']
kor = ['54','79','89']
data = {'이름':name, '국어':kor}
k = pd.DataFrame(data)
k

Unnamed: 0,이름,국어
0,혜교,54
1,지현,79
2,수지,89


In [62]:
pat = ['1','2','3','4']
amd = ['10/15/2015','11/01/2014','10/21/2014', '10/28/2014']
age = ['25','34','28','52']
dia = ['Type1','Type2','Type3','Type4']
sta = ['Poor', 'Improved','Excllent','Poor']
data = {'patientID':pat,'amdate':amd, 'age':age, 'diabetes':dia, 'status':sta}
patient = pd.DataFrame(data, index = [1,2,3,4])
patient

Unnamed: 0,patientID,amdate,age,diabetes,status
1,1,10/15/2015,25,Type1,Poor
2,2,11/01/2014,34,Type2,Improved
3,3,10/21/2014,28,Type3,Excllent
4,4,10/28/2014,52,Type4,Poor


In [60]:
man = np.arange(1,6)      # ['1','2','3','4','5']
date = ['10/24/14','10/28/14','10/01/14','10/12/14','05/01/14']
coun = ['US','US','UK','UK','UK']
gend = ['M','F','F','M','F']
age = ['32','45','25','39','99']
q1 = ['5','3','3','3','2']
q2 = ['4','5','5','3','2']
q3 = ['5','2','5','4','1']
q4 = ['5','5','5',None,'2']    # np.nan
q5 = ['5','5','2',np.nan,'1']
data = {'Manager':man,'Date':date, 'Country':coun, 'Gender':gend, 'Age':age, 'q1':q1, 'q2':q2, 'q3':q3, 'q4':q4, 'q5':q5}
leadership = pd.DataFrame(data, index = np.arange(1,6))
leadership

Unnamed: 0,Manager,Date,Country,Gender,Age,q1,q2,q3,q4,q5
1,1,10/24/14,US,M,32,5,4,5,5.0,5.0
2,2,10/28/14,US,F,45,3,5,2,5.0,5.0
3,3,10/01/14,UK,F,25,3,5,5,5.0,2.0
4,4,10/12/14,UK,M,39,3,3,4,,
5,5,05/01/14,UK,F,99,2,2,1,2.0,1.0


In [63]:
# 데이터프레임 각요소에 접근
# leadership에서 age컬럼 접근
# 객체명.컬럼명 또는 객체명[컬럼명]
leadership.Age

1    32
2    45
3    25
4    39
5    99
Name: Age, dtype: object

In [65]:
leadership['Age']

1    32
2    45
3    25
4    39
5    99
Name: Age, dtype: object

In [67]:
# 객체명.loc[컬럼명] 또는 객체명.iloc[컬럼인덱스]
# ix인덱서는 pandas 0.23 부터 deprecated 됩
print(leadership.loc[:, 'Age'])

1    32
2    45
3    25
4    39
5    99
Name: Age, dtype: object


In [69]:
print(leadership.iloc[:, 4])

1    32
2    45
3    25
4    39
5    99
Name: Age, dtype: object


In [74]:
# leadership에서 질문컬럼들은? 
print(leadership.iloc[:, 5:10])

  q1 q2 q3    q4   q5
1  5  4  5     5    5
2  3  5  2     5    5
3  3  5  5     5    2
4  3  3  4  None  NaN
5  2  2  1     2    1


In [76]:
print(leadership.loc[:, 'q1':'q5'])

  q1 q2 q3    q4   q5
1  5  4  5     5    5
2  3  5  2     5    5
3  3  5  5     5    2
4  3  3  4  None  NaN
5  2  2  1     2    1


In [85]:
# patient 데이터프레임에서 검사일, 당뇨병유형, 상태를 출력하세요
patient[['amdate', 'diabetes', 'status']]

Unnamed: 0,amdate,diabetes,status
1,10/15/2015,Type1,Poor
2,11/01/2014,Type2,Improved
3,10/21/2014,Type3,Excllent
4,10/28/2014,Type4,Poor


In [87]:
patient.iloc[:, [1,3,4]]

Unnamed: 0,amdate,diabetes,status
1,10/15/2015,Type1,Poor
2,11/01/2014,Type2,Improved
3,10/21/2014,Type3,Excllent
4,10/28/2014,Type4,Poor


# 외부 데이터파일을 이용해서 dataframe 객체 만들기
* csv, excel, json, xml,... 등등  지원함
* pd.read_csv(경로, 구분자, 인코딩, ...)
* dataframe 출력시 컬럼이 잘리는 경우
  + pd.set_option함수를 이용해서 출력양식을 변경함
  + pd.set_option('display.max_columns', 50)
  + pd.set_option('display.width', 100)

In [89]:
emp = pd.read_csv('data/EMPLOYEES.csv')
emp.head()

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
0,100,Steven,King,SKING,515.123.4567,2003-06-17,AD_PRES,24000,,,90.0
1,101,Neena,Kochhar,NKOCHHAR,515.123.4568,2005-09-21,AD_VP,17000,,100.0,90.0
2,102,Lex,De Haan,LDEHAAN,515.123.4569,2001-01-13,AD_VP,17000,,100.0,90.0
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03,IT_PROG,9000,,102.0,60.0
4,104,Bruce,Ernst,BERNST,590.423.4568,2007-05-21,IT_PROG,6000,,103.0,60.0


In [107]:
birth = pd.read_csv('data/birth-rate.csv')
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 20)
pd.set_option('display.width', 100)

In [108]:
birth.head()

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008
0,Aruba,36.4,35.179,33.863,32.459,30.994,29.513,28.069,26.721,25.518,24.492,23.669,23.058,22.627,22.342,22.177,22.111,22.125,22.192,22.281,22.362,22.406,22.39,22.313,22.172,21.958,21.668,21.3,20.869,20.393,19.886,19.363,18.842,18.332,17.839,17.367,16.911,16.457,15.994,15.515,15.024,14.528,14.041,13.579,13.153,12.772,12.441,12.159,11.919,11.716
1,Afghanistan,52.201,52.206,52.208,52.204,52.192,52.168,52.13,52.076,52.006,51.92,51.816,51.691,51.548,51.395,51.239,51.092,50.967,50.871,50.81,50.786,50.795,50.833,50.888,50.951,51.016,51.084,51.156,51.237,51.325,51.417,51.51,51.603,51.69,51.76,51.802,51.804,51.754,51.646,51.472,51.229,50.903,50.486,49.984,49.416,48.803,48.177,47.575,47.023,46.538
2,Angola,54.432,54.394,54.317,54.199,54.04,53.836,53.585,53.296,52.984,52.668,52.376,52.137,51.967,51.875,51.861,51.92,52.033,52.172,52.314,52.444,52.554,52.644,52.721,52.789,52.841,52.88,52.907,52.918,52.903,52.847,52.722,52.49,52.141,51.679,51.123,50.522,49.941,49.427,49.003,48.662,48.355,48.005,47.545,46.936,46.184,45.33,44.444,43.607,42.875
3,Albania,40.886,40.312,39.604,38.792,37.913,37.008,36.112,35.245,34.421,33.655,32.947,32.279,31.63,30.985,30.345,29.723,29.138,28.606,28.139,27.736,27.396,27.114,26.87,26.644,26.417,26.172,25.895,25.579,25.217,24.801,24.325,23.788,23.198,22.562,21.885,21.157,20.364,19.51,18.616,17.713,16.85,16.081,15.444,14.962,14.644,14.485,14.464,14.534,14.649
4,Netherlands Antilles,32.321,30.987,29.618,28.229,26.849,25.518,24.28,23.173,22.23,21.472,20.925,20.605,20.484,20.517,20.664,20.864,21.055,21.19,21.238,21.178,21.008,20.746,20.442,20.138,19.86,19.641,19.514,19.477,19.509,19.586,19.651,19.635,19.489,19.187,18.733,18.157,17.515,16.878,16.301,15.809,15.412,15.096,14.824,14.565,14.309,14.051,13.79,13.532,13.281


In [109]:
# 데이터프레임 속성 알아보기
emp.dtypes

EMPLOYEE_ID         int64
FIRST_NAME         object
LAST_NAME          object
EMAIL              object
PHONE_NUMBER       object
HIRE_DATE          object
JOB_ID             object
SALARY              int64
COMMISSION_PCT    float64
MANAGER_ID        float64
DEPARTMENT_ID     float64
dtype: object

In [111]:
emp.shape    # 행, 열의 개수

(107, 11)

In [112]:
emp.ndim    # 데이터 프레임 차원수

2

In [113]:
emp.size   # 행 * 열

1177

In [115]:
# 데이터프레임 기본데이터 확인
emp.head()   # 상위 5개

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
0,100,Steven,King,SKING,515.123.4567,2003-06-17,AD_PRES,24000,,,90.0
1,101,Neena,Kochhar,NKOCHHAR,515.123.4568,2005-09-21,AD_VP,17000,,100.0,90.0
2,102,Lex,De Haan,LDEHAAN,515.123.4569,2001-01-13,AD_VP,17000,,100.0,90.0
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03,IT_PROG,9000,,102.0,60.0
4,104,Bruce,Ernst,BERNST,590.423.4568,2007-05-21,IT_PROG,6000,,103.0,60.0


In [114]:
emp.tail()   # 하위 5개

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
102,202,Pat,Fay,PFAY,603.123.6666,2005-08-17,MK_REP,6000,,201.0,20.0
103,203,Susan,Mavris,SMAVRIS,515.123.7777,2002-06-07,HR_REP,6500,,101.0,40.0
104,204,Hermann,Baer,HBAER,515.123.8888,2002-06-07,PR_REP,10000,,101.0,70.0
105,205,Shelley,Higgins,SHIGGINS,515.123.8080,2002-06-07,AC_MGR,12008,,101.0,110.0
106,206,William,Gietz,WGIETZ,515.123.8181,2002-06-07,AC_ACCOUNT,8300,,205.0,110.0


In [116]:
# 데이터프레임 컬럼 요약 확인
emp.describe()   # 숫자컬럼에 대해서만 요약

Unnamed: 0,EMPLOYEE_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
count,107.0,107.0,35.0,106.0,106.0
mean,153.0,6461.831776,0.222857,124.764151,63.207547
std,31.032241,3909.579731,0.085184,20.315395,20.91011
min,100.0,2100.0,0.1,100.0,10.0
25%,126.5,3100.0,0.15,108.0,50.0
50%,153.0,6200.0,0.2,122.0,50.0
75%,179.5,8900.0,0.3,145.0,80.0
max,206.0,24000.0,0.4,205.0,110.0


In [117]:
# 범주형 데이터에 대한 빈도확인
emp.JOB_ID.value_counts()

SA_REP        30
ST_CLERK      20
SH_CLERK      20
SA_MAN         5
IT_PROG        5
FI_ACCOUNT     5
PU_CLERK       5
ST_MAN         5
AD_VP          2
MK_REP         1
AC_MGR         1
PR_REP         1
HR_REP         1
AD_PRES        1
MK_MAN         1
AD_ASST        1
PU_MAN         1
FI_MGR         1
AC_ACCOUNT     1
Name: JOB_ID, dtype: int64

In [110]:
emp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   EMPLOYEE_ID     107 non-null    int64  
 1   FIRST_NAME      107 non-null    object 
 2   LAST_NAME       107 non-null    object 
 3   EMAIL           107 non-null    object 
 4   PHONE_NUMBER    107 non-null    object 
 5   HIRE_DATE       107 non-null    object 
 6   JOB_ID          107 non-null    object 
 7   SALARY          107 non-null    int64  
 8   COMMISSION_PCT  35 non-null     float64
 9   MANAGER_ID      106 non-null    float64
 10  DEPARTMENT_ID   106 non-null    float64
dtypes: float64(3), int64(2), object(6)
memory usage: 9.3+ KB
