In [2]:
import numpy as np
import pandas as pd

In [3]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 100)

# 데이터 정렬하기
* pandas에는 2가지 정렬이 지원
  + 인덱스로 정렬 : sort_index
  + 값으로 정렬 : sort_value

In [3]:
unsorted = pd.DataFrame(np.random.randint(1, 46, (5,2)),
                       index=[1,3,2,5,4], columns=['다','가'])
unsorted

Unnamed: 0,다,가
1,10,23
3,32,32
2,34,31
5,42,7
4,6,33


In [4]:
#  index로 정렬 : sort_index
unsorted.sort_index()

Unnamed: 0,다,가
1,10,23
2,34,31
3,32,32
4,6,33
5,42,7


In [5]:
# index를 내림차순 정렬
unsorted.sort_index(ascending=False)

Unnamed: 0,다,가
5,42,7
4,6,33
3,32,32
2,34,31
1,10,23


In [6]:
# 컬럼 index를 정렬
unsorted.sort_index(axis=1)

Unnamed: 0,가,다
1,23,10
3,32,32
2,31,34
5,7,42
4,33,6


In [7]:
# 값으로 정렬 : 정렬기준은 by로 지정
unsorted.sort_values(by='가')

Unnamed: 0,다,가
5,42,7
1,10,23
2,34,31
3,32,32
4,6,33


In [8]:
# 값으로 정렬 : 정렬기준은 by로 지정
unsorted.sort_values(by='가', ascending=False)

Unnamed: 0,다,가
4,6,33
3,32,32
2,34,31
1,10,23
5,42,7


In [9]:
# 값으로 정렬 2 : 객체명.컬럼명.sort_values()
unsorted.가.sort_values()

5     7
1    23
2    31
3    32
4    33
Name: 가, dtype: int64

# 데이터프레임 조건검색filtering
* 조건을 만족하는 특정 데이터를 추출하려면
* []에 조건연산자를 사용한 조건식을 작성하면 됨

In [10]:
# phone 데이터 초기화
phones = pd.read_csv('data/phone02.csv')
phones.head()

Unnamed: 0,makeyear,buyyear,dispsize,age,height,weight,phonetime,pctime,lteamount
0,2015,2015,5.0,45,173,75,60,500,100
1,2014,2015,4.5,27,176,59,70,30,50
2,2015,2015,5.0,29,183,65,120,300,200
3,2015,2016,5.0,28,172,63,80,60,190
4,2015,2015,5.0,24,179,65,90,30,500


In [11]:
phones.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   makeyear   24 non-null     int64  
 1   buyyear    24 non-null     int64  
 2   dispsize   24 non-null     float64
 3   age        24 non-null     int64  
 4   height     24 non-null     int64  
 5   weight     24 non-null     int64  
 6   phonetime  24 non-null     int64  
 7   pctime     24 non-null     int64  
 8   lteamount  24 non-null     int64  
dtypes: float64(1), int64(8)
memory usage: 1.8 KB


In [12]:
phones.age.mean().round(2)

31.88

In [13]:
phones.height.mean().round(2)

176.58

In [14]:
phones.weight.mean().round(2)

72.88

In [15]:
phones.describe()

Unnamed: 0,makeyear,buyyear,dispsize,age,height,weight,phonetime,pctime,lteamount
count,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0
mean,2015.125,2015.25,5.25,31.875,176.583333,72.875,83.75,179.583333,254.166667
std,0.679674,0.675664,0.48901,7.479319,5.157659,7.809206,33.855511,154.34234,252.44744
min,2014.0,2014.0,4.5,10.0,167.0,59.0,40.0,30.0,30.0
25%,2015.0,2015.0,5.0,27.75,173.0,68.75,60.0,60.0,95.0
50%,2015.0,2015.0,5.25,32.0,176.5,71.5,75.0,105.0,165.0
75%,2016.0,2016.0,5.5,35.5,179.5,77.5,92.5,300.0,300.0
max,2016.0,2016.0,6.0,45.0,188.0,89.0,180.0,500.0,1100.0


In [16]:
# 연도별 추출 - 출시연도가 2015인 정보 추출
where = phones.makeyear == 2015
phones[where].head()

Unnamed: 0,makeyear,buyyear,dispsize,age,height,weight,phonetime,pctime,lteamount
0,2015,2015,5.0,45,173,75,60,500,100
2,2015,2015,5.0,29,183,65,120,300,200
3,2015,2016,5.0,28,172,63,80,60,190
4,2015,2015,5.0,24,179,65,90,30,500
9,2015,2014,5.0,42,177,79,40,480,30


In [17]:
# 연도별 추출 - 구매연도가 2016인 정보 추출
where = phones.buyyear == 2016
phones[where].head()

Unnamed: 0,makeyear,buyyear,dispsize,age,height,weight,phonetime,pctime,lteamount
3,2015,2016,5.0,28,172,63,80,60,190
5,2016,2016,5.5,34,175,73,80,480,160
6,2016,2016,6.0,40,169,77,60,300,170
12,2015,2016,5.5,35,173,69,60,120,150
16,2016,2016,5.5,35,184,72,90,90,70


In [18]:
# 나이별 추출 - 나이가 20 ~ 30사이인 구매자 추출
where = (phones.age >= 20) & (phones.age <= 30)
phones[where].sort_values(by='age')

Unnamed: 0,makeyear,buyyear,dispsize,age,height,weight,phonetime,pctime,lteamount
4,2015,2015,5.0,24,179,65,90,30,500
21,2016,2016,5.5,25,182,73,60,60,300
23,2016,2016,6.0,25,177,76,120,90,700
22,2015,2015,5.0,26,179,80,180,30,1100
1,2014,2015,4.5,27,176,59,70,30,50
3,2015,2016,5.0,28,172,63,80,60,190
2,2015,2015,5.0,29,183,65,120,300,200
18,2015,2015,4.5,29,188,83,140,240,600
8,2014,2014,4.5,30,175,70,100,90,300
19,2016,2016,5.5,30,175,70,50,240,50


In [19]:
# 화면크기가 4.5인 핸드폰을 구매한 소비자의 나이, 키, 몸무게 조회
where = phones.dispsize == 4.5
phones[where][['age','height','weight']]

Unnamed: 0,age,height,weight
1,27,176,59
8,30,175,70
10,32,167,62
18,29,188,83


In [20]:
# 나이가 35세 이상인 소비자들이 구매한 핸드폰의 출시연도, 화면크기 조회
where = phones.age >= 35
phones[where][['age','makeyear','dispsize']].sort_values(by=['age','makeyear'])

Unnamed: 0,age,makeyear,dispsize
12,35,2015,5.5
15,35,2015,5.0
16,35,2016,5.5
11,37,2015,5.5
7,40,2014,5.0
6,40,2016,6.0
13,41,2015,6.0
9,42,2015,5.0
0,45,2015,5.0


# seoulfood 데이터 전처리하기
+ 결측치 처리
+ 분석에 필요한 데이터만 선별

In [22]:
sf = pd.read_csv('data/seoul_food.csv')
sf.head()

  sf = pd.read_csv('data/seoul_food.csv')


Unnamed: 0.1,Unnamed: 0,개방자치단체코드,관리번호,인허가일자,인허가취소일자,영업상태코드,영업상태명,상세영업상태코드,상세영업상태명,폐업일자,휴업시작일자,휴업종료일자,재개업일자,전화번호,소재지면적,소재지우편번호,지번주소,도로명주소,도로명우편번호,사업장명,최종수정일자,데이터갱신구분,데이터갱신일자,업태구분명,좌표정보(X),좌표정보(Y),위생업태명,남성종사자수,여성종사자수,영업장주변구분명,등급구분명,급수시설구분명,총인원,본사종업원수,공장사무직종업원수,공장판매직종업원수,공장생산직종업원수,건물소유구분명,보증액,월세액,다중이용업소여부,시설총규모,전통업소지정번호,전통업소주된음식,홈페이지,지역구,open,closed,경도,위도
0,0,3010000,3010000-104-2020-00245,20201229,,1,영업/정상,1,영업,,,,,,10.0,100440.0,서울특별시 중구 황학동 2545 이마트 청계천점,서울특별시 중구 청계천로 400 지하2층 (황학동 롯데캐슬베네치아),4572.0,통불 이마트청계천점,20201229130352,I,2019-11-01 21:01:00.0,일반조리판매,201823.908977,452076.818664,,,,,,,,,,,,,,,,,,,,중구,2020,,127.021434,37.570978
1,1,3070000,3070000-104-2020-00105,20200713,,1,영업/정상,1,영업,,,,,,6.6,136110.0,서울특별시 성북구 길음동 1280-6 대우상가,서울특별시 성북구 길음로 103 대우상가 상가동 1층 105106호 (길음동),2714.0,씨유 길음푸르지오점,20200713175725,I,2019-12-06 23:05:00.0,편의점,201517.38533,456504.561867,,,,,,,,,,,,,,,,,,,,성북구,2020,,127.017973,37.610872
2,2,3230000,3230000-104-2020-00227,20200713,,1,영업/정상,1,영업,,,,,,329.89,138827.0,서울특별시 송파구 방이동 44-3 현대토픽스,서울특별시 송파구 위례성대로 6 1층 (방이동),5544.0,스타벅스,20200713162510,I,2019-12-06 23:05:00.0,커피숍,209929.574783,446085.121305,,,,,,,,,,,,,,,,,,,,송파구,2020,,127.113106,37.516942
3,3,3050000,3050000-104-2020-00106,20200713,,1,영업/정상,1,영업,,,,,,26.0,130850.0,서울특별시 동대문구 전농동 38-104,서울특별시 동대문구 전농로16길 54 1층 (전농동),2508.0,Min's coffee,20200713161446,I,2019-12-06 23:05:00.0,커피숍,205263.826419,452869.992526,,,,,,,,,,,,,,,,,,,,동대문구,2020,,127.060378,37.578112
4,4,3080000,3080000-104-2020-00065,20200713,,1,영업/정상,1,영업,,,,,,32.0,142870.0,서울특별시 강북구 수유동 413-25,서울특별시 강북구 삼양로87길 46 1층 (수유동),1090.0,탑브릭스,20200713160435,I,2019-12-06 23:05:00.0,일반조리판매,201292.010942,459312.320008,,,,,,,,,,,,,,,,,,,,강북구,2020,,127.015425,37.63617


In [23]:
sf.info()   # 결측치, 데이터건수, 데이터유형 확인

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108372 entries, 0 to 108371
Data columns (total 50 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  108372 non-null  int64  
 1   개방자치단체코드    108372 non-null  int64  
 2   관리번호        108372 non-null  object 
 3   인허가일자       108372 non-null  int64  
 4   인허가취소일자     0 non-null       float64
 5   영업상태코드      108372 non-null  int64  
 6   영업상태명       108372 non-null  object 
 7   상세영업상태코드    108372 non-null  int64  
 8   상세영업상태명     108372 non-null  object 
 9   폐업일자        74441 non-null   float64
 10  휴업시작일자      0 non-null       float64
 11  휴업종료일자      0 non-null       float64
 12  재개업일자       0 non-null       float64
 13  전화번호        56882 non-null   object 
 14  소재지면적       100325 non-null  float64
 15  소재지우편번호     108187 non-null  float64
 16  지번주소        108189 non-null  object 
 17  도로명주소       66753 non-null   object 
 18  도로명우편번호     66077 non-null   float64
 19  사업

In [24]:
# 분석에 필요한 컬럼 확인
sf.iloc[:, [8, 9, 19, 23, 45, 46, 48, 49]].head()

Unnamed: 0,상세영업상태명,폐업일자,사업장명,업태구분명,지역구,open,경도,위도
0,영업,,통불 이마트청계천점,일반조리판매,중구,2020,127.021434,37.570978
1,영업,,씨유 길음푸르지오점,편의점,성북구,2020,127.017973,37.610872
2,영업,,스타벅스,커피숍,송파구,2020,127.113106,37.516942
3,영업,,Min's coffee,커피숍,동대문구,2020,127.060378,37.578112
4,영업,,탑브릭스,일반조리판매,강북구,2020,127.015425,37.63617


In [25]:
# 분석에 필요한 컬럼 추출해서 새로운 데이터 프레임으로 생성
df = sf.iloc[:, [8, 9, 19, 23, 45, 46, 48, 49]]
df.head()

Unnamed: 0,상세영업상태명,폐업일자,사업장명,업태구분명,지역구,open,경도,위도
0,영업,,통불 이마트청계천점,일반조리판매,중구,2020,127.021434,37.570978
1,영업,,씨유 길음푸르지오점,편의점,성북구,2020,127.017973,37.610872
2,영업,,스타벅스,커피숍,송파구,2020,127.113106,37.516942
3,영업,,Min's coffee,커피숍,동대문구,2020,127.060378,37.578112
4,영업,,탑브릭스,일반조리판매,강북구,2020,127.015425,37.63617


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108372 entries, 0 to 108371
Data columns (total 8 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   상세영업상태명  108372 non-null  object 
 1   폐업일자     74441 non-null   float64
 2   사업장명     108372 non-null  object 
 3   업태구분명    108372 non-null  object 
 4   지역구      108372 non-null  object 
 5   open     108372 non-null  int64  
 6   경도       108372 non-null  float64
 7   위도       108372 non-null  float64
dtypes: float64(3), int64(1), object(4)
memory usage: 6.6+ MB


In [27]:
# 결측치는 공백으로 대체
df = df.fillna('')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108372 entries, 0 to 108371
Data columns (total 8 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   상세영업상태명  108372 non-null  object 
 1   폐업일자     108372 non-null  object 
 2   사업장명     108372 non-null  object 
 3   업태구분명    108372 non-null  object 
 4   지역구      108372 non-null  object 
 5   open     108372 non-null  int64  
 6   경도       108372 non-null  float64
 7   위도       108372 non-null  float64
dtypes: float64(2), int64(1), object(5)
memory usage: 6.6+ MB


In [28]:
df.head()

Unnamed: 0,상세영업상태명,폐업일자,사업장명,업태구분명,지역구,open,경도,위도
0,영업,,통불 이마트청계천점,일반조리판매,중구,2020,127.021434,37.570978
1,영업,,씨유 길음푸르지오점,편의점,성북구,2020,127.017973,37.610872
2,영업,,스타벅스,커피숍,송파구,2020,127.113106,37.516942
3,영업,,Min's coffee,커피숍,동대문구,2020,127.060378,37.578112
4,영업,,탑브릭스,일반조리판매,강북구,2020,127.015425,37.63617


In [29]:
# 전처리된 데이터를 파일에 저장
df.to_csv('seoulfood2.csv')

In [30]:
# 영업/폐업 중인 업체 확인
where = df.상세영업상태명 == '영업'
df[where]['상세영업상태명'].count()

33931

In [31]:
where = df.상세영업상태명 == '폐업'
df[where]['상세영업상태명'].count()

74441

In [32]:
# 영업/폐업 중인 업체 확인 2
df.상세영업상태명.value_counts()

폐업    74441
영업    33931
Name: 상세영업상태명, dtype: int64

In [33]:
# 2019년 기준 영업/폐업 중인 업체 확인
where = (df.상세영업상태명 == '영업') & (df.open == 2019)
df[where]['상세영업상태명'].count()

4524

In [34]:
where = (df.상세영업상태명 == '폐업') & (df.open == 2019)
df[where]['상세영업상태명'].count()

2101

In [35]:
# 2019년 기준 영업/폐업 중인 업체 확인 2
where = (df.open == 2019)
df[where].상세영업상태명.value_counts()

영업    4524
폐업    2101
Name: 상세영업상태명, dtype: int64

In [36]:
# 업태별 영업장 빈도 확인
df.업태구분명.value_counts()

커피숍         27222
일반조리판매      17709
다방          17249
기타 휴게음식점    14003
패스트푸드        9748
편의점          9240
과자점          8074
백화점          2604
푸드트럭          918
철도역구내         424
아이스크림         389
전통찻집          332
떡카페           144
키즈카페          126
관광호텔           80
유원지            59
극장             19
공항             17
고속도로            7
기타              2
한식              2
호프/통닭           1
김밥(도시락)         1
룸살롱             1
단란주점            1
Name: 업태구분명, dtype: int64

In [37]:
# 영업중인 스타벅스 매장 수 - 22년 기준 571개
where = (df.사업장명 == '스타벅스') # & (df.상세영업상태명 == '영업')
df[where]['상세영업상태명'].value_counts()

영업    518
폐업    125
Name: 상세영업상태명, dtype: int64

In [38]:
# 강남구에 영업중인 스타벅스 매장 수 - 22년 기준 88개
where = (df.사업장명 == '스타벅스') & (df.지역구 == '강남구') # & (df.상세영업상태명 == '영업')
df[where]['상세영업상태명'].value_counts()

영업    82
폐업    34
Name: 상세영업상태명, dtype: int64

In [39]:
# 영업중인 맥도널드 매장 수 - 21년 기준 서울 95개
where = (df.사업장명 == '맥도날드') # & (df.상세영업상태명 == '영업')
df[where]['상세영업상태명'].value_counts()

영업    88
폐업    87
Name: 상세영업상태명, dtype: int64

# 데이터프레임 그룹핑
* 특정조건에 따라 데이터들을 그룹으로 묶어서 집계함수를 적용해서 통계정보를 출력
* 해당 그룹의 특성을 살펴볼 수 있음
* groupby(조건열).(집계함수)

In [40]:
# 과학적 표기법 출력 설정
pd.options.display.float_format = '{:,.2f}'.format

In [41]:
city = ['서울', '서울', '서울', '부산', '부산', '부산', '인천', '인천']
year = [2015, 2010, 2005, 2015, 2010, 2005, 2015, 2010]
pop = [9904312, 9631482, 9762546, 3448737, 3393191, 3512547, 2890451, 263203]
region = ['수도권', '수도권', '수도권', '경상권', '경상권', '경상권', '수도권', '수도권']

data = { '도시':city, '연도':year, '인구':pop, '지역':region }
df2 = pd.DataFrame(data, index=np.arange(1,9))
df2.head()

Unnamed: 0,도시,연도,인구,지역
1,서울,2015,9904312,수도권
2,서울,2010,9631482,수도권
3,서울,2005,9762546,수도권
4,부산,2015,3448737,경상권
5,부산,2010,3393191,경상권


In [42]:
# 도시별 그룹핑
groupby = df2.groupby('도시')
groupby.groups

{'부산': [4, 5, 6], '서울': [1, 2, 3], '인천': [7, 8]}

In [43]:
# 반복문을 이용해서 그룹핑 결과 출력
for nm, gp in groupby:
    print(nm, gp)

부산    도시    연도       인구   지역
4  부산  2015  3448737  경상권
5  부산  2010  3393191  경상권
6  부산  2005  3512547  경상권
서울    도시    연도       인구   지역
1  서울  2015  9904312  수도권
2  서울  2010  9631482  수도권
3  서울  2005  9762546  수도권
인천    도시    연도       인구   지역
7  인천  2015  2890451  수도권
8  인천  2010   263203  수도권


In [44]:
# 그룹핑 결과에 집계함수 적용 => 연도별 합산
groupby.sum()

Unnamed: 0_level_0,연도,인구
도시,Unnamed: 1_level_1,Unnamed: 2_level_1
부산,6030,10354475
서울,6030,29298340
인천,4025,3153654


In [45]:
# 그룹핑 결과에 집계함수 적용 2 => 연도별 합산
groupby.agg(np.sum)

Unnamed: 0_level_0,연도,인구
도시,Unnamed: 1_level_1,Unnamed: 2_level_1
부산,6030,10354475
서울,6030,29298340
인천,4025,3153654


In [46]:
# 그룹핑 결과에 필터링 적용한 뒤 집계
# 도시별 총인구 출력
# groupby.sum()['인구']   # 모든 컬럼에 대해 총합을 구한 결과에서 인구컬럼만 출력
groupby['인구'].sum()     # 인구컬럼에 한해서 총합을 구한 후 결과출력

도시
부산    10354475
서울    29298340
인천     3153654
Name: 인구, dtype: int64

In [47]:
# 도시별 평균인구 출력
df2.groupby('도시')['인구'].mean()

도시
부산   3,451,491.67
서울   9,766,113.33
인천   1,576,827.00
Name: 인구, dtype: float64

In [48]:
# 연도별 총 인구 출력
df2.groupby('연도')['인구'].sum()

연도
2005    13275093
2010    13287876
2015    16243500
Name: 인구, dtype: int64

In [49]:
# 연도별 평균인구 출력
df2.groupby('연도')['인구'].mean()

연도
2005   6,637,546.50
2010   4,429,292.00
2015   5,414,500.00
Name: 인구, dtype: float64

In [50]:
# 지역별 연도별 평균인구 출력
df2.groupby(['지역','연도'])['인구'].mean()

지역   연도  
경상권  2005   3,512,547.00
     2010   3,393,191.00
     2015   3,448,737.00
수도권  2005   9,762,546.00
     2010   4,947,342.50
     2015   6,397,381.50
Name: 인구, dtype: float64

# 연도별 신생아 성별 출생수 데이터
+ 미국 질병통제센터에서 수집

In [51]:
pd.options.display.float_format = '{:,.0f}'.format
pd.set_option('styler.format.thousands', ',')

In [52]:
births = pd.read_csv('data/births.csv')
births.head()

Unnamed: 0,year,month,day,gender,births
0,1969,1,1,F,4046
1,1969,1,1,M,4440
2,1969,1,2,F,4454
3,1969,1,2,M,4548
4,1969,1,3,F,4548


In [53]:
births.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15547 entries, 0 to 15546
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    15547 non-null  int64  
 1   month   15547 non-null  int64  
 2   day     15067 non-null  float64
 3   gender  15547 non-null  object 
 4   births  15547 non-null  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 607.4+ KB


In [54]:
births.head()

Unnamed: 0,year,month,day,gender,births
0,1969,1,1,F,4046
1,1969,1,1,M,4440
2,1969,1,2,F,4454
3,1969,1,2,M,4548
4,1969,1,3,F,4548


In [55]:
births.tail()

Unnamed: 0,year,month,day,gender,births
15542,2008,10,,M,183219
15543,2008,11,,F,158939
15544,2008,11,,M,165468
15545,2008,12,,F,173215
15546,2008,12,,M,181235


In [56]:
# 연도별 출생수
births.groupby('year')['births'].sum().head(10)

year
1969    3600206
1970    3737800
1971    3563548
1972    3266235
1973    3146125
1974    3170631
1975    3153556
1976    3176476
1977    3332159
1978    3338300
Name: births, dtype: int64

In [57]:
# 월별 출생수
births.groupby('month')['births'].sum()

month
1     12364197
2     11541178
3     12678844
4     12085891
5     12624972
6     12521070
7     13367556
8     13528007
9     13252831
10    12954950
11    12197967
12    12656915
Name: births, dtype: int64

In [58]:
# 성별 출생수
births.groupby('gender')['births'].sum()

gender
F    74035823
M    77738555
Name: births, dtype: int64

In [59]:
# 연도별 월별 출생수
births.groupby(['year','month'])['births'].sum()

year  month
1969  1        293940
      2        270786
      3        296550
      4        282638
      5        289124
                ...  
2008  8        374028
      9        368660
      10       358533
      11       324407
      12       354450
Name: births, Length: 480, dtype: int64

In [60]:
# 연도별 성별 출생수
births.groupby(['year','gender'])['births'].sum()

year  gender
1969  F         1753634
      M         1846572
1970  F         1819164
      M         1918636
1971  F         1736774
                 ...   
2006  M         2188268
2007  F         2111890
      M         2212118
2008  F         2077929
      M         2177227
Name: births, Length: 80, dtype: int64

In [61]:
# 2000년 성별 출생수 1 - 조건 적용후 그룹핑
where = births.year == 2000
births[where].groupby('gender')['births'].sum()

gender
F    1984255
M    2079568
Name: births, dtype: int64

In [62]:
# 2000년 성별 출생수 2 - 그룹핑 후 조건 적용
d = births.groupby(['year']).filter(lambda x: (x.year == 2000).any())
d.groupby('gender')['births'].sum()

gender
F    1984255
M    2079568
Name: births, dtype: int64

# pandas를 sql로 조작하기
* pandasql 패키지를 이용하면 pandas 데이터프레임을 sql질의문으로 데이터를 조작할 수 있음
* sql

In [63]:
!pip install pandasql

Defaulting to user installation because normal site-packages is not writeable


In [64]:
import pandasql as pql

In [65]:
# 년도를 중복없이 출력
pql.sqldf('select distinct year from births').head()

Unnamed: 0,year
0,1969
1,1970
2,1971
3,1972
4,1973


In [81]:
# 연도별 출생수
pql.sqldf('select year, sum(births) births from births group by year').head()

Unnamed: 0,year,births
0,1969,3600206
1,1970,3737800
2,1971,3563548
3,1972,3266235
4,1973,3146125


In [83]:
# 연도별 월별 출생수
pql.sqldf('select year, month, sum(births) births from births group by month').head()

Unnamed: 0,year,month,births
0,1969,1,12364197
1,1969,2,11541178
2,1969,3,12678844
3,1969,4,12085891
4,1969,5,12624972


In [91]:
# 2000년 월별 성별 출생수
sql = '''select month, gender, sum(births) births from births
where year = 2000
group by month, gender
'''
pql.sqldf(sql)

Unnamed: 0,month,gender,births
0,1,F,161288
1,1,M,169225
2,2,F,154694
3,2,M,162997
4,3,F,166124
5,3,M,174808
6,4,F,155038
7,4,M,162495
8,5,F,166443
9,5,M,175161


In [92]:
df2

Unnamed: 0,도시,연도,인구,지역
1,서울,2015,9904312,수도권
2,서울,2010,9631482,수도권
3,서울,2005,9762546,수도권
4,부산,2015,3448737,경상권
5,부산,2010,3393191,경상권
6,부산,2005,3512547,경상권
7,인천,2015,2890451,수도권
8,인천,2010,263203,수도권


# 피벗테이블
* 소량 데이터는 별 다른 수고없이 통계분석이 가능
* 하지만, 대량의 데이터는 통계분석하기에 어려움
* 수많은 데이터 중에서 원하는 데이터만 골라서 테이블을 재구성한 것을 의미 - summary table
* pivot(행,열,대상)
* pivot_table()

In [99]:
# 도시별, 연도별 인구수
# df2.groupby(['도시','연도'])['인구'].sum()
pd.pivot_table(df2, index=['도시','연도'], values = ['인구'])

Unnamed: 0_level_0,Unnamed: 1_level_0,인구
도시,연도,Unnamed: 2_level_1
부산,2005,3512547
부산,2010,3393191
부산,2015,3448737
서울,2005,9762546
서울,2010,9631482
서울,2015,9904312
인천,2010,263203
인천,2015,2890451


In [97]:
df2.pivot('연도','도시','인구')   # 컬럼들이 옆으로 넓게 출력

도시,부산,서울,인천
연도,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2005,3512547,9762546,
2010,3393191,9631482,263203.0
2015,3448737,9904312,2890451.0


# pivot_table
* pivot과 groupby를 적당히 섞은 명령
* pivot에서 지원하지 않는 다양한 집계함수 사용가능
* pivot_table(대상,행,열,집계함수,총계여부)

In [108]:
# 도시별 연도별 인구수
df2.pivot_table('인구','도시','연도')

연도,2005,2010,2015
도시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
부산,3512547.0,3393191,3448737
서울,9762546.0,9631482,9904312
인천,,263203,2890451


In [109]:
# 연도별 도시 인구수
df2.pivot_table('인구','연도','도시')

도시,부산,서울,인천
연도,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2005,3512547,9762546,
2010,3393191,9631482,263203.0
2015,3448737,9904312,2890451.0


In [115]:
# 도시별 연도별 인구수(행에 2ㅐ이상의 컬럼 지정)
# df2.pivot_table(index=['연도','도시'], values = ['인구'])
df2.pivot_table('인구', ['연도','도시'])

Unnamed: 0_level_0,Unnamed: 1_level_0,인구
연도,도시,Unnamed: 2_level_1
2005,부산,3512547
2005,서울,9762546
2010,부산,3393191
2010,서울,9631482
2010,인천,263203
2015,부산,3448737
2015,서울,9904312
2015,인천,2890451


In [4]:
# 식당내 식사후 내는 팁tips에 대한 데이터를 이용한 피벗테이블 예제
tip = pd.read_csv('data/tips.csv')
tip

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [5]:
import seaborn as sns
tips = sns.load_dataset('tips')

In [8]:
# 성별 팁 현황
tips.groupby(['sex'])['tip'].sum()

sex
Male      485.07
Female    246.51
Name: tip, dtype: float64

In [7]:
# 성별 요일별 팁 현황
tips.groupby(['sex','day'])['tip'].sum()

sex     day 
Male    Thur     89.41
        Fri      26.93
        Sat     181.95
        Sun     186.78
Female  Thur     82.42
        Fri      25.03
        Sat      78.45
        Sun      60.61
Name: tip, dtype: float64

In [11]:
# 성별 요일별 팁 현황
tips.pivot_table('tip', 'sex', 'day', aggfunc = sum)   # mean

day,Thur,Fri,Sat,Sun
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Male,89.41,26.93,181.95,186.78
Female,82.42,25.03,78.45,60.61


In [15]:
# 팁을 준 고객의 성비 분류
tips.groupby('sex')['tip'].count()

sex
Male      157
Female     87
Name: tip, dtype: int64

In [17]:
tips.pivot_table('tip', 'sex', aggfunc='count')

Unnamed: 0_level_0,tip
sex,Unnamed: 1_level_1
Male,157
Female,87


In [34]:
# 성별 흡연/비흡연 고객 분류
tips.groupby('sex')['smoker'].count()

sex
Male      157
Female     87
Name: smoker, dtype: int64

In [19]:
tips.pivot_table('tip','sex','smoker')

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,3.051167,3.113402
Female,2.931515,2.773519


In [20]:
tips.pivot_table('tip','sex','smoker', aggfunc='count')

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,60,97
Female,33,54


In [28]:
# 사원데이터를 이용한 pivot_table예제
emp = pd.read_csv('data/EMPLOYEES.csv')
emp.head(20)

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
0,100,Steven,King,SKING,515.123.4567,2003-06-17,AD_PRES,24000,,,90.0
1,101,Neena,Kochhar,NKOCHHAR,515.123.4568,2005-09-21,AD_VP,17000,,100.0,90.0
2,102,Lex,De Haan,LDEHAAN,515.123.4569,2001-01-13,AD_VP,17000,,100.0,90.0
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03,IT_PROG,9000,,102.0,60.0
4,104,Bruce,Ernst,BERNST,590.423.4568,2007-05-21,IT_PROG,6000,,103.0,60.0
5,105,David,Austin,DAUSTIN,590.423.4569,2005-06-25,IT_PROG,4800,,103.0,60.0
6,106,Valli,Pataballa,VPATABAL,590.423.4560,2006-02-05,IT_PROG,4800,,103.0,60.0
7,107,Diana,Lorentz,DLORENTZ,590.423.5567,2007-02-07,IT_PROG,4200,,103.0,60.0
8,108,Nancy,Greenberg,NGREENBE,515.124.4569,2002-08-17,FI_MGR,12008,,101.0,100.0
9,109,Daniel,Faviet,DFAVIET,515.124.4169,2002-08-16,FI_ACCOUNT,9000,,108.0,100.0


In [65]:
# 사원employees 데이터에서 잭책별 사원수 조회
empcnt = emp.groupby('JOB_ID')['DEPARTMENT_ID'].count()
empcnt.sort_values(ascending=False).head()
# emp.JOB_ID.value_counts()

JOB_ID
SA_REP        29
ST_CLERK      20
SH_CLERK      20
ST_MAN         5
FI_ACCOUNT     5
Name: DEPARTMENT_ID, dtype: int64

In [66]:
empcnt = emp.pivot_table('DEPARTMENT_ID', 'JOB_ID', aggfunc='count')
empcnt.sort_values(by = 'DEPARTMENT_ID', ascending=False).head()   # 정렬을 위해 by옵션 사용

Unnamed: 0_level_0,DEPARTMENT_ID
JOB_ID,Unnamed: 1_level_1
SA_REP,29
ST_CLERK,20
SH_CLERK,20
ST_MAN,5
FI_ACCOUNT,5


In [68]:
# 사원employees 데이터에서 잭책별 부서별 사원수 조회
emp.groupby(['DEPARTMENT_ID', 'JOB_ID'])['FIRST_NAME'].count()

DEPARTMENT_ID  JOB_ID    
10.0           AD_ASST        1
20.0           MK_MAN         1
               MK_REP         1
30.0           PU_CLERK       5
               PU_MAN         1
40.0           HR_REP         1
50.0           SH_CLERK      20
               ST_CLERK      20
               ST_MAN         5
60.0           IT_PROG        5
70.0           PR_REP         1
80.0           SA_MAN         5
               SA_REP        29
90.0           AD_PRES        1
               AD_VP          2
100.0          FI_ACCOUNT     5
               FI_MGR         1
110.0          AC_ACCOUNT     1
               AC_MGR         1
Name: FIRST_NAME, dtype: int64

In [74]:
empcnt = emp.pivot_table('FIRST_NAME', 'JOB_ID', 'DEPARTMENT_ID', aggfunc='count')
empcnt.fillna('').head()

DEPARTMENT_ID,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,110.0
JOB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AC_ACCOUNT,,,,,,,,,,,1.0
AC_MGR,,,,,,,,,,,1.0
AD_ASST,1.0,,,,,,,,,,
AD_PRES,,,,,,,,,1.0,,
AD_VP,,,,,,,,,2.0,,


In [78]:
# 사원employees 데이터에서 부서별 직책별 급여 현황 : min, max
emp.groupby(['DEPARTMENT_ID','JOB_ID'])['SALARY'].min().head()

DEPARTMENT_ID  JOB_ID  
10.0           AD_ASST      4400
20.0           MK_MAN      13000
               MK_REP       6000
30.0           PU_CLERK     2500
               PU_MAN      11000
Name: SALARY, dtype: int64

In [49]:
emp.pivot_table('SALARY', ['JOB_ID', 'DEPARTMENT_ID'], aggfunc='min')

Unnamed: 0_level_0,Unnamed: 1_level_0,SALARY
JOB_ID,DEPARTMENT_ID,Unnamed: 2_level_1
AC_ACCOUNT,110.0,8300
AC_MGR,110.0,12008
AD_ASST,10.0,4400
AD_PRES,90.0,24000
AD_VP,90.0,17000
FI_ACCOUNT,100.0,6900
FI_MGR,100.0,12008
HR_REP,40.0,6500
IT_PROG,60.0,4200
MK_MAN,20.0,13000


In [80]:
emp.pivot_table('SALARY', 'JOB_ID', 'DEPARTMENT_ID', aggfunc='min').fillna('').head()

DEPARTMENT_ID,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,110.0
JOB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AC_ACCOUNT,,,,,,,,,,,8300.0
AC_MGR,,,,,,,,,,,12008.0
AD_ASST,4400.0,,,,,,,,,,
AD_PRES,,,,,,,,,24000.0,,
AD_VP,,,,,,,,,17000.0,,


In [81]:
emp.pivot_table('SALARY', 'JOB_ID', 'DEPARTMENT_ID', aggfunc='max').fillna('').head()

DEPARTMENT_ID,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,100.0,110.0
JOB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AC_ACCOUNT,,,,,,,,,,,8300.0
AC_MGR,,,,,,,,,,,12008.0
AD_ASST,4400.0,,,,,,,,,,
AD_PRES,,,,,,,,,24000.0,,
AD_VP,,,,,,,,,17000.0,,


In [50]:
emp.pivot_table('SALARY', ['JOB_ID', 'DEPARTMENT_ID'], aggfunc='max')

Unnamed: 0_level_0,Unnamed: 1_level_0,SALARY
JOB_ID,DEPARTMENT_ID,Unnamed: 2_level_1
AC_ACCOUNT,110.0,8300
AC_MGR,110.0,12008
AD_ASST,10.0,4400
AD_PRES,90.0,24000
AD_VP,90.0,17000
FI_ACCOUNT,100.0,9000
FI_MGR,100.0,12008
HR_REP,40.0,6500
IT_PROG,60.0,9000
MK_MAN,20.0,13000


# 두 데이터프레임 합치기 - join/merge
* join(dfl, dfr, how='조인조건')
* merge(dfl, dfr, on='컬럼')

In [None]:
# 조건에 따라 합치기
# 두 데이터프레임을 합치는 기준은 index
# index가 없는 데이터를 합치는 경우 NaN으로 저장

In [82]:
nums1 = [10,20,30,40,50,60,70]
nums2 = [80,90,100,110,120,]

df1 = pd.DataFrame(nums1, columns=['One'], index=['a','b','c','d','e','f','g'])
df2 = pd.DataFrame(nums2, columns=['Two'], index=['a','b','x','y','z'])

In [84]:
# 무조건 합치기
df1.join(df2)  # df1을 기준으로 일치하는 index위주로 합침

Unnamed: 0,One,Two
a,10,80.0
b,20,90.0
c,30,
d,40,
e,50,
f,60,
g,70,


In [86]:
df1.join(df2, how='inner')   # 일치하는 index위주로만 합침

Unnamed: 0,One,Two
a,10,80
b,20,90


In [87]:
df1.join(df2, how='outer')  # 일치하는 index위주로 합치고 나머진 NaN

Unnamed: 0,One,Two
a,10.0,80.0
b,20.0,90.0
c,30.0,
d,40.0,
e,50.0,
f,60.0,
g,70.0,
x,,100.0
y,,110.0
z,,120.0


In [88]:
df1.join(df2, how='left')   # df1을 기준으로 outer join

Unnamed: 0,One,Two
a,10,80.0
b,20,90.0
c,30,
d,40,
e,50,
f,60,
g,70,


In [89]:
df1.join(df2, how='right')   # df2을 기준으로 outer join

Unnamed: 0,One,Two
a,10.0,80
b,20.0,90
x,,100
y,,110
z,,120


In [90]:
# 지정한 컬럼을 기준으로 합치기
df1 = pd.DataFrame({
    'id':[1,2,3,4,5],
    'name':['혜교','지현','수지','시나','중기'],
    'ban': [1,2,3,6,7] })

df2 = pd.DataFrame({
    'id':[1,2,3,4,5],
    'name':[99,87,52,63,48],
    'ban': [1,2,3,4,5] })

In [91]:
# 각 데이터프레임에 공통으로 존재하는 컬럼을 기준으로 합침
df1.merge(df2, on='id')

Unnamed: 0,id,name_x,ban_x,name_y,ban_y
0,1,혜교,1,99,1
1,2,지현,2,87,2
2,3,수지,3,52,3
3,4,시나,6,63,4
4,5,중기,7,48,5


In [None]:
df1.merge(df2, on='name')   # 컬럼간 자료형이 일치하지 않음 - 합침 실패

In [92]:
df1.merge(df2, on='ban')  # 일치하는 값만 합침 - 부분 성공

Unnamed: 0,id_x,name_x,ban,id_y,name_y
0,1,혜교,1,1,99
1,2,지현,2,2,87
2,3,수지,3,3,52


In [93]:
# 일치하지 않는 값들도 합침 - how
df1.merge(df2, on='ban', how='left')

Unnamed: 0,id_x,name_x,ban,id_y,name_y
0,1,혜교,1,1.0,99.0
1,2,지현,2,2.0,87.0
2,3,수지,3,3.0,52.0
3,4,시나,6,,
4,5,중기,7,,


In [None]:
df1.merge(df2, on='ban', how='right')

In [None]:
df1.merge(df2, on='ban', how='outer')

In [102]:
# EMPLOYEES와 DEPARTMENTS데이터 프레임 합쳐보세요
emp = pd.read_csv('data/EMPLOYEES.csv')
dep = pd.read_csv('data/DEPARTMENTS.csv')
dep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   DEPARTMENT_ID    27 non-null     int64  
 1   DEPARTMENT_NAME  27 non-null     object 
 2   MANAGER_ID       11 non-null     float64
 3   LOCATION_ID      27 non-null     int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 992.0+ bytes


In [112]:
dep.DEPARTMENT_ID = dep.DEPARTMENT_ID.astype(object)

In [114]:
emp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   EMPLOYEE_ID     107 non-null    int64  
 1   FIRST_NAME      107 non-null    object 
 2   LAST_NAME       107 non-null    object 
 3   EMAIL           107 non-null    object 
 4   PHONE_NUMBER    107 non-null    object 
 5   HIRE_DATE       107 non-null    object 
 6   JOB_ID          107 non-null    object 
 7   SALARY          107 non-null    int64  
 8   COMMISSION_PCT  35 non-null     float64
 9   MANAGER_ID      106 non-null    float64
 10  DEPARTMENT_ID   106 non-null    object 
dtypes: float64(2), int64(2), object(7)
memory usage: 9.3+ KB


In [119]:
#컬럼기반으로 합침
emp.merge(dep, on='DEPARTMENT_ID').head()

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID_x,DEPARTMENT_ID,DEPARTMENT_NAME,MANAGER_ID_y,LOCATION_ID
0,100,Steven,King,SKING,515.123.4567,2003-06-17,AD_PRES,24000,,,90.0,Executive,100.0,1700
1,101,Neena,Kochhar,NKOCHHAR,515.123.4568,2005-09-21,AD_VP,17000,,100.0,90.0,Executive,100.0,1700
2,102,Lex,De Haan,LDEHAAN,515.123.4569,2001-01-13,AD_VP,17000,,100.0,90.0,Executive,100.0,1700
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03,IT_PROG,9000,,102.0,60.0,IT,103.0,1400
4,104,Bruce,Ernst,BERNST,590.423.4568,2007-05-21,IT_PROG,6000,,103.0,60.0,IT,103.0,1400


In [124]:
# 데이터프레임에 동일한 이름의 컬럼이 존재 - join시 실패함 - 컬럼명 변경
dep.columns = ['DDEPARTMENT_ID', 'DDEPARTMENT_NAME', 'DMANAGER_ID', 'LOCATION_ID'] 
dep.head()

Unnamed: 0,DDEPARTMENT_ID,DDEPARTMENT_NAME,DMANAGER_ID,LOCATION_ID
0,10,Administration,200.0,1700
1,20,Marketing,201.0,1800
2,30,Purchasing,114.0,1700
3,40,Human Resources,203.0,2400
4,50,Shipping,121.0,1500


In [125]:
emp.fillna(0, inplace=True)
emp.DEPARTMENT_ID = emp.DEPARTMENT_ID.astype(int)
emp.join(dep, how='outer')

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID,DDEPARTMENT_ID,DDEPARTMENT_NAME,DMANAGER_ID,LOCATION_ID
0,100,Steven,King,SKING,515.123.4567,2003-06-17,AD_PRES,24000,0.0,0.0,90,10,Administration,200.0,1700.0
1,101,Neena,Kochhar,NKOCHHAR,515.123.4568,2005-09-21,AD_VP,17000,0.0,100.0,90,20,Marketing,201.0,1800.0
2,102,Lex,De Haan,LDEHAAN,515.123.4569,2001-01-13,AD_VP,17000,0.0,100.0,90,30,Purchasing,114.0,1700.0
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03,IT_PROG,9000,0.0,102.0,60,40,Human Resources,203.0,2400.0
4,104,Bruce,Ernst,BERNST,590.423.4568,2007-05-21,IT_PROG,6000,0.0,103.0,60,50,Shipping,121.0,1500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,202,Pat,Fay,PFAY,603.123.6666,2005-08-17,MK_REP,6000,0.0,201.0,20,,,,
103,203,Susan,Mavris,SMAVRIS,515.123.7777,2002-06-07,HR_REP,6500,0.0,101.0,40,,,,
104,204,Hermann,Baer,HBAER,515.123.8888,2002-06-07,PR_REP,10000,0.0,101.0,70,,,,
105,205,Shelley,Higgins,SHIGGINS,515.123.8080,2002-06-07,AC_MGR,12008,0.0,101.0,110,,,,


In [127]:
# index 기반으로 합쳤기 때문에 올바른 결과를 보장할 수 없음
empdept = emp.join(dep, how='outer')
empdept.head()

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID,DDEPARTMENT_ID,DDEPARTMENT_NAME,DMANAGER_ID,LOCATION_ID
0,100,Steven,King,SKING,515.123.4567,2003-06-17,AD_PRES,24000,0.0,0.0,90,10,Administration,200.0,1700.0
1,101,Neena,Kochhar,NKOCHHAR,515.123.4568,2005-09-21,AD_VP,17000,0.0,100.0,90,20,Marketing,201.0,1800.0
2,102,Lex,De Haan,LDEHAAN,515.123.4569,2001-01-13,AD_VP,17000,0.0,100.0,90,30,Purchasing,114.0,1700.0
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03,IT_PROG,9000,0.0,102.0,60,40,Human Resources,203.0,2400.0
4,104,Bruce,Ernst,BERNST,590.423.4568,2007-05-21,IT_PROG,6000,0.0,103.0,60,50,Shipping,121.0,1500.0
