In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt

# 데이터 정제
- 누락(NaN) 데이터
- 중복데이터
- 데이터 치환
- 연속형 데이터의 카테고리화
- 그룹화
- mapping
- 색인

## 1. NA 처리 메서드

- `dropna` : 누락된 데이터가 있는 축(행,열)을 제외
- `fillna` : 누락 데이터를 대신한 값으로 채움 or ffill, bfill
- `isnull` : 누락 데이터를 추출
- `notnull` : isnull에 반대되는 데이터를 추출

In [12]:
stringData=pd.Series(["aaa","bbb",np.nan,"ccc"])
print(stringData)
print(stringData.isnull())
print(stringData[stringData.isnull()])

0    aaa
1    bbb
2    NaN
3    ccc
dtype: object
0    False
1    False
2     True
3    False
dtype: bool
2    NaN
dtype: object


In [13]:
stringData[0]=None #None은 NA와 같음

In [14]:
stringData

0    None
1     bbb
2     NaN
3     ccc
dtype: object

In [15]:
stringData.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [16]:
from numpy import nan as NA
data=pd.Series([1, NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [17]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [18]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

---

In [19]:
data=pd.DataFrame([[1, 6, 3],
             [1,NA,NA],
             [NA,NA,NA],
             [NA,5,2]])
data

Unnamed: 0,0,1,2
0,1.0,6.0,3.0
1,1.0,,
2,,,
3,,5.0,2.0


In [20]:
data.dropna(axis=0)

Unnamed: 0,0,1,2
0,1.0,6.0,3.0


In [21]:
data.dropna(axis=1)

0
1
2
3


In [22]:
data.dropna() #data.dropna(axis=0) default : axis=0

Unnamed: 0,0,1,2
0,1.0,6.0,3.0


In [23]:
data.dropna(how="all") #모두 nan인 경우만 제거

Unnamed: 0,0,1,2
0,1.0,6.0,3.0
1,1.0,,
3,,5.0,2.0


## 2. 중복 처리 메소드
- `duplicated()` : 중복 확인
- `drop_duplicates()` : 중복 제거

In [24]:
data=pd.DataFrame({'a':['one','two']*3+['two'],
                  'b':[1,1,2,3,3,4,4]})
data.info()
data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   a       7 non-null      object
 1   b       7 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 240.0+ bytes


Unnamed: 0,a,b
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [25]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [26]:
data.drop_duplicates() #duplicated 함수 결과가 False인 데이터프레임을 리턴

Unnamed: 0,a,b
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [27]:
data['v1']=range(7)
data

Unnamed: 0,a,b,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [28]:
#전체 컬럼에 대한 중복값을 제외
data.drop_duplicates() 

Unnamed: 0,a,b,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [29]:
#특정 컬럼에 대한 중복값을 제외
data.drop_duplicates(['a'])

Unnamed: 0,a,b,v1
0,one,1,0
1,two,1,1


In [30]:
#특정 컬럼들에 대한 중복값을 제외
data.drop_duplicates(['a', 'b'])

Unnamed: 0,a,b,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5


In [31]:
#특정 컬럼들에 대한 중복값을 제외 (but, 중복데이터 중 뒤쪽에 있는 데이터를 남김)
data.drop_duplicates(['a', 'b'], keep='last')

Unnamed: 0,a,b,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


## 3. 데이터치환 메소드  
- `map`함수
- `replace`함수 : 딕셔너리구조 활용

## 4. 연속형 데이터의 카테고리화
- 연속성 데이터의 경우 카테고리화가 필요할 때 데이터를 구간 별로 나눈다.
- `cut` & `bins` 함수

In [32]:
ages=[-3,15,20,25,28,30,20,22,37,61,44,46,33,111]
bins=[0,10,20,30,40,60,100]
#연령(...)데이터 -> 연령대로 나눔 -> 연령대별로 카테고리화
res=pd.cut(ages, bins)

In [33]:
res

[NaN, (10.0, 20.0], (10.0, 20.0], (20.0, 30.0], (20.0, 30.0], ..., (60.0, 100.0], (40.0, 60.0], (40.0, 60.0], (30.0, 40.0], NaN]
Length: 14
Categories (6, interval[int64]): [(0, 10] < (10, 20] < (20, 30] < (30, 40] < (40, 60] < (60, 100]]

In [34]:
# 데이터가 속하는 구간
# -1은 NaN 값을 나타냄.(어느 구간에도 속하지 않을 경우)
res.codes

array([-1,  1,  1,  2,  2,  2,  1,  2,  3,  5,  4,  4,  3, -1], dtype=int8)

In [35]:
res.categories

IntervalIndex([(0, 10], (10, 20], (20, 30], (30, 40], (40, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [36]:
res.value_counts()
#pd.value_counts(res)

(0, 10]      0
(10, 20]     3
(20, 30]     4
(30, 40]     2
(40, 60]     2
(60, 100]    1
dtype: int64

---

In [39]:
ages=[15,20,25,28,30,20,22,37,61,44,46,33]

In [40]:
# right 옵션 : 기본은 True
pd.cut(ages, [15,26,36,61,100]) # 구간 : 초과~이하

[NaN, (15.0, 26.0], (15.0, 26.0], (26.0, 36.0], (26.0, 36.0], ..., (36, 61], (36, 61], (36, 61], (36, 61], (26, 36]]
Length: 12
Categories (4, interval[int64]): [(15, 26] < (26, 36] < (36, 61] < (61, 100]]

In [41]:
#right 옵션 : False일때
pd.cut(ages, [15,26,36,61,100], right=False)  # 구간 : 이상~미만

[[15, 26), [15, 26), [15, 26), [26, 36), [26, 36), ..., [36, 61), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[15, 26) < [26, 36) < [36, 61) < [61, 100)]

In [43]:
#label옵션
gn=['youth', 'youngyouth','middleaged', 'senior']
pd.cut(ages, [15,26,36,61,100],labels=gn) #구간을 라벨 이름으로 나타냄

[NaN, 'youth', 'youth', 'youngyouth', 'youngyouth', ..., 'middleaged', 'middleaged', 'middleaged', 'middleaged', 'youngyouth']
Length: 12
Categories (4, object): ['youth' < 'youngyouth' < 'middleaged' < 'senior']

In [44]:
#qcut : 각 구간의 데이터의 건수를 동일하게 나눈 것
res=pd.qcut(ages,4)
res.value_counts()

(14.999, 21.5]    3
(21.5, 29.0]      3
(29.0, 38.75]     3
(38.75, 61.0]     3
dtype: int64

## 5. 그룹화
- `groupby()` : 그룹단위로 집계(요약)

In [None]:
# 순서 : 전체데이터 -> 그룹별로 분할 -> 각 그룹별로 집계(요약)함수 적용 -> 각 그룹별 집계 결과들 -> 합침

# 성별  데이터 
# 남     11       =>    남 : 100
# 여     22    =>       여  : 200
# ...

In [45]:
abalone=pd.read_csv("data/abalone.txt", sep=",", header=None, names=["sex", 'length', "diameter",
                                                       "height", "whole_weight",
                                                       "shucked_weight", "viscera_weight",
                                                       "shell_weight","rings"])

In [46]:
abalone

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [47]:
np.sum(abalone.isnull())

sex               0
length            0
diameter          0
height            0
whole_weight      0
shucked_weight    0
viscera_weight    0
shell_weight      0
rings             0
dtype: int64

In [None]:
# whole_weight 칼럼 : 전복 성별 그룹별 전체 무게 변수에 대해 집계

In [48]:
#abalone 데이터의 sex에 따른 그룹화
grouped=abalone['whole_weight'].groupby(abalone['sex'])

In [49]:
grouped.size()
#abalone의 'sex' 그룹화 -> 각 그룹별 whole_weight 컬럼값의 size()호출 결과

sex
F    1307
I    1342
M    1528
Name: whole_weight, dtype: int64

In [50]:
grouped.sum()
#abalone의 'sex' 그룹화 -> 각 그룹별 whole_weight 컬럼값의 sum()호출 결과

sex
F    1367.8175
I     578.8885
M    1514.9500
Name: whole_weight, dtype: float64

In [51]:
grouped.mean()
#abalone의 'sex' 그룹화 -> 각 그룹별 whole_weight 컬럼값의 mean()호출 결과

sex
F    1.046532
I    0.431363
M    0.991459
Name: whole_weight, dtype: float64

In [52]:
abalone.groupby(abalone['sex']).sum() #abalone.groupby('sex').sum() 과 같은 표현

Unnamed: 0_level_0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,756.875,594.335,206.52,1367.8175,583.1675,301.51,394.727,14546
I,574.035,438.155,144.93,578.8885,256.369,123.4775,172.0205,10589
M,857.805,671.23,231.31,1514.95,661.5415,329.352,430.849,16358


In [53]:
abalone.groupby(abalone['sex']).mean() # abalone.groupby('sex').mean() 과 같은 표현

Unnamed: 0_level_0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,0.579093,0.454732,0.158011,1.046532,0.446188,0.230689,0.30201,11.129304
I,0.427746,0.326494,0.107996,0.431363,0.191035,0.09201,0.128182,7.890462
M,0.561391,0.439287,0.151381,0.991459,0.432946,0.215545,0.281969,10.705497


In [None]:
# length 칼럼

In [54]:
abalone.length

0       0.455
1       0.350
2       0.530
3       0.440
4       0.330
        ...  
4172    0.565
4173    0.590
4174    0.600
4175    0.625
4176    0.710
Name: length, Length: 4177, dtype: float64

In [56]:

#범주형(2가지) 변수(length_med) 추가: length값이 length열의 중앙값보다 크면 
#length_long, 그렇지 않으면 length_short

# length     length_med
# 0.455     length_short
# 0.350     length_short
# 0.95      length_long
# ...

#np.where(조건, 참, 거짓)

#abalone['length_med'] = (abalone.length>abalone.length.median()).map({True:'length_long',False:'length_short'})

#abalone["length_med"] = np.where(abalone["length"] > abalone["length"].median(), "length_long", "length_short")
#abalone[["length", "length_med"]]

#abalone['Length_label']=pd.cut(abalone.Length, [0,abalone.Length.median(),1], labels=['length_short', 'length_long'])


abalone["length_med"] = np.where(abalone["length"] > abalone["length"].median(), 
                                 "length_long", "length_short")
abalone

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings,length_med
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,length_short
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,length_short
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,length_short
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,length_short
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,length_short
...,...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11,length_long
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,length_long
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,length_long
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10,length_long


In [57]:
abalone[["length", "length_med"]]

Unnamed: 0,length,length_med
0,0.455,length_short
1,0.350,length_short
2,0.530,length_short
3,0.440,length_short
4,0.330,length_short
...,...,...
4172,0.565,length_long
4173,0.590,length_long
4174,0.600,length_long
4175,0.625,length_long


In [None]:
# 그룹화

In [58]:
mean_weight=abalone['whole_weight'].groupby([abalone['sex'],abalone['length_med']]).mean()
mean_weight

sex  length_med  
F    length_long     1.261330
     length_short    0.589702
I    length_long     0.923215
     length_short    0.351234
M    length_long     1.255182
     length_short    0.538157
Name: whole_weight, dtype: float64

In [59]:
mean_weight.unstack()

length_med,length_long,length_short
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,1.26133,0.589702
I,0.923215,0.351234
M,1.255182,0.538157


In [None]:
#그룹별로 특정 작업을 반복을 하고 싶을 때!
#방법 : abalone 성별로 그룹화 -> for loop -> 그룹별 데이터셋을 출력

In [60]:
abalone[['sex','length_med', 'whole_weight', 'rings']]

Unnamed: 0,sex,length_med,whole_weight,rings
0,M,length_short,0.5140,15
1,M,length_short,0.2255,7
2,F,length_short,0.6770,9
3,M,length_short,0.5160,10
4,I,length_short,0.2050,7
...,...,...,...,...
4172,F,length_long,0.8870,11
4173,M,length_long,0.9660,10
4174,M,length_long,1.1760,9
4175,F,length_long,1.0945,10


In [61]:
abalone[['sex','length_med', 'whole_weight', 'rings']].groupby('sex')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001F4C721F160>

In [62]:
#그룹화 객체를 for문으로 반복하면, 그룹 이름(M, F, I)과 그룹별 데이터를 리턴할 수 있음
for sex, group_data in abalone[['sex','length_med', 'whole_weight', 'rings']].groupby('sex'):
    print(sex), print(group_data[:5]) # 그룹이름 / 그룹의 데이터

F
   sex    length_med  whole_weight  rings
2    F  length_short        0.6770      9
6    F  length_short        0.7775     20
7    F  length_short        0.7680     16
9    F   length_long        0.8945     19
10   F  length_short        0.6065     14
I
   sex    length_med  whole_weight  rings
4    I  length_short        0.2050      7
5    I  length_short        0.3515      8
16   I  length_short        0.2905      7
21   I  length_short        0.2255     10
42   I  length_short        0.0700      5
M
   sex    length_med  whole_weight  rings
0    M  length_short        0.5140     15
1    M  length_short        0.2255      7
3    M  length_short        0.5160     10
8    M  length_short        0.5095      9
11   M  length_short        0.4060     10


In [63]:
#그룹(sex, length_med 조합)별 데이터셋 출력
for (sex,length_med), group_data in abalone[['sex','length_med', 'whole_weight', 'rings']].groupby(['sex', 'length_med']):
    print(sex, length_med), print(group_data[:5])

F length_long
   sex   length_med  whole_weight  rings
9    F  length_long        0.8945     19
22   F  length_long        0.9395     12
23   F  length_long        0.7635      9
24   F  length_long        1.1615     10
25   F  length_long        0.9285     11
F length_short
   sex    length_med  whole_weight  rings
2    F  length_short        0.6770      9
6    F  length_short        0.7775     20
7    F  length_short        0.7680     16
10   F  length_short        0.6065     14
13   F  length_short        0.6845     10
I length_long
    sex   length_med  whole_weight  rings
509   I  length_long        0.8735     16
510   I  length_long        1.1095     10
549   I  length_long        0.8750     11
550   I  length_long        1.1625     17
551   I  length_long        0.9885     13
I length_short
   sex    length_med  whole_weight  rings
4    I  length_short        0.2050      7
5    I  length_short        0.3515      8
16   I  length_short        0.2905      7
21   I  length_short    

In [64]:
#{키:값, 키:값, 키:값}
#{'F':F그룹 데이터셋, 'M':M그룹 데이터셋, 'I':I그룹 데이터셋}
list(abalone[:10][['sex','length_med', 'whole_weight', 'rings']].groupby('sex'))

[('F',   sex    length_med  whole_weight  rings
  2   F  length_short        0.6770      9
  6   F  length_short        0.7775     20
  7   F  length_short        0.7680     16
  9   F   length_long        0.8945     19),
 ('I',   sex    length_med  whole_weight  rings
  4   I  length_short        0.2050      7
  5   I  length_short        0.3515      8),
 ('M',   sex    length_med  whole_weight  rings
  0   M  length_short        0.5140     15
  1   M  length_short        0.2255      7
  3   M  length_short        0.5160     10
  8   M  length_short        0.5095      9)]

In [65]:
#성별로 그룹화 -> 성별 그룹을 key로 설정 -> 데이터셋은 value로 설정
aba_group=dict(list(abalone[:10][['sex','length_med', 'whole_weight', 'rings']].groupby('sex')))
aba_group

{'F':   sex    length_med  whole_weight  rings
 2   F  length_short        0.6770      9
 6   F  length_short        0.7775     20
 7   F  length_short        0.7680     16
 9   F   length_long        0.8945     19,
 'I':   sex    length_med  whole_weight  rings
 4   I  length_short        0.2050      7
 5   I  length_short        0.3515      8,
 'M':   sex    length_med  whole_weight  rings
 0   M  length_short        0.5140     15
 1   M  length_short        0.2255      7
 3   M  length_short        0.5160     10
 8   M  length_short        0.5095      9}

In [66]:
#그룹이름을 가지고 데이터셋을 인덱싱
aba_group['M']

Unnamed: 0,sex,length_med,whole_weight,rings
0,M,length_short,0.514,15
1,M,length_short,0.2255,7
3,M,length_short,0.516,10
8,M,length_short,0.5095,9


In [67]:
#abalone의 상위 10개 데이터에 대해 'sex'이 'M'인 자료 추출(불린참조)
abalone[:10][abalone['sex']=='M'][['sex','length_med', 'whole_weight', 'rings']]

  


Unnamed: 0,sex,length_med,whole_weight,rings
0,M,length_short,0.514,15
1,M,length_short,0.2255,7
3,M,length_short,0.516,10
8,M,length_short,0.5095,9


## 6. Mapping
- `dict.get()` : 특정 문자열들을 매핑 규칙에 맞게 변환하는 함수  

    *ex) Lee, lee, LEE => lee*  
    *ex) Choi, choi, Cho, CHO,... -> others*

In [77]:
df=pd.DataFrame({'name':['kim','KIM','Kim','lee', 'LEE','Lee', 'cho','choi'],
             'value1':[1,2,3,4,5,6,7,8],
             'value2':[100,200,300,100,200,100,300,500]})
df

Unnamed: 0,name,value1,value2
0,kim,1,100
1,KIM,2,200
2,Kim,3,300
3,lee,4,100
4,LEE,5,200
5,Lee,6,100
6,cho,7,300
7,choi,8,500


In [78]:
# 매핑규칙 만들기
nameMapping={
    "KIM":"kim", 
    "Kim":"kim",
    "LEE":"lee", #np.char.lower("LEE")로 사용해도 됌 -> 소문자로 변형하는 함수
    "Lee":"lee",
    "cho":"others",
    "ch":"others"    
}

In [79]:
#get(매핑되는 값, 매핑되는 값이 없을 때 출력할 값)

#매핑 규칙에 정의되지 않은 키가 전달되면 None이 리턴
#func=lambda x:nameMapping.get(x)

#매핑 규칙에 정의되지 않은 키(kim, lee)가 전달되면 "etc" 리턴
#func=lambda x:nameMapping.get(x, "etc") 

#매핑 규칙에 정의되지 않은 키(kim, lee)가 전달되면 그 값 그대로(kim, lee) 리턴
func=lambda x:nameMapping.get(x, x) 

In [80]:
df['name2']=df.name.map(func)

In [81]:
df

Unnamed: 0,name,value1,value2,name2
0,kim,1,100,kim
1,KIM,2,200,kim
2,Kim,3,300,kim
3,lee,4,100,lee
4,LEE,5,200,lee
5,Lee,6,100,lee
6,cho,7,300,others
7,choi,8,500,choi


In [82]:
#name2컬럼값을 그룹화 -> 그룹별 합계
df.groupby('name2').sum()

Unnamed: 0_level_0,value1,value2
name2,Unnamed: 1_level_1,Unnamed: 2_level_1
choi,8,500
kim,6,600
lee,15,400
others,7,300


In [83]:
df.groupby(['name2', 'name']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
name2,name,Unnamed: 2_level_1,Unnamed: 3_level_1
choi,choi,8,500
kim,KIM,2,200
kim,Kim,3,300
kim,kim,1,100
lee,LEE,5,200
lee,Lee,6,100
lee,lee,4,100
others,cho,7,300


## 7. Fomatting

In [84]:
df=pd.DataFrame({'id':[1,2,10,20,100,200],
             'name':['aa','aa2','aa3','aa4','aa5','aa6']})
df

Unnamed: 0,id,name
0,1,aa
1,2,aa2
2,10,aa3
3,20,aa4
4,100,aa5
5,200,aa6


In [207]:
#df에 id2열을 추가
#id2열은 id열 값을 5자리로 만들었을때, 앞에 빈자리를 0으로 채움
#ex) id가 1인 경우 -> 00001
#ex) id가 200인 경우 -> 00200

df['id2']=df['id'].apply(lambda x: "{:0<5d}".format(x))
#df['id2'] = df['id'].astype(str).apply(lambda x: x.zfill(5))

Unnamed: 0,id,name,id2
0,1,aa,100
1,2,aa2,200
2,10,aa3,1000
3,20,aa4,2000
4,100,aa5,1000
5,200,aa6,2000
