# 데이터 정제 및 준비

### 7.1 누락된 데이터 처리하기

pandas의 설계 목표 중 하나는 누락된 데이터를 가능한 한 쉽게 처리할 수 있도록 하는 것이며, pandas 객체의 모든 기술 통계는 기본적으로 누락된 데이터를 배제하고 계산된다.

In [3]:
import pandas as pd

In [8]:
import numpy as np

In [9]:
# Sample Series of strings with one missing entry (NaN) at position 2.
string_data = pd.Series(
    ['aardvark', 'artichoke', np.nan, 'avocado'],
)

In [10]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [11]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [12]:
# Python's built-in None is also treated as a missing value
# (index 0 shows True in the isnull() result that follows).
string_data[0]=None

In [13]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [14]:
from numpy import nan as NA

In [16]:
data = pd.Series([1,NA,3.5,NA,7])

In [17]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [18]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [20]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [22]:
# 4x3 frame: one complete row, one partially missing row, one all-NA row,
# and one row missing only its first value.
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])

In [23]:
cleaned=data.dropna()

In [24]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [25]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [26]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [27]:
data[4]=NA

In [28]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [29]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [71]:
df=pd.DataFrame(np.random.randn(7,3))

In [72]:
df.iloc[:4,1]=NA

In [73]:
df.iloc[:2,2]=NA

In [74]:
df

Unnamed: 0,0,1,2
0,-0.756792,,
1,-0.058348,,
2,-0.46191,,-1.444658
3,0.63392,,0.681099
4,0.570577,-0.989565,1.854622
5,-1.202152,-0.342133,-0.029291
6,0.998303,0.529934,0.838557


In [58]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.550946,0.597623,-1.052153
5,-0.433382,-1.804558,-0.663917
6,-1.307812,0.773147,-0.370455


In [68]:
# thresh=N keeps only rows that have at least N non-NA values. With thresh=0
# no row can fail that test, so every row is returned (see the output below).
df.dropna(thresh=0)

Unnamed: 0,0,1,2
0,0.79123,,
1,0.40344,,
2,-0.822997,,1.435364
3,-1.678237,,-1.322897
4,-0.550946,0.597623,-1.052153
5,-0.433382,-1.804558,-0.663917
6,-1.307812,0.773147,-0.370455


### 결측치 채우기

In [36]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.404576,0.0,0.0
1,0.589897,0.0,0.0
2,-0.572062,0.0,-1.1636
3,-0.873397,0.0,-0.250119
4,2.494087,-0.016279,-0.949458
5,1.130329,0.169844,0.677362
6,-0.377328,0.090287,-0.818282


In [37]:
df.fillna({1:0.5,2:0})

Unnamed: 0,0,1,2
0,-0.404576,0.5,0.0
1,0.589897,0.5,0.0
2,-0.572062,0.5,-1.1636
3,-0.873397,0.5,-0.250119
4,2.494087,-0.016279,-0.949458
5,1.130329,0.169844,0.677362
6,-0.377328,0.090287,-0.818282


In [77]:
df.fillna(0,inplace=True)

In [79]:
# Non-in-place form: fillna returns a new object, which is rebound to df.
# NOTE(review): df was already filled in place by the previous cell, so this
# call leaves the values unchanged.
df = df.fillna(0,inplace=False)

In [78]:
df

Unnamed: 0,0,1,2
0,-0.756792,0.0,0.0
1,-0.058348,0.0,0.0
2,-0.46191,0.0,-1.444658
3,0.63392,0.0,0.681099
4,0.570577,-0.989565,1.854622
5,-1.202152,-0.342133,-0.029291
6,0.998303,0.529934,0.838557


In [82]:
df=pd.DataFrame(np.random.randn(6,3))

In [83]:
df.iloc[2:,1]=NA

In [84]:
df.iloc[4:,2]=NA

In [85]:
df

Unnamed: 0,0,1,2
0,-0.082296,0.650948,0.803439
1,-0.257616,1.382691,0.342189
2,-0.279963,,0.71664
3,0.02315,,-0.103432
4,-1.30919,,
5,-1.135282,,


In [86]:
# Forward-fill: propagate the last valid value down each column.
# DataFrame.ffill() replaces fillna(method='ffill'); the `method` argument of
# fillna is deprecated in pandas 2.1+.
df.ffill()

Unnamed: 0,0,1,2
0,-0.082296,0.650948,0.803439
1,-0.257616,1.382691,0.342189
2,-0.279963,1.382691,0.71664
3,0.02315,1.382691,-0.103432
4,-1.30919,1.382691,-0.103432
5,-1.135282,1.382691,-0.103432


In [87]:
# Forward-fill, but fill at most 2 consecutive NA values per gap.
# DataFrame.ffill(limit=...) replaces fillna(method='ffill', limit=...); the
# `method` argument of fillna is deprecated in pandas 2.1+.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-0.082296,0.650948,0.803439
1,-0.257616,1.382691,0.342189
2,-0.279963,1.382691,0.71664
3,0.02315,1.382691,-0.103432
4,-1.30919,,-0.103432
5,-1.135282,,-0.103432


In [90]:
data=pd.Series([1.,NA,3.5,NA,7])

In [91]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

### 7.2 데이터 변형

In [93]:
# Two key columns; the last row repeats the (k1, k2) pair of the row before it.
data = pd.DataFrame({
    'k1': ['one', 'two'] * 3 + ['two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})

In [97]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [95]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [96]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [99]:
data['v1']=range(7)

In [100]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [104]:
# Deduplicate on the (k1, k2) pair. keep='last' retains the last occurrence of
# each duplicate group, so index 6 is kept instead of index 5 (see the output below).
data.drop_duplicates(['k1','k2'],keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [105]:
# Food items (with inconsistent capitalization) and their weights in ounces.
data = pd.DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
        'Bacon', 'pastrami', 'honey ham', 'nova lox',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})

In [106]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [107]:
# Lower-case meat name -> animal it comes from.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

In [108]:
lowercased = data['food'].str.lower()

In [109]:
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [110]:
data['animal']=lowercased.map(meat_to_animal)

In [111]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [114]:
# Lower-case each food name and look it up in meat_to_animal in one step by
# passing a function to map().
# Note: the dict lookup raises KeyError for a food that is not a key of meat_to_animal.
data['food'].map(lambda x:meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [116]:
data=pd.Series([1.,-999.,2.,-999.,-1000.,3.])

In [117]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [118]:
data.replace(-999,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [119]:
data.replace([-999,-1000],np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [120]:
data.replace([-999,-1000],[np.nan,0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [122]:
data.replace({-999:np.nan,-1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [123]:
data=pd.DataFrame(np.random.randn(1000,4))

In [136]:
data

Unnamed: 0,0,1,2,3
0,0.342376,-1.192437,2.455470,-0.296499
1,-1.909509,-0.411592,-0.390481,0.248747
2,-0.653984,1.283866,-0.401360,1.055154
3,1.535897,-0.172516,0.136223,-0.115049
4,1.957143,0.103232,0.786178,-0.443535
...,...,...,...,...
995,1.112046,0.264724,0.002540,-1.157099
996,1.091094,1.154429,-0.486078,0.157418
997,-0.389635,-0.550804,-1.220365,3.495158
998,-0.810757,-0.119985,-0.010078,1.893086


In [138]:
data.loc[18,:]

0   -0.048394
1    0.469133
2   -3.393726
3    0.640260
Name: 18, dtype: float64

In [124]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.009852,0.007158,-0.096381,-0.032679
std,1.055194,1.021648,1.004302,0.987093
min,-3.396881,-3.281761,-4.031914,-3.201267
25%,-0.702627,-0.72557,-0.765071,-0.691157
50%,0.043241,0.028202,-0.112754,-0.063397
75%,0.774041,0.680751,0.560329,0.607738
max,3.581586,3.347656,3.208617,3.546501


In [125]:
col=data[2]

In [126]:
col[np.abs(col)>3]

18    -3.393726
201   -4.031914
232    3.208617
987   -3.159187
Name: 2, dtype: float64

In [151]:
# Select the rows in which at least one column has an absolute value above 3.
# axis is passed by keyword: positional arguments to any() were deprecated in
# pandas 1.5 and are keyword-only from pandas 2.0, where .any(1) raises TypeError.
data[(np.abs(data)>3).any(axis=1)]

Unnamed: 0,0,1,2,3
18,-0.048394,0.469133,-3.393726,0.64026
164,3.107245,0.726557,0.662063,0.778161
183,-3.201858,-0.583457,0.073176,-0.755553
199,0.361298,-0.86909,-0.092324,-3.201267
201,-0.625714,0.408232,-4.031914,1.512441
232,1.264031,0.04533,3.208617,-0.155564
247,3.581586,-1.146069,-0.105243,0.29786
280,1.300629,-3.281761,0.064064,-0.330092
535,-3.396881,-1.35863,-1.239757,1.536453
616,1.258431,3.347656,2.826049,-0.122612


In [152]:
# Cap values outside [-3, 3]: np.sign(data) is -1 or +1 for nonzero values, so
# sign * 3 replaces each out-of-range entry with -3 or +3 (min/max in the
# describe() output that follows become -3.0 and 3.0).
data[np.abs(data)>3]=np.sign(data)*3

In [153]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.009818,0.007093,-0.095005,-0.033814
std,1.0511,1.019698,0.998411,0.982026
min,-3.0,-3.0,-3.0,-3.0
25%,-0.702627,-0.72557,-0.765071,-0.691157
50%,0.043241,0.028202,-0.112754,-0.063397
75%,0.774041,0.680751,0.560329,0.607738
max,3.0,3.0,3.0,3.0


In [154]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,-1.0,1.0,-1.0
1,-1.0,-1.0,-1.0,1.0
2,-1.0,1.0,-1.0,1.0
3,1.0,-1.0,1.0,-1.0
4,1.0,1.0,1.0,-1.0
