In [100]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

In [101]:
df = pd.DataFrame(np.random.randn(5,3),
                 columns = ['c1','c2','c3'])

In [102]:
df

Unnamed: 0,c1,c2,c3
0,1.955228,-0.043338,-1.786994
1,-1.953754,0.089222,2.140163
2,-1.257043,-0.2387,-0.505911
3,-1.314095,0.876532,-0.49591
4,0.354478,-1.121267,0.606198


In [103]:
df.ix[0,0] = np.nan

In [104]:
df.ix[1,['c1','c3']] = np.nan

In [105]:
df.ix[2,['c2']] = None

In [106]:
df.fillna(0)

Unnamed: 0,c1,c2,c3
0,0.0,-0.043338,-1.786994
1,0.0,0.089222,0.0
2,-1.257043,0.0,-0.505911
3,-1.314095,0.876532,-0.49591
4,0.354478,-1.121267,0.606198


In [107]:
df.fillna(method='ffill')

Unnamed: 0,c1,c2,c3
0,,-0.043338,-1.786994
1,,0.089222,-1.786994
2,-1.257043,0.089222,-0.505911
3,-1.314095,0.876532,-0.49591
4,0.354478,-1.121267,0.606198


In [108]:
df.fillna(df.mean())

Unnamed: 0,c1,c2,c3
0,-0.738887,-0.043338,-1.786994
1,-0.738887,0.089222,-0.545654
2,-1.257043,-0.049712,-0.505911
3,-1.314095,0.876532,-0.49591
4,0.354478,-1.121267,0.606198


In [109]:
df.where(pd.notnull(df),df.mean(), axis='columns')

Unnamed: 0,c1,c2,c3
0,-0.738887,-0.043338,-1.786994
1,-0.738887,0.089222,-0.545654
2,-1.257043,-0.049712,-0.505911
3,-1.314095,0.876532,-0.49591
4,0.354478,-1.121267,0.606198


In [110]:
arr = np.array([1,2,3,4,5,10,11,15,20,30,0.1,2.5,5])

In [111]:
np.min(arr)

0.1

In [112]:
np.argmin(arr) # 최소 값의 위치가 출력됨
np.max(arr)

30.0

In [113]:
np.argmax(arr)

9

In [114]:
np.where(arr>=10,0,arr)

array([1. , 2. , 3. , 4. , 5. , 0. , 0. , 0. , 0. , 0. , 0.1, 2.5, 5. ])

In [115]:
df.fillna(df.mean()['c1':'c2'])

Unnamed: 0,c1,c2,c3
0,-0.738887,-0.043338,-1.786994
1,-0.738887,0.089222,
2,-1.257043,-0.049712,-0.505911
3,-1.314095,0.876532,-0.49591
4,0.354478,-1.121267,0.606198


In [123]:
df2 = pd.DataFrame({'c1' : [1,2,3,4,5]
             , 'c2' : [6,7,8,9,10]})
df2.ix[[1,3],['c2']]=np.nan
df2['c2p'] = np.where(pd.notnull(df2['c2'])==True, df2['c2'], df2['c1'])

In [125]:
df2

Unnamed: 0,c1,c2,c2p
0,1,6.0,6.0
1,2,,2.0
2,3,8.0,8.0
3,4,,4.0
4,5,10.0,10.0


In [128]:
df = pd.DataFrame({'c1' : [1,2,3,4,5]
             , 'c2' : [6,7,8,9,10]})

df.ix[[1,3],['c2']]=np.nan


for i in df.index:
    if pd.notnull(df.ix[i,'c2']) == True:
        df.ix[i,'c2p'] = df.ix[i,'c2']
    else:
        df.ix[i,'c2p'] = df.ix[i,'c1']
df

Unnamed: 0,c1,c2,c2p
0,1,6.0,6.0
1,2,,2.0
2,3,8.0,8.0
3,4,,4.0
4,5,10.0,10.0


In [129]:
# dropna 메서드 : 열(axis = 1) / 행(axis = 0)

In [135]:
df = pd.DataFrame({'c1' : [1,2,3,4,5]
             , 'c2' : [6,7,8,9,10]})

df.ix[[1,3],['c2']]=np.nan
df

Unnamed: 0,c1,c2
0,1,6.0
1,2,
2,3,8.0
3,4,
4,5,10.0


In [138]:
df.dropna(axis=0)

Unnamed: 0,c1,c2
0,1,6.0
2,3,8.0
4,5,10.0


In [139]:
df.dropna(axis=1)

Unnamed: 0,c1
0,1
1,2
2,3
3,4
4,5


In [170]:
# 결측값 보간 interpolate
'''
시계열데이터에 대해 선형적으로 비례하는 값
이미지 보간 : 그라데이션과 비슷
'''

from datetime import datetime
#datetime 모듈에 있는 datetime 함수
dateStr = ['1/13/2020','1/16/2020',
           '1/17/2020','1/20/2020']
dates = pd.to_datetime(dateStr)
dates

DatetimeIndex(['2020-01-13', '2020-01-16', '2020-01-17', '2020-01-20'], dtype='datetime64[ns]', freq=None)

In [176]:
ts = pd.Series([1,np.nan, np.nan, 10],index=dates)

In [177]:
ts

2020-01-13     1.0
2020-01-16     NaN
2020-01-17     NaN
2020-01-20    10.0
dtype: float64

In [178]:
tslr = ts.interpolate()
tslr

2020-01-13     1.0
2020-01-16     4.0
2020-01-17     7.0
2020-01-20    10.0
dtype: float64

In [180]:
tsTime=ts.interpolate(method = 'time')
tsTime

2020-01-13     1.000000
2020-01-16     4.857143
2020-01-17     6.142857
2020-01-20    10.000000
dtype: float64

In [181]:
tsTime=ts.interpolate(method = 'time', limit = 1)
tsTime

2020-01-13     1.000000
2020-01-16     4.857143
2020-01-17          NaN
2020-01-20    10.000000
dtype: float64

ValueError: method must be one of ['linear', 'time', 'index', 'values', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', 'polynomial', 'krogh', 'piecewise_polynomial', 'pchip', 'akima', 'spline', 'from_derivatives']. Got 'avg' instead.

In [188]:
s = pd.Series([1,2,3,4,None])
s

0    1.0
1    2.0
2    3.0
3    4.0
4    NaN
dtype: float64

In [189]:
s.replace(3,9)

0    1.0
1    2.0
2    9.0
3    4.0
4    NaN
dtype: float64

In [192]:
s.replace(np.nan,5)

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
dtype: float64

In [194]:
s.replace([1,2,3],[6,7,8]) # 한번에 여러개를 replace 가능

0    6.0
1    7.0
2    8.0
3    4.0
4    NaN
dtype: float64

In [197]:
s.replace({1:5}) # dictionary 형태로 바꿀 자리와 값을 변경 가능

0    5.0
1    2.0
2    3.0
3    4.0
4    NaN
dtype: float64

In [200]:
df = pd.DataFrame({'c1' : ['aaa','bbb','ccc','d'],
                  'c2' : [1,2,3,4],
                  'c3' : [5,6,7,np.nan]})
df

Unnamed: 0,c1,c2,c3
0,aaa,1,5.0
1,bbb,2,6.0
2,ccc,3,7.0
3,d,4,


In [202]:
df.replace({'c1':'aaa'},{'c1':'bbbb'})

Unnamed: 0,c1,c2,c3
0,bbbb,1,5.0
1,bbb,2,6.0
2,ccc,3,7.0
3,d,4,


In [203]:
# 병합 : 중복 데이터 발생 가능
# 중복데이터 여부 : duplicated()
# 중복데이터 처리 : drop_duplicates() 한 개만 남기고 다 지움

In [221]:
df=pd.DataFrame({
    'k1' : ['b','b','b','c','c'],
    'k2' : ['x','y','y','x','z'],
    'col' : [10,20,30,40,50]
})
df

Unnamed: 0,k1,k2,col
0,b,x,10
1,b,y,20
2,b,y,30
3,c,x,40
4,c,z,50


In [222]:
df.duplicated(['k1','k2'])
df.duplicated(['k2'],keep='last')

0     True
1     True
2    False
3    False
4    False
dtype: bool

In [224]:
df.duplicated(['k1'],keep='first')

0    False
1     True
2     True
3    False
4     True
dtype: bool

In [228]:
df
df.drop_duplicates(['k1'],keep='last')

Unnamed: 0,k1,k2,col
2,b,y,30
4,c,z,50


In [237]:
# 유일한 값 : unique()
# 유일한 값 개수 세기 : value_counts()
df = pd.DataFrame({
    'a' : ['a1','a1','a2','a2','a3'],
    'b' : ['b1','b1','b2','b2', np.nan],
    'c' : [1,1,3,4,4],
})
df['a'].unique()
df['b'].unique()

array(['b1', 'b2', nan], dtype=object)

In [241]:
# normalize = 디폴트 false : 개수, True : 상대비율

df['a'].value_counts()
df['a'].value_counts(normalize=False)

a2    2
a1    2
a3    1
Name: a, dtype: int64

In [243]:
df['a'].value_counts(normalize=False,sort=True)

a2    2
a1    2
a3    1
Name: a, dtype: int64

In [250]:
df['c'].value_counts(bins=[0,1,2,3,4,5], sort=False)

(-0.001, 1.0]    2
(1.0, 2.0]       0
(2.0, 3.0]       1
(3.0, 4.0]       2
(4.0, 5.0]       0
Name: c, dtype: int64

In [252]:
res = pd.cut(df['c'],bins=[0,1,2,3,4,5])

In [253]:
pd.value_counts(res)

(3, 4]    2
(0, 1]    2
(2, 3]    1
(4, 5]    0
(1, 2]    0
Name: c, dtype: int64

In [254]:
# 변수들 간 척도가 다른 경우 데이터 표준화
# 모집단 : 전체 집단, 모집단에서 표본추출 -> 표본평균, 표본편차 
# -> 통계량 => 모평균, 모분산 추정
'''
모수적방법
중심극한정리 : 무작위로 복원추출하면, 연속형 자료의 평균에
대한 분포는 정규분포를 띈다면 - 30개 이상의 표본의 경우
정규분포를 따른다.

비모수적방법
-10개 미만의 표본의 경우에는 모수적 방법을 사용
-자료를 크기로 나열 -> 순위 매김 => 차이 비교

표준화 : (각데이터 - 평균) / 표준편차
모집단이 정규분포를 따르는 경우에,
평균:0, 표준편차 1인 표준정규분포로 표준화
1) numpy 2)scipy.state:zscore 3) sklearn.preprocessing
'''


'\n모수적방법\n중심극한정리 : 무작위로 복원추출하면, 연속형 자료의 평균에\n대한 분포는 정규분포를 띈다면 - 30개 이상의 표본의 경우\n정규분포를 따른다.\n\n비모수적방법\n-10개 미만의 표본의 경우에는 모수적 방법을 사용\n-자료를 크기로 나열 -> 순위 매김 => 차이 비교\n\n표준화 : (각데이터 - 평균) / 표준편차\n모집단이 정규분포를 따르는 경우에,\n평균:0, 표준편차 1인 표준정규분포로 표준화\n1) numpy 2)scipy.state:zscore 3) sklearn.preprocessing\n'

In [260]:
np.random.randint(30, size=(6,5))

array([[ 1,  2, 23, 26, 29],
       [27, 15, 18,  2, 16],
       [17,  6,  4, 26, 13],
       [25,  3, 27, 10, 12],
       [21, 17, 25,  3, 10],
       [12, 11, 21,  5, 25]])