In [10]:
# Scaling

import pandas as pd
from sklearn.preprocessing import scale, minmax_scale

x = pd.DataFrame({'col' : [-3, -1, 1, 3, 5, 7, 9]})

# 평균 0, 분산 1을 이용하여 정규화
# astype(float)은 scale의 입력이 float이므로 warning 방지를 위해 변환
x['scale'] = scale(x.col.astype(float)) # Z-score normalization
x['minmax_scale'] = minmax_scale(x.col.astype(float)) # Min_max normalization
print(x)

x.describe()

   col  scale  minmax_scale
0   -3   -1.5      0.000000
1   -1   -1.0      0.166667
2    1   -0.5      0.333333
3    3    0.0      0.500000
4    5    0.5      0.666667
5    7    1.0      0.833333
6    9    1.5      1.000000


Unnamed: 0,col,scale,minmax_scale
count,7.0,7.0,7.0
mean,3.0,0.0,0.5
std,4.320494,1.080123,0.360041
min,-3.0,-1.5,0.0
25%,0.0,-0.75,0.25
50%,3.0,0.0,0.5
75%,6.0,0.75,0.75
max,9.0,1.5,1.0


In [11]:
# MinMaxScaler

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

dfTest = pd.DataFrame({'A' : [14.00, 90.20, 90.95, 96.27, 91.21],
                      'B' : [103.02, 107.26, 110.35, 114.23, 114.68],
                      'C' : ['big', 'small', 'big', 'small', 'small']})

dfTest[['A','B']] = scaler.fit_transform(dfTest[['A','B']])
print(dfTest)

          A         B      C
0  0.000000  0.000000    big
1  0.926219  0.363636  small
2  0.935335  0.628645    big
3  1.000000  0.961407  small
4  0.938495  1.000000  small


In [12]:
# Transform

from sklearn import preprocessing
le = preprocessing.LabelEncoder() # LabelEncoder 객체 생성

le.fit(['paris', 'paris', 'tokyo', 'amsterdam'])
print(le.classes_)
print(type(le.classes_), "\n")

data = le.transform(['paris', 'paris', 'tokyo', 'amsterdam'])
print(data)
print(type(data), "\n")

# 위 method를 한 번에 처리
data2 = le.fit_transform(['paris', 'paris', 'tokyo', 'amsterdam'])
print(data2)
print(type(data2), "\n")

# fit_transform <-> inverse_transform
original = le.inverse_transform([2, 2, 1])
print(original)
print(type(data))

['amsterdam' 'paris' 'tokyo']
<class 'numpy.ndarray'> 

[1 1 2 0]
<class 'numpy.ndarray'> 

[1 1 2 0]
<class 'numpy.ndarray'> 

['tokyo' 'tokyo' 'paris']
<class 'numpy.ndarray'>


In [13]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

str = []
for i in range(ord('a'), ord('z')+1): # ord('a'): 'a'의 ascii code
    str.append(chr(i)) # chr('i'): ascii code i에 해당하는 문자
print(str)

le.fit(str)
data = le.transform(['q', 'a', 'z'])
print(data)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
[16  0 25]


In [14]:
# DataFrame Transform

import pandas as pd
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

df = pd.DataFrame({'A' : ['a', 'b', 'b', 'c', 'a'],
                  'B' : ['x', 'y', 'x', 'y', 'x']})

# df.apply는 DataFram에서 인자로 주어진 함수를 각 Column에 적용하는 함수
data = df.apply(le.fit_transform)
print(data)
print(type(data), "\n")

   A  B
0  0  0
1  1  1
2  1  0
3  2  1
4  0  0
<class 'pandas.core.frame.DataFrame'> 



In [1]:
# One-hot encoding

import pandas as pd

df = pd.DataFrame({'country' : ['russia', 'germany', 'australia', 'korea', 'germany']})
a = pd.get_dummies(df, prefix = ['country'])
print(a)

b = pd.get_dummies(df)
print(b)

   country_australia  country_germany  country_korea  country_russia
0                  0                0              0               1
1                  0                1              0               0
2                  1                0              0               0
3                  0                0              1               0
4                  0                1              0               0
<class 'pandas.core.frame.DataFrame'>
   country_australia  country_germany  country_korea  country_russia
0                  0                0              0               1
1                  0                1              0               0
2                  1                0              0               0
3                  0                0              1               0
4                  0                1              0               0


In [16]:
df = pd.DataFrame({'A' : ['a', 'b', 'a'],
                   'B' : ['b', 'a', 'c']})

one_hot = pd.get_dummies(df['B'])
df = df.drop('B', axis = 1) # axis = 0은 index(row), axis = 1은 column
df = df.join(one_hot)
print(df)

   A  a  b  c
0  a  0  1  0
1  b  1  0  0
2  a  0  0  1


In [17]:
df = pd.DataFrame({'A' : ['a', 'b', 'b', 'c', 'a'],
                  'B' : ['x', 'y', 'x', 'y', 'x']})
a = pd.get_dummies(df, prefix = ['A', 'B'])
print(a)

   A_a  A_b  A_c  B_x  B_y
0    1    0    0    1    0
1    0    1    0    0    1
2    0    1    0    1    0
3    0    0    1    0    1
4    1    0    0    1    0


In [18]:
df = pd.DataFrame({'A' : ['a', 'b', 'b', 'c', 'a'],
                  'B' : [3, 4, 7, 2, 5]})
a = pd.get_dummies(df) # prefix = ['A']가 기본값
print(a)

   B  A_a  A_b  A_c
0  3    1    0    0
1  4    0    1    0
2  7    0    1    0
3  2    0    0    1
4  5    1    0    0


In [19]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

x1 = pd.DataFrame({'country' : ['russia', 'germany', 'australia', 'korea', 'germany']})

# DataFrame 전체를 라벨인코딩(숫자로 변환)한 후, one-hot encoding을 해야함
le = LabelEncoder()
x2 = x1.apply(le.fit_transform)
print(x2)
print(type(x2))

encoder = OneHotEncoder()
x2 = encoder.fit_transform(x2) # sparse matrix로 변환됨
print(x2)
print(type(x2))

x3 = x2.toarray() # numpy array로 변환, 추후에 DataFrame으로 변환
print(x3)
print(type(x3))

   country
0        3
1        1
2        0
3        2
4        1
<class 'pandas.core.frame.DataFrame'>
  (0, 3)	1.0
  (1, 1)	1.0
  (2, 0)	1.0
  (3, 2)	1.0
  (4, 1)	1.0
<class 'scipy.sparse.csr.csr_matrix'>
[[0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]]
<class 'numpy.ndarray'>


In [20]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

x1 = pd.DataFrame({'country' : ['russia', 'germany', 'australia', 'korea', 'germany']})

encoder = OneHotEncoder(categories = 'auto') # LabelEncoder 사용 필요x
x2 = encoder.fit_transform(x1)
print(x2)
print(type(x2))

  (0, 3)	1.0
  (1, 1)	1.0
  (2, 0)	1.0
  (3, 2)	1.0
  (4, 1)	1.0
<class 'scipy.sparse.csr.csr_matrix'>


In [25]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame({'A' : ['a', 'b', 'b', 'c', 'a'],
                  'B' : ['x', 'y', 'x', 'y', 'x']})

encoder = OneHotEncoder(categories = 'auto')
x2 = encoder.fit_transform(x1)
print(x2)
print(type(x2))

x3 = x2.toarray()
print(x3)
print(type(x3))

x4 = pd.DataFrame(x3)
print(x4)
print(type(x4))

  (0, 3)	1.0
  (1, 1)	1.0
  (2, 0)	1.0
  (3, 2)	1.0
  (4, 1)	1.0
<class 'scipy.sparse.csr.csr_matrix'>
[[0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]]
<class 'numpy.ndarray'>
     0    1    2    3
0  0.0  0.0  0.0  1.0
1  0.0  1.0  0.0  0.0
2  1.0  0.0  0.0  0.0
3  0.0  0.0  1.0  0.0
4  0.0  1.0  0.0  0.0
<class 'pandas.core.frame.DataFrame'>


In [22]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

x = pd.DataFrame({'A' : ['a', 'b', 'b', 'c', 'a'],
                 'B' : [3, 4, 5, 1, 7]})

ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(), [0])], remainder = 'passthrough'
) # ColumnTransformer 사용 권장

x = ct.fit_transform(x)
print(x)
print(type(x))
print()

[[1. 0. 0. 3.]
 [0. 1. 0. 4.]
 [0. 1. 0. 5.]
 [0. 0. 1. 1.]
 [1. 0. 0. 7.]]
<class 'numpy.ndarray'>



In [23]:
import pandas as pd

dataset = pd.read_csv('G:/내 드라이브/3-1/인공지능/py/pima-indians-diabetes.csv', header = None)
print(dataset[0:10], "\n")
print((dataset==0).sum())

    0    1   2   3    4     5      6   7  8
0   6  148  72  35    0  33.6  0.627  50  1
1   1   85  66  29    0  26.6  0.351  31  0
2   8  183  64   0    0  23.3  0.672  32  1
3   1   89  66  23   94  28.1  0.167  21  0
4   0  137  40  35  168  43.1  2.288  33  1
5   5  116  74   0    0  25.6  0.201  30  0
6   3   78  50  32   88  31.0  0.248  26  1
7  10  115   0   0    0  35.3  0.134  29  0
8   2  197  70  45  543  30.5  0.158  53  1
9   8  125  96   0    0   0.0  0.232  54  1 

0    111
1      5
2     35
3    227
4    374
5     11
6      0
7      0
8    500
dtype: int64


In [4]:
import pandas as pd
import numpy as np

dataset = pd.read_csv('G:/내 드라이브/3-1/인공지능/py/pima-indians-diabetes.csv', header = None)
dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, np.NaN)

print(dataset.head(10))

    0      1     2     3      4     5      6   7  8
0   6  148.0  72.0  35.0    NaN  33.6  0.627  50  1
1   1   85.0  66.0  29.0    NaN  26.6  0.351  31  0
2   8  183.0  64.0   NaN    NaN  23.3  0.672  32  1
3   1   89.0  66.0  23.0   94.0  28.1  0.167  21  0
4   0  137.0  40.0  35.0  168.0  43.1  2.288  33  1
5   5  116.0  74.0   NaN    NaN  25.6  0.201  30  0
6   3   78.0  50.0  32.0   88.0  31.0  0.248  26  1
7  10  115.0   NaN   NaN    NaN  35.3  0.134  29  0
8   2  197.0  70.0  45.0  543.0  30.5  0.158  53  1
9   8  125.0  96.0   NaN    NaN   NaN  0.232  54  1


In [5]:
import pandas as pd
import numpy as np

dataset = pd.read_csv('G:/내 드라이브/3-1/인공지능/py/pima-indians-diabetes.csv', header = None)
dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, np.NaN)

print(dataset.shape)

dataset.dropna(inplace = True) # inplace = True는 original dataset을 수정o

print(dataset.shape)

(768, 9)
(392, 9)


In [6]:
import pandas as pd
import numpy as np

dataset = pd.read_csv('G:/내 드라이브/3-1/인공지능/py/pima-indians-diabetes.csv', header = None)
dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, np.NaN)

print(dataset.mean())

dataset.fillna(dataset.mean(), inplace = True)


print(dataset.isnull().sum())
print(dataset.head(10))

0      3.845052
1    121.686763
2     72.405184
3     29.153420
4    155.548223
5     32.457464
6      0.471876
7     33.240885
8      0.348958
dtype: float64
0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
dtype: int64
    0      1          2         3           4          5      6   7  8
0   6  148.0  72.000000  35.00000  155.548223  33.600000  0.627  50  1
1   1   85.0  66.000000  29.00000  155.548223  26.600000  0.351  31  0
2   8  183.0  64.000000  29.15342  155.548223  23.300000  0.672  32  1
3   1   89.0  66.000000  23.00000   94.000000  28.100000  0.167  21  0
4   0  137.0  40.000000  35.00000  168.000000  43.100000  2.288  33  1
5   5  116.0  74.000000  29.15342  155.548223  25.600000  0.201  30  0
6   3   78.0  50.000000  32.00000   88.000000  31.000000  0.248  26  1
7  10  115.0  72.405184  29.15342  155.548223  35.300000  0.134  29  0
8   2  197.0  70.000000  45.00000  543.000000  30.500000  0.158  53  1
9   8  125.0  96.000000  29.15342  155.548223  32.45746