In [3]:
import numpy as np
import pandas as pd
from io import StringIO  #문자열을 파일처럼 사용할 수 이 있도록 지원

In [4]:
# Small in-memory CSV used as the running example: row 1 has no value for
# column C and row 2 has no value for column D.
csv_data = """A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
9.0,10.0,11.0,
"""
# StringIO wraps the string so read_csv can consume it like an open file.
with StringIO(csv_data) as csv_buffer:
    df = pd.read_csv(csv_buffer)
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,9.0,10.0,11.0,


In [5]:
 # Print the frame summary: index range, per-column non-null counts and dtypes.
 # Columns C and D each report one value fewer than the 3 rows, i.e. one NaN each.
 df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       3 non-null      float64
 1   B       3 non-null      float64
 2   C       2 non-null      float64
 3   D       2 non-null      float64
dtypes: float64(4)
memory usage: 224.0 bytes


In [6]:
# Boolean mask, True where a cell is missing (isna() == isnull()).
missing_mask = df.isna()
# Complementary mask, True where a cell holds a value (notna() == notnull()).
present_mask = df.notna()
# True counts as 1 and False as 0, so summing the mask counts the NaNs.
# The default axis=0 adds down the rows: one count per column.
missing_mask.sum()
# axis=1 adds across the columns: one count per row. Being the last
# expression, this is the value the cell displays.
missing_mask.sum(axis=1)

0    0
1    1
2    1
dtype: int64

In [7]:
# Removing NaN. None of these calls modifies df; each returns a new frame and
# only the last one is displayed.

# axis=0 (rows) drops rows, axis=1 (columns) drops columns that contain NaN.
df.dropna(axis="columns")
# how='all' drops a row only when every value is NaN;
# how='any' drops it as soon as a single value is NaN.
df.dropna(how="all")
# thresh is the minimum number of non-NaN values a row needs in order to be kept.
df.dropna(thresh=4)
# subset restricts the NaN check to column C; rows with NaN there are removed.
df.dropna(subset=["C"])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,9.0,10.0,11.0,


In [8]:
# print(df) # print the whole DataFrame
# print(df.mean(axis=0)) # mean of each column
# print(df['C'].isnull()) # locate the NaN positions in column C
# df.loc[df['C'].isnull(),"C"] = 7 # fill the NaN in column C with the column mean (7)
# df

In [9]:
# !pip install scikit-learn

In [10]:
# Replace missing values with a substitute value
from sklearn.impute import SimpleImputer

# Alternative strategies:
# simr = SimpleImputer(strategy="mean")
# simr = SimpleImputer(strategy="median")
simr = SimpleImputer(strategy="constant", fill_value=100)
# fit_transform first fits the imputer on df and then transforms the same
# data, in a single call.
imputed_data = simr.fit_transform(df)

# Wrap the transformed values in a DataFrame that carries df's column names
# and index again.
imputed_df = pd.DataFrame(data=imputed_data, index=df.index, columns=df.columns)
imputed_df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,100.0,8.0
2,9.0,10.0,11.0,100.0


In [11]:
# Toy frame with categorical columns: a nominal feature (color), a size
# feature, a numeric price and the class label.
df2 = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
        ['purple', 'L', 14.7, 'class3'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1
3,purple,L,14.7,class3


In [12]:
# Distinct class labels together with how many times each one occurs.
labels, counts = np.unique(df2["classlabel"], return_counts=True)
labels, counts

(array(['class1', 'class2', 'class3'], dtype=object),
 array([2, 1, 1], dtype=int64))

In [13]:
# Hand-written equivalent:
# class_mapping = {'class1': 0, 'class2': 1, 'class3': 2}
# np.unique returns the distinct labels in sorted order; pair each one with
# its position to build the label -> integer lookup.
sorted_labels = np.unique(df2['classlabel'])
class_mapping = dict(zip(sorted_labels, range(len(sorted_labels))))
class_mapping

{'class1': 0, 'class2': 1, 'class3': 2}

In [14]:
# Encode the string class labels as the integers defined in class_mapping.
# Series.map with a dict turns every value that is not a key into NaN, so
# running this cell a second time (when the column already holds integers)
# would wipe the column. Falling back to the value itself leaves already
# encoded entries untouched and makes the cell safe to re-run.
df2['classlabel'] = df2['classlabel'].map(
    lambda label: class_mapping.get(label, label)
)
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,0
1,red,L,13.5,1
2,blue,XL,15.3,0
3,purple,L,14.7,2


In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
le = LabelEncoder()
# The two-step form of the same operation:
# le.fit(df2['classlabel'])        # learn the label -> integer table
# le.transform(df2['classlabel'])  # apply the learned table to the data

# fit_transform does both steps (fit + transform) in one call.
encoded_labels = le.fit_transform(df2['classlabel'])
df2['classlabel'] = encoded_labels
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,0
1,red,L,13.5,1
2,blue,XL,15.3,0
3,purple,L,14.7,2


In [17]:
# Translate the integer codes back through the encoder fitted above.
# NOTE(review): the column had already been converted to integers by the
# earlier .map(class_mapping) cell when `le` was fitted, so the encoder learned
# integer classes and the result shown below is still 0, 1, 0, 2 rather than
# 'class1', 'class2', ... - fit the encoder on the original strings to recover them.
decoded_labels = le.inverse_transform(df2['classlabel'])
df2['classlabel'] = decoded_labels
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,0
1,red,L,13.5,1
2,blue,XL,15.3,0
3,purple,L,14.7,2


In [18]:
#  범주형 데이터 전처리
from sklearn.preprocessing import OneHotEncoder

In [22]:
ohe = OneHotEncoder()
# Double brackets select a one-column frame, so .values is a 2-D array with
# one row per sample.
color_values = df2[['color']].values
# fit() learns the categories and returns the encoder, which then encodes the
# very same values.
result = ohe.fit(color_values).transform(color_values)
print(ohe.categories_)
# print(result)
# Convert the encoded result to a dense array for display: one column per
# category, in the order printed above.
result.toarray()

[array(['blue', 'green', 'purple', 'red'], dtype=object)]


array([[0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.]])

In [20]:
# One-hot encode the color column with pandas: one indicator column per color.
# dtype=int pins the indicators to 0/1 as shown in the output; without it,
# pandas 2.0 and later return True/False because the default dtype became bool.
pd.get_dummies(df2['color'], dtype=int)

Unnamed: 0,blue,green,purple,red
0,0,1,0,0
1,0,0,0,1
2,1,0,0,0
3,0,0,1,0


In [25]:
# The wine file has no header row (header=None), so the 14 column names are
# supplied here: the class label first, then the 13 measurements.
wine_columns = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]
df_wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
    names=wine_columns,
)

In [28]:
# Print the schema summary: number of rows, per-column non-null counts and dtypes.
df_wine.info()
# Last expression of the cell, so the first rows are rendered as a table below.
df_wine.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Class label                   178 non-null    int64  
 1   Alcohol                       178 non-null    float64
 2   Malic acid                    178 non-null    float64
 3   Ash                           178 non-null    float64
 4   Alcalinity of ash             178 non-null    float64
 5   Magnesium                     178 non-null    int64  
 6   Total phenols                 178 non-null    float64
 7   Flavanoids                    178 non-null    float64
 8   Nonflavanoid phenols          178 non-null    float64
 9   Proanthocyanins               178 non-null    float64
 10  Color intensity               178 non-null    float64
 11  Hue                           178 non-null    float64
 12  OD280/OD315 of diluted wines  178 non-null    float64
 13  Proli

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [30]:
# Distinct values of the target column.
class_labels = np.unique(df_wine['Class label'])
print('Class labels', class_labels)

Class labels [1 2 3]


In [36]:
# (rows, columns) of the samples belonging to one class; swap in the other
# labels to inspect them:
# df_wine[df_wine["Class label"]==1].shape
# df_wine[df_wine["Class label"]==2].shape
is_class3 = df_wine["Class label"] == 3
df_wine[is_class3].shape

(48, 14)

In [45]:
# Split into training data and test data

from sklearn.model_selection import train_test_split

y = df_wine.iloc[:, 0]    # target: the class label stored in the first column
X = df_wine.iloc[:, 1:]   # features: all remaining columns

# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
# stratify=y keeps the class proportions the same in both parts. The classes
# are not equally sized (48 of the 178 rows are class 3), so an unstratified
# split can leave a class under-represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [50]:
# Sanity check of the split: shapes of the four parts next to the shape of the
# full frame (the train and test row counts should add up to its row count).
X_train.shape, X_test.shape, y_train.shape, y_test.shape, df_wine.shape

((124, 13), (54, 13), (124,), (54,), (178, 14))

In [56]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Feature columns only (everything after the class label).
X = df_wine.iloc[:, 1:]
# Alternative scaler:
# scaler = MinMaxScaler()
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on the complete data set here, which is
# fine for showing what scaling does; when evaluating a model, fit it on
# X_train only and reuse it to transform X_test.
scaler.fit(X)
X2 = scaler.transform(X)
# Unscaled values of the first five rows, for comparison with the scaled ones.
X.head(5)

Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [57]:
# Scaled values with the original column names attached; show the first five rows.
pd.DataFrame(X2, columns=X.columns).head(5)

Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1.518613,-0.56225,0.232053,-1.169593,1.913905,0.808997,1.034819,-0.659563,1.224884,0.251717,0.362177,1.84792,1.013009
1,0.24629,-0.499413,-0.827996,-2.490847,0.018145,0.568648,0.733629,-0.820719,-0.544721,-0.293321,0.406051,1.113449,0.965242
2,0.196879,0.021231,1.109334,-0.268738,0.088358,0.808997,1.215533,-0.498407,2.135968,0.26902,0.318304,0.788587,1.395148
3,1.69155,-0.346811,0.487926,-0.809251,0.930918,2.491446,1.466525,-0.981875,1.032155,1.186068,-0.427544,1.184071,2.334574
4,0.2957,0.227694,1.840403,0.451946,1.281985,0.808997,0.663351,0.226796,0.401404,-0.319276,0.362177,0.449601,-0.037874


In [61]:
# Check the scaling on column 0: mean and standard deviation over all rows.
first_column = X2[:, 0]
column_mean = first_column.mean()   # mean of column 0
column_std = first_column.std()     # standard deviation of column 0
column_mean, column_std

(-8.382807556720283e-16, 1.0)

In [None]:
# 1. NaN -> drop (rows or columns) or impute (mean, median)
# 2. Categorical data -> LabelEncoding (ordered), OneHotEncoding (unordered -> nominal)
# 3. Scaling -> MinMaxScaler, StandardScaler
# 4. Drop existing columns or add new columns
# Work through steps 1-3 in order, then split into train data and test data and look for patterns.
#     (Test / Train -> split)