In [1]:
# 결측치 ( Missing Value ) 
# : 데이터가 수집되지 않거나 누락되어 정보 (값) 가 존재하지 않음을 의미 
# => 결측치가 포함된 데이터는 모델 학습이 불가능하므로 사전에 반드시 결측치 처리를 진행

In [2]:
# 결측치 발생 원인 
# : 대부분 수집 및 관리 과정에서 결측치 발생

In [3]:
# 미수집 : 입력되지 않은 데이터가 빈 값 그대로 수집 및 저장됨

In [4]:
# 시스템 오류 : 오류에 의해 누락되어 수집 및 저장 
# 신규 항목 : 새롭게 수집 및 저장하는 항목이 추가됨

In [5]:
#결측치 처리 방안
# 제거하기 vs 대체하기 

In [6]:
# 제거하기 : 가장 쉬운 처리 방안 , 엄청난 데이터 손실 발생 
# 대체하기 : 최대한 많은 데이터 활용, 편향 (Bias) 발생 가능

In [7]:
# 결측치가 생기는 원인은 어떤 처리 방안을 적용할지 정하는 주요한 고려 사항

In [8]:
# 결측치 제거하기 
# Listwise deletion 
# 단 하나라도 결측치가 존재하는 행 자체를 삭제하는 방안

# Pairwise deletion 
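# 모든 변수가 결측치로만 존재하는 행 삭제 (이 실습에서는 dropna(how='all') 로 구현)
# 참고 : 일반적인 Pairwise deletion 은 분석에 사용하는 변수에 결측치가 있는 경우에만 해당 관측치를 제외하는 방식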

In [9]:
#Listwise를 고려할 때는 전체 데이터 수와 결측 데이터 수를 사전에 파악해서 적용할 필요가 있음

In [10]:
# 결측치 대체하기 
# 정보의 손실을 방지하나 변수 특성(평균, 상관관계 등) 에 영향 발생

In [11]:
# 일정 값 대체
# : 결측치를 각 변수의 평균값으로 대체 
# : 어떤 값으로 대체할지 선택하고 고려해야 함 

# 선형 값 대체 
# 선형 함수 기반 앞뒤 관측치 활용 대체 

In [12]:
import numpy as np
import pandas as pd

In [17]:
# Load the data and take a first look.
# header=None: the file has no header row, so pandas labels the columns 0..N-1.
wdbc_path = "./data/wdbc.data"
cancer = pd.read_csv(wdbc_path, header=None)
cancer

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [20]:
# The file was loaded without a header, so give the 32 columns descriptive
# names: an id, the diagnosis label, then ten measurements, each reported
# with a "_mean", "_se" and "_worst" suffix (in that order).
# Building the names from a single list keeps the three groups consistent and
# fixes the earlier "concave_poins_mean" misspelling ("concave_points_mean").
measurement_names = [
    "radius", "texture", "perimeter", "area", "smoothness",
    "compactness", "concavity", "concave_points", "symmetry",
    "fractal_dimension",
]
cancer.columns = ["id", "diagnosis"] + [
    f"{name}_{suffix}"
    for suffix in ("mean", "se", "worst")
    for name in measurement_names
]

# Use the id column as the index
cancer = cancer.set_index('id')
cancer

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_poins_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [23]:
# Build a small practice set from an independent copy of `cancer`:
# the first 30 records and five of the columns.
practice_columns = ['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean']
cancer_data = cancer.copy()[0:30][practice_columns]

# Inject missing values into 6 records.
# Each entry is (row position, column selector) exactly as handed to .iloc.
nan_targets = [
    (2, slice(None)),    # 3rd row: every column
    (5, 0),              # 6th row: 1st column
    (10, [3, 4]),        # 11th row: 4th and 5th columns
    (12, slice(2, 4)),   # 13th row: 3rd and 4th columns
    (15, [0, 3]),        # 16th row: 1st and 4th columns
    (24, 4),             # 25th row: 5th column
]
for row_pos, col_sel in nan_targets:
    cancer_data.iloc[row_pos, col_sel] = np.nan

cancer_data

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,,,,,
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0
843786,,12.45,15.7,82.57,477.1
844359,M,18.25,19.98,119.6,1040.0
84458202,M,13.71,20.83,90.2,577.9
844981,M,13.0,21.82,87.5,519.8
84501001,M,12.46,24.04,83.97,475.9


In [24]:
# Listwise deletion

# Overview of the data: per-column non-null counts and dtypes
cancer_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30 entries, 842302 to 853201
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   diagnosis       27 non-null     object 
 1   radius_mean     29 non-null     float64
 2   texture_mean    28 non-null     float64
 3   perimeter_mean  26 non-null     float64
 4   area_mean       27 non-null     float64
dtypes: float64(4), object(1)
memory usage: 1.4+ KB


In [27]:
# Perform listwise deletion on a copy of the practice data:
# 6 of the 30 records contain at least one missing value, and every such
# record is dropped.
cancer_copy = cancer_data.copy().dropna(how='any')

In [28]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
cancer_copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24 entries, 842302 to 853201
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   diagnosis       24 non-null     object 
 1   radius_mean     24 non-null     float64
 2   texture_mean    24 non-null     float64
 3   perimeter_mean  24 non-null     float64
 4   area_mean       24 non-null     float64
dtypes: float64(4), object(1)
memory usage: 1.1+ KB
None


In [29]:
# Report the (rows, columns) shape of the frame after deletion
print("데이터 차원 : ", cancer_copy.shape)

데이터 차원 :  (24, 5)


In [30]:
# Display the frame left after listwise deletion
cancer_copy

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0
844359,M,18.25,19.98,119.6,1040.0
84458202,M,13.71,20.83,90.2,577.9
844981,M,13.0,21.82,87.5,519.8
84501001,M,12.46,24.04,83.97,475.9
84610002,M,15.78,17.89,103.6,781.0
846381,M,15.85,23.95,103.7,782.7


In [31]:
# pairwise

In [32]:
# "Pairwise" deletion as practised in this notebook:
# 1 of the 30 records has missing values in every column, and only that
# record is removed (dropna(how='all') keeps rows with at least one value).
cancer_copy = cancer_data.copy()
cancer_copy = cancer_copy.dropna(how='all')

# Summary: 1 of the 30 records is removed.
# info() prints its own report and returns None, so it is not wrapped in
# print(), which only appended a stray "None" line.
cancer_copy.info()

# Check the resulting dimensions
print("데이터 차원 : ", np.shape(cancer_copy))

<class 'pandas.core.frame.DataFrame'>
Index: 29 entries, 842302 to 853201
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   diagnosis       27 non-null     object 
 1   radius_mean     29 non-null     float64
 2   texture_mean    28 non-null     float64
 3   perimeter_mean  26 non-null     float64
 4   area_mean       27 non-null     float64
dtypes: float64(4), object(1)
memory usage: 1.4+ KB
None
데이터 차원 :  (29, 5)


In [33]:
# Display the frame after dropping the rows in which every value is missing
cancer_copy

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0
843786,,12.45,15.7,82.57,477.1
844359,M,18.25,19.98,119.6,1040.0
84458202,M,13.71,20.83,90.2,577.9
844981,M,13.0,21.82,87.5,519.8
84501001,M,12.46,24.04,83.97,475.9
845636,M,16.02,23.24,,


In [38]:
# Imputing missing values
#
# 1. Constant-value imputation: replace missing values with a preset value
# 2. Linear imputation: fill a gap from the neighbouring records using a
#    linear function

# Start again from a fresh copy of the data that still contains the missing values
cancer_copy = cancer_data.copy(deep=True)
cancer_copy

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,,,,,
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0
843786,,12.45,15.7,82.57,477.1
844359,M,18.25,19.98,119.6,1040.0
84458202,M,13.71,20.83,90.2,577.9
844981,M,13.0,21.82,87.5,519.8
84501001,M,12.46,24.04,83.97,475.9


In [39]:
# Constant-value imputation.
# Missing `diagnosis` entries are all replaced by the categorical value 'C'.
cancer_copy['diagnosis'] = cancer_copy['diagnosis'].fillna('C')

# Missing entries of the numeric `radius_mean` column are replaced by the
# constant 65.
cancer_copy['radius_mean'] = cancer_copy['radius_mean'].fillna(65)

# The bare `cancer_copy.head(10)` expressions that used to sit between these
# steps were removed: in the middle of a cell they display nothing.

# Overview: confirm that `diagnosis` and `radius_mean` no longer contain
# missing values, and see which of the remaining columns still do.
cancer_copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30 entries, 842302 to 853201
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   diagnosis       30 non-null     object 
 1   radius_mean     30 non-null     float64
 2   texture_mean    28 non-null     float64
 3   perimeter_mean  26 non-null     float64
 4   area_mean       27 non-null     float64
dtypes: float64(4), object(1)
memory usage: 1.4+ KB


In [41]:
# Impute with a statistic of the column instead of a hand-picked constant
# (mean, median, minimum, maximum, ... are all possible choices).
# Here the gaps in `texture_mean` are filled with the mean of `texture_mean`;
# Series.replace(np.nan, value) would give the same result as fillna(value).
texture_mean_avg = cancer_copy['texture_mean'].mean()
cancer_copy['texture_mean'] = cancer_copy['texture_mean'].fillna(texture_mean_avg)

# Compare the imputed value with the column mean:
# look at the 3rd record (id 84300903).
print(cancer_copy['texture_mean'].mean())
cancer_copy.head(10)

19.397142857142853


Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,C,65.0,19.397143,,
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0
843786,C,12.45,15.7,82.57,477.1
844359,M,18.25,19.98,119.6,1040.0
84458202,M,13.71,20.83,90.2,577.9
844981,M,13.0,21.82,87.5,519.8
84501001,M,12.46,24.04,83.97,475.9


In [42]:
# Reset to a fresh copy that still contains the injected missing values
cancer_copy = cancer_data.copy(deep=True)
cancer_copy.head(5)

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,,,,,
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0


In [43]:
# Linear interpolation: fill each gap from the neighbouring records.
# Only the numeric columns are interpolated. `diagnosis` holds text (object
# dtype); it cannot be interpolated, and calling DataFrame.interpolate() on a
# frame that contains object columns is deprecated in pandas 2.x.
# Its missing values are therefore left as they are.
numeric_columns = cancer_copy.select_dtypes(include="number").columns
cancer_copy[numeric_columns] = cancer_copy[numeric_columns].interpolate(method="linear")
cancer_copy.head()

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
842302,M,17.99,10.38,122.8,1001.0
842517,M,20.57,17.77,132.9,1326.0
84300903,,15.995,19.075,105.24,856.05
84348301,M,11.42,20.38,77.58,386.1
84358402,M,20.29,14.34,135.1,1297.0


In [44]:
# Verify the linearly imputed value of `radius_mean` for id 84300903
# (the 3rd record): it should equal the average of the records directly
# before and after it.
radius_before = cancer_data.iloc[1, 1]
radius_after = cancer_data.iloc[3, 1]
print((radius_before + radius_after) / 2)

15.995000000000001


In [45]:
# 적절한 결측치 처리 방법에 따라 향후 방향이 달라지기에 중요함. 