## 결측치

In [1]:
import pandas as pd 

data = {'Name': ['Alice', 'Bob', 'Charie'],
        'Age': [25, None, 30], 
        'Score': [90, 85, None]
       }

df = pd.DataFrame(data) 
df.head()

Unnamed: 0,Name,Age,Score
0,Alice,25.0,90.0
1,Bob,,85.0
2,Charie,30.0,


In [2]:
print(df.isnull().sum())

Name     0
Age      1
Score    1
dtype: int64


In [3]:
print("결측치 확인")
print(df.isnull())

mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)
df['Score'] = df['Score'].fillna(0)

print("\n결측치 처리 후 데이터")    
print(df)

결측치 확인
    Name    Age  Score
0  False  False  False
1  False   True  False
2  False  False   True

결측치 처리 후 데이터
     Name   Age  Score
0   Alice  25.0   90.0
1     Bob  27.5   85.0
2  Charie  30.0    0.0


## 이상치 

In [4]:
import pandas as pd 

data = {'Height': [150, 160, 170, 180, 350]}
df = pd.DataFrame(data)

print("데이터:\n", df)

데이터:
    Height
0     150
1     160
2     170
3     180
4     350


In [5]:
print("최대값", df['Height'].max())
print("최소값", df['Height'].min())

최대값 350
최소값 150


In [8]:
Q1 = df['Height'].quantile(0.25)
Q3 = df['Height'].quantile(0.75) 
IQR = Q3 - Q1 

lower_bound = Q1-1.5*IQR
upper_bound = Q3+1.5*IQR 

outliers = df[(df['Height'] < lower_bound) | (df['Height'] > upper_bound)] 
print('\nIQR 기준으로 찾은 이상치: \n', outliers) 


IQR 기준으로 찾은 이상치: 
    Height
4     350


In [9]:
from scipy.stats import zscore

df['Z_Score'] = zscore(df['Height']) 
print('\nZ-스코어:\n', df) 

outliers = df[df['Z_Score'].abs() > 3]
print('\nZ-스코어 기준으로 찾은 이상치:\n', outliers)


Z-스코어:
    Height   Z_Score
0     150 -0.696373
1     160 -0.562455
2     170 -0.428537
3     180 -0.294619
4     350  1.981985

Z-스코어 기준으로 찾은 이상치:
 Empty DataFrame
Columns: [Height, Z_Score]
Index: []


In [10]:
df_cleaned = df[(df['Height'] >= lower_bound) & (df['Height'] <= upper_bound)] 
print('\n이상치 제거 후 데이터: \n', df_cleaned)


이상치 제거 후 데이터: 
    Height   Z_Score
0     150 -0.696373
1     160 -0.562455
2     170 -0.428537
3     180 -0.294619


In [11]:
median_value = df[(df['Height'] >= lower_bound) & (df['Height'] <= upper_bound)]['Height'].median()
df['Height'] = df['Height'].apply(lambda x: median_value 
                                  if (x < lower_bound or x > upper_bound)
                                  else x)
print('\n이상치를 중앙값으로 대체한 데이터:\n', df)


이상치를 중앙값으로 대체한 데이터:
    Height   Z_Score
0   150.0 -0.696373
1   160.0 -0.562455
2   170.0 -0.428537
3   180.0 -0.294619
4   165.0  1.981985


In [12]:
median_value

np.float64(165.0)

## 수치형 데이터 전처리 

In [13]:
import pandas as pd 

data = {'Age': [15, 22, 35, 50, 72]}
df = pd.DataFrame(data) 

bins = [0, 20, 40, 60, 80] 
labels = ['10대', '20대', '30대', '40대 이상'] 
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels) 

print(df) 

   Age Age_Group
0   15       10대
1   22       20대
2   35       20대
3   50       30대
4   72    40대 이상


In [16]:
df['Age_Quantile'] = pd.qcut(df['Age'], q=3, labels=['하위', '중위', '상위'])
print(df)

   Age Age_Group Age_Quantile
0   15       10대           하위
1   22       20대           하위
2   35       20대           중위
3   50       30대           상위
4   72    40대 이상           상위


In [17]:
from sklearn.preprocessing import MinMaxScaler 
import pandas as pd 

data = {'Height': [150, 160, 170, 180, 190]} 
df = pd.DataFrame(data) 

scaler = MinMaxScaler()
df['Height_Normalized'] = scaler.fit_transform(df[['Height']])

print(df)

   Height  Height_Normalized
0     150               0.00
1     160               0.25
2     170               0.50
3     180               0.75
4     190               1.00


In [18]:
from sklearn.preprocessing import StandardScaler 
import pandas as pd 

data = {'Weight': [50, 60, 70, 80, 90]} 
df = pd.DataFrame(data) 

scaler = StandardScaler() 
df['Weight_standardized'] = scaler.fit_transform(df[['Weight']])

print(df)

   Weight  Weight_standardized
0      50            -1.414214
1      60            -0.707107
2      70             0.000000
3      80             0.707107
4      90             1.414214


In [19]:
import pandas as pd 
from sklearn.preprocessing import MinMaxScaler, StandardScaler 

data = {'Age': [15, 22, 35, 50, 72], 'Weight': [50, 60, 70, 80, 90 ]}
df = pd.DataFrame(data) 

bins = [0, 20, 40, 60, 80] 
labels = ['10대', '20대', '30대', '40대 이상']
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels)

minmax_scaler = MinMaxScaler()
df['Weight_Normalizard'] = minmax_scaler.fit_transform(df[['Weight']])

std_scaler = StandardScaler() 
df['Weight_Standardized'] = std_scaler.fit_transform(df[['Weight']]) 

print(df) 


   Age  Weight Age_Group  Weight_Normalizard  Weight_Standardized
0   15      50       10대                0.00            -1.414214
1   22      60       20대                0.25            -0.707107
2   35      70       20대                0.50             0.000000
3   50      80       30대                0.75             0.707107
4   72      90    40대 이상                1.00             1.414214


## 범주형 데이처 전처리 
```
서울 -> 0
부산 -> 1
```

In [20]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder 

data = {'City': ['서울', '부산', '대구', '서울', '부산']}
df = pd.DataFrame(data) 


In [21]:
encoder = LabelEncoder()
df['City_Encoded'] = encoder.fit_transform(df['City'])
print(df)

  City  City_Encoded
0   서울             2
1   부산             1
2   대구             0
3   서울             2
4   부산             1


In [22]:
import pandas as pd 

data = {'City': ['서울', '부산', '대구', '서울', '부산']}
df = pd.DataFrame(data)

df_encoded = pd.get_dummies(df, columns=['City']) 
#df['City_대구', 'City_부산', 'City_서울'] = 
print(df_encoded)

   City_대구  City_부산  City_서울
0    False    False     True
1    False     True    False
2     True    False    False
3    False    False     True
4    False     True    False


## 기존 데이터 삭제

In [23]:
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
       'Age': [25, 30, 35, 40],
       'Score': [90, 85, 75, 60]}

In [24]:
import pandas as pd 

df = pd.DataFrame(data)

df_dropped = df.drop(index=2)

print('특정 행 삭제 후 데이터:')
print(df_dropped)

특정 행 삭제 후 데이터:
    Name  Age  Score
0  Alice   25     90
1    Bob   30     85
3  David   40     60


In [25]:
df_dropped = df.drop(columns = ['Score'])
print(df_dropped)

      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35
3    David   40


In [26]:
data_with_na = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
       'Age': [25, None, 35, 40],
       'Score': [90, 85, None, 60]}
df_with_na = pd.DataFrame(data_with_na) 
df_with_na

Unnamed: 0,Name,Age,Score
0,Alice,25.0,90.0
1,Bob,,85.0
2,Charlie,35.0,
3,David,40.0,60.0


In [27]:
df_clean = df_with_na.dropna()
print('\n결측치가 포함된 행 삭제 후 데이터') 
print(df_clean)


결측치가 포함된 행 삭제 후 데이터
    Name   Age  Score
0  Alice  25.0   90.0
3  David  40.0   60.0


In [28]:
data_with_duplicates = {'Name': ['Alice', 'Bob', 'Alice', 'David'],
                        'Age': [25, 30, 25, 40],
                        'Score': [90, 85, 90, 60]}
df_with_duplicates = pd.DataFrame(data_with_duplicates) 
df_with_duplicates

Unnamed: 0,Name,Age,Score
0,Alice,25,90
1,Bob,30,85
2,Alice,25,90
3,David,40,60


In [29]:
df_unique = df_with_duplicates.drop_duplicates()
print('\n중복 데이터 삭제 후 데이터:')
print(df_unique) 


중복 데이터 삭제 후 데이터:
    Name  Age  Score
0  Alice   25     90
1    Bob   30     85
3  David   40     60


#### 실습

In [30]:
import pandas as pd 

data = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        'Age': [25, None, 35, 40, 29],
        'Score': [90, 85, 75, 60, 85]}
df = pd.DataFrame(data)
print(df)

df = df.drop(index=1)
df = df.drop(columns=['Score'])
df = df.dropna()

print('\n', df)
  

      Name   Age  Score
0    Alice  25.0     90
1      Bob   NaN     85
2  Charlie  35.0     75
3    David  40.0     60
4      Eve  29.0     85

       Name   Age
0    Alice  25.0
2  Charlie  35.0
3    David  40.0
4      Eve  29.0


In [31]:
import pandas as pd 

data = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        'Age': [25, None, 35, 40, 29],
        'Score': [90, 85, 75, 60, 85]}
df = pd.DataFrame(data)
print(df)

df_drop = df.drop(index=1, columns=['Score'])
df_dropna = df.dropna()

print('\n', df_drop, '\n\n', df_dropna)

      Name   Age  Score
0    Alice  25.0     90
1      Bob   NaN     85
2  Charlie  35.0     75
3    David  40.0     60
4      Eve  29.0     85

       Name   Age
0    Alice  25.0
2  Charlie  35.0
3    David  40.0
4      Eve  29.0 

       Name   Age  Score
0    Alice  25.0     90
2  Charlie  35.0     75
3    David  40.0     60
4      Eve  29.0     85
