In [93]:
import seaborn as sns
# Load the 'penguins' example dataset through seaborn and show its first rows.
df = sns.load_dataset(name='penguins')
print(df.head(n=5))

  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen            39.1           18.7              181.0   
1  Adelie  Torgersen            39.5           17.4              186.0   
2  Adelie  Torgersen            40.3           18.0              195.0   
3  Adelie  Torgersen             NaN            NaN                NaN   
4  Adelie  Torgersen            36.7           19.3              193.0   

   body_mass_g     sex  
0       3750.0    Male  
1       3800.0  Female  
2       3250.0  Female  
3          NaN     NaN  
4       3450.0  Female  


In [94]:
# Count the missing values in each column.
df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [95]:
# 1. Impute missing values (rows are kept, not dropped).
missing = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
# Numeric measurements: fill each column's gaps with that column's median.
# fillna aligns the Series of medians to the frame by column label.
df[missing] = df[missing].fillna(df[missing].median())
# Categorical 'sex': fill with the most frequent observed value rather than a
# hard-coded label, so the imputation follows the data.
df['sex'] = df['sex'].fillna(df['sex'].mode().iloc[0])

In [96]:
# Re-count missing values per column after the fill step.
df.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [97]:
# Frequency of each value in the 'sex' column.
df.loc[:, 'sex'].value_counts()

Male      179
Female    165
Name: sex, dtype: int64

In [98]:
# Preview the first five rows after filling the missing values.
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,44.45,17.3,197.0,4050.0,Male
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [99]:
from sklearn.preprocessing import LabelEncoder

# Columns holding string labels that are replaced by integer codes.
label = ['species', 'island', 'sex']
# Keep one fitted encoder per column so the integer codes can be mapped back
# to the original labels later (e.g. encoders['species'].inverse_transform).
encoders = {}
for col in label:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

In [100]:
# Preview the first five rows after label encoding.
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,2,39.1,18.7,181.0,3750.0,1
1,0,2,39.5,17.4,186.0,3800.0,0
2,0,2,40.3,18.0,195.0,3250.0,0
3,0,2,44.45,17.3,197.0,4050.0,1
4,0,2,36.7,19.3,193.0,3450.0,0


In [101]:
# 3. Data conversion and dummy (one-hot) encoding
import pandas as pd

category = ['island', 'sex']
# Cast the listed columns to the 'category' dtype in a single call, then let
# get_dummies expand them into indicator columns.
df = pd.get_dummies(df.astype({col: 'category' for col in category}))

In [102]:
# Check the dtype of every column after dummy encoding.
df.dtypes

species                int64
bill_length_mm       float64
bill_depth_mm        float64
flipper_length_mm    float64
body_mass_g          float64
island_0               uint8
island_1               uint8
island_2               uint8
sex_0                  uint8
sex_1                  uint8
dtype: object

In [103]:
# Preview the first five rows after dummy encoding.
df.head(n=5)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_0,island_1,island_2,sex_0,sex_1
0,0,39.1,18.7,181.0,3750.0,0,0,1,0,1
1,0,39.5,17.4,186.0,3800.0,0,0,1,1,0
2,0,40.3,18.0,195.0,3250.0,0,0,1,1,0
3,0,44.45,17.3,197.0,4050.0,0,0,1,0,1
4,0,36.7,19.3,193.0,3450.0,0,0,1,1,0


In [104]:
# 4. Derived feature: quantile bin of body mass
# q=5 splits body_mass_g into five quantile-based bins; labels=False makes
# qcut return the integer bin index instead of interval labels.
df['body_mass_g_qcut'] = pd.qcut(
    x=df['body_mass_g'],
    q=5,
    labels=False,
)
df.head(n=5)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_0,island_1,island_2,sex_0,sex_1,body_mass_g_qcut
0,0,39.1,18.7,181.0,3750.0,0,0,1,0,1,1
1,0,39.5,17.4,186.0,3800.0,0,0,1,1,0,1
2,0,40.3,18.0,195.0,3250.0,0,0,1,1,0,0
3,0,44.45,17.3,197.0,4050.0,0,0,1,0,1,2
4,0,36.7,19.3,193.0,3450.0,0,0,1,1,0,0


In [105]:
# Number of rows that fell into each body-mass bin.
df.loc[:, 'body_mass_g_qcut'].value_counts()

0    71
1    70
2    68
4    68
3    67
Name: body_mass_g_qcut, dtype: int64

In [106]:
# 5. Scaling: min-max scale the continuous measurement columns.
from sklearn.preprocessing import MinMaxScaler

# Names of the columns to rescale.
scaler = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
# Named `minmax_scaler` rather than `min` so the built-in min() stays usable
# in every later cell.
minmax_scaler = MinMaxScaler()
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split, so test-row minima/maxima influence the scaling — consider fitting
# on the training rows only.
df[scaler] = minmax_scaler.fit_transform(df[scaler])

In [107]:
# Preview the first five rows after scaling.
df.head(n=5)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_0,island_1,island_2,sex_0,sex_1,body_mass_g_qcut
0,0,0.254545,0.666667,0.152542,0.291667,0,0,1,0,1,1
1,0,0.269091,0.511905,0.237288,0.305556,0,0,1,1,0,1
2,0,0.298182,0.583333,0.389831,0.152778,0,0,1,1,0,0
3,0,0.449091,0.5,0.423729,0.375,0,0,1,0,1,2
4,0,0.167273,0.738095,0.355932,0.208333,0,0,1,1,0,0


In [108]:
# 6. Train/test split
from sklearn.model_selection import train_test_split

# Select the target by name and use every other column as a feature, rather
# than relying on 'species' happening to sit at column position 0.
features = df.drop(columns='species')
target = df['species']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,      # hold out 20% of the rows for testing
    stratify=target,    # stratify the split on the species labels
    random_state=1,     # fixed seed so the split is reproducible
)

In [109]:
# Report the shape of each piece produced by the train/test split.
for split_name, split_data in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(split_name, split_data.shape)

X_train (275, 10)
X_test (69, 10)
y_train (275,)
y_test (69,)


In [112]:
# Print every row from position 2 onward, restricted to the columns from
# position 3 onward.
print(df.iloc[2:].iloc[:, 3:])

     flipper_length_mm  body_mass_g  island_0  island_1  island_2  sex_0  \
2             0.389831     0.152778         0         0         1      1   
3             0.423729     0.375000         0         0         1      0   
4             0.355932     0.208333         0         0         1      1   
5             0.305085     0.263889         0         0         1      0   
6             0.152542     0.256944         0         0         1      1   
..                 ...          ...       ...       ...       ...    ...   
339           0.423729     0.375000         1         0         0      0   
340           0.728814     0.597222         1         0         0      1   
341           0.847458     0.847222         1         0         0      0   
342           0.677966     0.694444         1         0         0      1   
343           0.694915     0.750000         1         0         0      0   

     sex_1  body_mass_g_qcut  
2        0                 0  
3        1               