In [15]:
# 데이터 파일 읽기 예제
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = sns.load_dataset('mpg')
X_train, X_test, y_train, y_test = train_test_split(df, df['mpg'], test_size=0.2, random_state=42)
X_train = X_train.drop(['mpg'], axis=1)
X_test = X_test.drop(['mpg'], axis=1)


In [16]:
# 내역 확인
# X_train.head()
X_train
# 결측치 확인
# print(X_train.isna().sum())

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
3,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
18,4,97.0,88.0,2130,14.5,70,japan,datsun pl510
376,4,91.0,68.0,2025,18.2,82,japan,mazda glc custom l
248,4,91.0,60.0,1800,16.4,78,japan,honda civic cvcc
177,4,115.0,95.0,2694,15.0,75,europe,audi 100ls
...,...,...,...,...,...,...,...,...
71,3,70.0,97.0,2330,13.5,72,japan,mazda rx2 coupe
106,8,350.0,180.0,4499,12.5,73,usa,oldsmobile vista cruiser
270,4,134.0,95.0,2515,14.8,78,japan,toyota celica gt liftback
348,4,89.0,62.0,2050,17.3,81,japan,toyota tercel


In [17]:
print(X_train.isna().sum())

cylinders       0
displacement    0
horsepower      5
weight          0
acceleration    0
model_year      0
origin          0
name            0
dtype: int64


In [18]:
# 사용자 코딩
#1. 결측치 제거
X_train['horsepower'] = X_train['horsepower'].fillna(X_train['horsepower'].median())
X_test['horsepower'] = X_test['horsepower'].fillna(X_test['horsepower'].median())

print(X_train.isna().sum())

cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
name            0
dtype: int64


In [19]:
X_train['origin']

3         usa
18      japan
376     japan
248     japan
177    europe
        ...  
71      japan
106       usa
270     japan
348     japan
102    europe
Name: origin, Length: 318, dtype: object

In [20]:
#2. 라벨인코더
label = ['origin', 'name']
from sklearn.preprocessing import LabelEncoder
X_train[label] = X_train[label].apply(LabelEncoder().fit_transform)
X_test[label] = X_test[label].apply(LabelEncoder().fit_transform)

print(X_train.head())

     cylinders  displacement  horsepower  weight  acceleration  model_year  \
3            8         304.0       150.0    3433          12.0          70   
18           4          97.0        88.0    2130          14.5          70   
376          4          91.0        68.0    2025          18.2          82   
248          4          91.0        60.0    1800          16.4          78   
177          4         115.0        95.0    2694          15.0          75   

     origin  name  
3         2    10  
18        1    78  
376       1   149  
248       1   143  
177       0    13  


In [21]:
print(X_train[label])

     origin  name
3         2    10
18        1    78
376       1   149
248       1   143
177       0    13
..      ...   ...
71        1   153
106       2   171
270       1   216
348       1   226
102       0   236

[318 rows x 2 columns]


In [22]:
#3. 카테고리 변환, 더미처리
category = ['origin']
for i in category:
    X_train[i] = X_train[i].astype('category')
    X_test[i] = X_test[i].astype('category')

X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [23]:
X_train

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,name,origin_0,origin_1,origin_2
3,8,304.0,150.0,3433,12.0,70,10,0,0,1
18,4,97.0,88.0,2130,14.5,70,78,0,1,0
376,4,91.0,68.0,2025,18.2,82,149,0,1,0
248,4,91.0,60.0,1800,16.4,78,143,0,1,0
177,4,115.0,95.0,2694,15.0,75,13,1,0,0
...,...,...,...,...,...,...,...,...,...,...
71,3,70.0,97.0,2330,13.5,72,153,0,1,0
106,8,350.0,180.0,4499,12.5,73,171,0,0,1
270,4,134.0,95.0,2515,14.8,78,216,0,1,0
348,4,89.0,62.0,2050,17.3,81,226,0,1,0


In [24]:
X_test

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,name,origin_0,origin_1,origin_2
198,4,91.0,53.0,1795,17.4,76,38,0,1,0
396,4,120.0,79.0,2625,18.6,82,36,0,0,1
33,6,232.0,100.0,2634,13.0,71,2,0,0,1
208,8,318.0,150.0,3940,13.2,76,56,0,0,1
93,8,318.0,150.0,4237,14.5,73,52,0,0,1
...,...,...,...,...,...,...,...,...,...,...
249,8,260.0,110.0,3365,15.5,78,44,0,0,1
225,6,250.0,110.0,3520,16.4,77,13,0,0,1
367,4,112.0,88.0,2605,19.6,82,9,0,0,1
175,4,90.0,70.0,1937,14.0,75,72,1,0,0


In [25]:
#4. 파생변수 만들기
X_train['horsepower_qcut'] = pd.qcut(X_train['horsepower'], 5, labels=False)
X_test['horsepower_qcut'] = pd.qcut(X_test['horsepower'], 5, labels=False)

In [26]:
X_train

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,name,origin_0,origin_1,origin_2,horsepower_qcut
3,8,304.0,150.0,3433,12.0,70,10,0,0,1,4
18,4,97.0,88.0,2130,14.5,70,78,0,1,0,1
376,4,91.0,68.0,2025,18.2,82,149,0,1,0,0
248,4,91.0,60.0,1800,16.4,78,143,0,1,0,0
177,4,115.0,95.0,2694,15.0,75,13,1,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...
71,3,70.0,97.0,2330,13.5,72,153,0,1,0,2
106,8,350.0,180.0,4499,12.5,73,171,0,0,1,4
270,4,134.0,95.0,2515,14.8,78,216,0,1,0,2
348,4,89.0,62.0,2050,17.3,81,226,0,1,0,0


In [27]:
X_test

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,name,origin_0,origin_1,origin_2,horsepower_qcut
198,4,91.0,53.0,1795,17.4,76,38,0,1,0,0
396,4,120.0,79.0,2625,18.6,82,36,0,0,1,1
33,6,232.0,100.0,2634,13.0,71,2,0,0,1,2
208,8,318.0,150.0,3940,13.2,76,56,0,0,1,3
93,8,318.0,150.0,4237,14.5,73,52,0,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...
249,8,260.0,110.0,3365,15.5,78,44,0,0,1,2
225,6,250.0,110.0,3520,16.4,77,13,0,0,1,2
367,4,112.0,88.0,2605,19.6,82,9,0,0,1,1
175,4,90.0,70.0,1937,14.0,75,72,1,0,0,0


In [28]:
#5. 스케일 작업
from sklearn.preprocessing import MinMaxScaler
scaler = ['displacement', 'horsepower', 'weight']
min = MinMaxScaler()
min.fit(X_train[scaler])

X_train[scaler] = min.transform(X_train[scaler])
X_test[scaler] = min.transform(X_test[scaler])

In [29]:
X_train

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,name,origin_0,origin_1,origin_2,horsepower_qcut
3,8,0.609819,0.581006,0.516019,12.0,70,10,0,0,1,4
18,4,0.074935,0.234637,0.146583,14.5,70,78,0,1,0,1
376,4,0.059432,0.122905,0.116813,18.2,82,149,0,1,0,0
248,4,0.059432,0.078212,0.053020,16.4,78,143,0,1,0,0
177,4,0.121447,0.273743,0.306493,15.0,75,13,1,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...
71,3,0.005168,0.284916,0.203289,13.5,72,153,0,1,0,2
106,8,0.728682,0.748603,0.818259,12.5,73,171,0,0,1,4
270,4,0.170543,0.273743,0.255741,14.8,78,216,0,1,0,2
348,4,0.054264,0.089385,0.123901,17.3,81,226,0,1,0,0


In [31]:
#6. 데이터 분리
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [32]:
X_train

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,name,origin_0,origin_1,origin_2,horsepower_qcut
61,4,0.139535,0.223464,0.173802,16.5,72,131,0,0,1,1
103,8,0.857881,0.581006,0.959456,14.0,73,44,0,0,1,4
302,4,0.095607,0.134078,0.152254,14.9,79,187,0,0,1,0
338,4,0.173127,0.212291,0.248653,15.7,81,188,0,0,1,1
397,4,0.131783,0.201117,0.313864,19.4,82,56,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
95,8,1.000000,1.000000,0.946413,11.0,73,23,0,0,1,4
287,8,0.731266,0.513966,0.664020,13.2,79,159,0,0,1,3
316,6,0.405685,0.245810,0.501276,18.7,80,81,0,0,1,2
273,4,0.131783,0.284916,0.224553,14.9,78,61,0,1,0,2


In [33]:
X_valid

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,name,origin_0,origin_1,origin_2,horsepower_qcut
150,4,0.103359,0.262570,0.220584,15.5,74,212,0,1,0,2
319,4,0.134367,0.162011,0.263397,17.5,80,146,0,1,0,1
7,8,0.961240,0.944134,0.765240,8.5,70,184,0,0,1,4
1,8,0.728682,0.664804,0.589736,11.5,70,30,0,0,1,4
90,8,0.932817,0.849162,0.946697,11.5,73,162,0,0,1,4
...,...,...,...,...,...,...,...,...,...,...,...
288,8,0.645995,0.497207,0.628580,15.2,79,101,0,0,1,3
215,8,0.645995,0.581006,0.607315,14.0,76,93,0,0,1,4
146,4,0.056848,0.162011,0.145166,14.5,74,86,0,0,1,1
143,4,0.074935,0.178771,0.194783,14.5,74,173,1,0,0,1


In [34]:
y_train

61     21.0
103    11.0
302    34.5
338    27.2
397    31.0
       ... 
95     12.0
287    16.5
316    19.1
273    23.9
67     11.0
Name: mpg, Length: 254, dtype: float64

In [35]:
y_valid

150    26.0
319    31.3
7      14.0
1      15.0
90     12.0
       ... 
288    18.2
215    13.0
146    28.0
143    26.0
38     14.0
Name: mpg, Length: 64, dtype: float64