In [1]:
# 데이터 파일 읽기 예제
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = sns.load_dataset('mpg')
X_train, X_test, y_train, y_test = train_test_split(df, df['mpg'], test_size=0.2, random_state=42)
X_train = X_train.drop(['mpg'], axis=1)
X_test = X_test.drop(['mpg'], axis=1)

# 사용자 코딩
#1. 결측치 제거
X_train['horsepower'] = X_train['horsepower'].fillna(X_train['horsepower'].median())
X_test['horsepower'] = X_test['horsepower'].fillna(X_test['horsepower'].median())

#2. 라벨인코더
label = ['origin', 'name']
from sklearn.preprocessing import LabelEncoder
X_train[label] = X_train[label].apply(LabelEncoder().fit_transform)
X_test[label] = X_test[label].apply(LabelEncoder().fit_transform)

#3. 카테고리 변환, 더미처리
category = ['origin']
for i in category:
    X_train[i] = X_train[i].astype('category')
    X_test[i] = X_test[i].astype('category')

X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

#4. 파생변수 만들기
X_train['horsepower_qcut'] = pd.qcut(X_train['horsepower'], 5, labels=False)
X_test['horsepower_qcut'] = pd.qcut(X_test['horsepower'], 5, labels=False)

#5. 스케일 작업
from sklearn.preprocessing import MinMaxScaler
scaler = ['displacement', 'horsepower', 'weight']
min = MinMaxScaler()
min.fit(X_train[scaler])

X_train[scaler] = min.transform(X_train[scaler])
X_test[scaler] = min.transform(X_test[scaler])

#6. 데이터 분리
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [2]:
#7. 모형 학습
from sklearn.linear_model import LinearRegression
model1 = LinearRegression()
model1.fit(X_train, y_train)
pred1 = model1.predict(X_valid)

from sklearn.ensemble import RandomForestRegressor
model2 = RandomForestRegressor()
model2.fit(X_train, y_train)
pred2 = model2.predict(X_valid)

In [3]:
#8. 앙상블(스태킹)
from sklearn.ensemble import StackingRegressor
estimators = [('lr', model1), ('rf', model2)]
model3 = StackingRegressor(estimators, final_estimator=RandomForestRegressor())
model3.fit(X_train, y_train)
pred3 = model3.predict(X_valid)


In [5]:
#9. 모형평가
from sklearn.metrics import mean_squared_error
mean_squared_error(y_valid, pred1)

12.966610337470009

In [9]:
np.sqrt

<ufunc 'sqrt'>

In [11]:
#9. 모형평가
from sklearn.metrics import mean_squared_error

print('선형회귀 RMSE', np.sqrt(mean_squared_error(y_valid, pred1)))
print('랜포 RMSE', np.sqrt(mean_squared_error(y_valid, pred2)))
print('스패킹 RMSE', np.sqrt(mean_squared_error(y_valid, pred3)))

선형회귀 RMSE 3.600917985385117
랜포 RMSE 3.0405751100737497
스패킹 RMSE 3.529296632298


In [13]:
#10. 하이퍼파라미터 튜닝
from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators':[50,100], 'max_depth':[4,6]}
model4 = RandomForestRegressor()
clf = GridSearchCV(estimator=model4, param_grid=parameters, cv=3)
clf.fit(X_train, y_train)

print('최적의파라미터', clf.best_params_)

최적의파라미터 {'max_depth': 6, 'n_estimators': 100}


In [14]:
#11. 파일저장
result = pd.DataFrame(model2.predict(X_test))
result = result.iloc[:,0]
pd.DataFrame({'id':X_test.index, 'result':result}).to_csv('00400.csv', index=False)
check = pd.read_csv('00400.csv')

print(check)

     id  result
0   198  30.489
1   396  29.992
2    33  21.355
3   208  15.525
4    93  14.450
..  ...     ...
75  249  20.171
76  225  18.904
77  367  28.286
78  175  29.249
79  285  16.822

[80 rows x 2 columns]
