In [5]:
import seaborn as sns

# Fetch seaborn's example "penguins" dataset and print its first five rows.
df = sns.load_dataset('penguins')
print(df.head(5))

# Missing-value count per column. The result is discarded here because this
# expression is not the last line of the cell.
df.isna().sum()

#1. Missing-value handling (values are imputed; no rows are dropped)
# NOTE(review): the fill values are computed on the full frame, before the
# train/test split performed later in this cell, so test rows influence them.
missing = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
# Numeric measurements: fill every column with its own median in one
# vectorized call (fillna aligns the median Series on column names).
df[missing] = df[missing].fillna(df[missing].median())

# Categorical `sex`: fill with the most frequent observed value rather than a
# hard-coded label; fall back to the original 'Male' if the column is all-NaN.
sex_mode = df['sex'].mode()
sex_fill = sex_mode.iloc[0] if not sex_mode.empty else 'Male'
df['sex'] = df['sex'].fillna(sex_fill)

df.isna().sum()

df['sex'].value_counts()

df.head()

#2. Label encoding
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes; every column is fitted
# independently of the others.
label = ['species', 'island', 'sex']
for column in label:
    df[column] = LabelEncoder().fit_transform(df[column])

df.head()

#3. Data conversion / dummy (one-hot) encoding
import pandas as pd

category = ['island', 'sex']
# Cast the integer-coded columns to categorical dtype in a single astype call,
# then let get_dummies expand them into indicator columns.
df = pd.get_dummies(df.astype({column: 'category' for column in category}))

df.dtypes

df.head()

#4. Derived feature
# Split body mass into 5 quantile-based bins; labels=False stores the integer
# bin index instead of an interval label.
mass_bins = pd.qcut(df['body_mass_g'], q=5, labels=False)
df['body_mass_g_qcut'] = mass_bins
df.head()

df['body_mass_g_qcut'].value_counts()

#5. Scaling
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale (the list keeps its original name `scaler`).
scaler = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
# The estimator is deliberately NOT bound to the name `min`: doing so shadows
# the built-in min() for the rest of the kernel session.
minmax_scaler = MinMaxScaler()
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split, so test rows contribute to the min/max used for scaling.
df[scaler] = minmax_scaler.fit_transform(df[scaler])

df.head()

#6. Train/test split
from sklearn.model_selection import train_test_split

# Select the features by name instead of by position: `df.iloc[:, 1:]` only
# excludes the target while 'species' happens to be the first column.
features = df.drop(columns='species')
target = df['species']
# Stratify on the target so both splits keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, stratify=target, random_state=1
)

print('X_train', X_train.shape)
print('X_test', X_test.shape)
print('y_train', y_train.shape)
print('y_test', y_test.shape)

# Preview only the first rows of the slice instead of printing the whole frame.
print(df.iloc[2:, 3:].head())

  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen            39.1           18.7              181.0   
1  Adelie  Torgersen            39.5           17.4              186.0   
2  Adelie  Torgersen            40.3           18.0              195.0   
3  Adelie  Torgersen             NaN            NaN                NaN   
4  Adelie  Torgersen            36.7           19.3              193.0   

   body_mass_g     sex  
0       3750.0    Male  
1       3800.0  Female  
2       3250.0  Female  
3          NaN     NaN  
4       3450.0  Female  
X_train (275, 10)
X_test (69, 10)
y_train (275,)
y_test (69,)
     flipper_length_mm  body_mass_g  island_0  island_1  island_2  sex_0  \
2             0.389831     0.152778         0         0         1      1   
3             0.423729     0.375000         0         0         1      0   
4             0.355932     0.208333         0         0         1      1   
5             0.305085     0.263889    

In [7]:
#7. Model training
from sklearn.ensemble import RandomForestClassifier

# Fix the seed so the forest's bootstrap samples and feature sub-sampling, and
# therefore the predictions and scores below, are reproducible across runs.
model1 = RandomForestClassifier(random_state=1)
model1.fit(X_train, y_train)
pred1 = model1.predict(X_test)

In [8]:
# Display the random-forest predictions for X_test.
pred1

array([0, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2,
       0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2,
       2, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 0, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0,
       2, 0, 1])

In [10]:
from sklearn.ensemble import AdaBoostClassifier

# Seed the booster so its base estimators are built deterministically and the
# reported accuracy can be reproduced.
model2 = AdaBoostClassifier(random_state=1)
model2.fit(X_train, y_train)
pred2 = model2.predict(X_test)

In [11]:
# Display the AdaBoost predictions for X_test.
pred2

array([0, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2,
       0, 1, 1, 0, 0, 0, 1, 2, 0, 0, 2, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2,
       2, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 0, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0,
       2, 0, 1])

In [14]:
#8. 앙상블
from sklearn.ensemble import VotingClassifier
clf = VotingClassifier(estimators=[('rf', model1), ('ad', model2)], voting='hard')
clf.fit(X_train, y_train)
pred3 = clf.predict(X_test)

In [15]:
# Display the voting-ensemble predictions for X_test.
pred3

array([0, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2,
       0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2,
       2, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 0, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0,
       2, 0, 1])

In [17]:
#9. Model evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Print the test-set accuracy of each model, one line per model, in the order
# random forest, AdaBoost, voting ensemble.
for title, predicted in (
    ('랜포 정확도', pred1),
    ('에이다 정확도', pred2),
    ('보팅 정확도', pred3),
):
    print(title, accuracy_score(y_test, predicted))

랜포 정확도 1.0
에이다 정확도 0.9855072463768116
보팅 정확도 1.0


In [18]:
#10. Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Candidate values searched exhaustively with 3-fold cross-validation.
parameters = {'n_estimators': [50, 100], 'max_depth': [4, 6]}
# Seed the forest so the cross-validated scores, and hence the selected
# parameters, do not change from run to run.
model4 = RandomForestClassifier(random_state=1)
# NOTE: this rebinds `clf`, which held the VotingClassifier in the ensemble cell.
clf = GridSearchCV(estimator=model4, param_grid=parameters, cv=3)
clf.fit(X_train, y_train)
print('최적의 파라미터', clf.best_params_)

최적의 파라미터 {'max_depth': 4, 'n_estimators': 100}


In [19]:
#11. Save predictions
# Show the voting-ensemble predictions that are written to CSV in the next cell.
pred3

array([0, 1, 1, 0, 0, 1, 1, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 2,
       0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 2, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2,
       2, 2, 0, 1, 0, 0, 2, 2, 0, 2, 2, 0, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0,
       2, 0, 1])

In [23]:
# Pair each test row's original index with its ensemble prediction and write
# the table to disk without the DataFrame's own index column.
submission = pd.DataFrame({'id': y_test.index, 'pred': pred3})
submission.to_csv('00300.csv', index=False)

In [25]:
# Read the saved file back and preview its first rows to verify what was written.
check = pd.read_csv('00300.csv')
check.head()

Unnamed: 0,id,pred
0,57,0
1,173,1
2,213,1
3,50,0
4,25,0
