# 1. 데이터 다운로드

# 2. 간단한 데이터 확인 및 정리

In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('iris_train.csv') # iris_train 데이터 불러오기
train.head()

Unnamed: 0,id,species,sepal length (cm),petal length (cm),sepal width (cm),petal width (cm)
0,0,setosa,4.4,1.4,2.9,0.2
1,1,versicolor,6.4,4.5,3.2,1.5
2,2,virginica,6.2,4.8,2.8,1.8
3,3,virginica,7.2,6.1,3.6,2.5
4,4,setosa,4.9,1.4,3.0,0.2


In [3]:
train.describe() 
# describe를 통해 각각 feature의 수, 평균, 표준편차 등을 간단히 확인할 수 있다

Unnamed: 0,id,sepal length (cm),petal length (cm),sepal width (cm),petal width (cm)
count,75.0,75.0,75.0,75.0,75.0
mean,37.0,6.044,4.258667,2.958667,1.384
std,21.794495,0.842653,1.61539,0.388376,0.697106
min,0.0,4.3,1.1,2.0,0.1
25%,18.5,5.55,3.75,2.8,1.0
50%,37.0,6.1,4.7,3.0,1.5
75%,55.5,6.6,5.3,3.2,1.85
max,74.0,7.7,6.9,4.0,2.5


In [4]:
train['species'] = train['species'].map({'setosa':0,
                                        'versicolor':1,
                                        'virginica':2
                                        })
# EDA를 통해 붓꽃의 종류에 따라 길이, 너비의 차이가 있다는 사실을 알기 때문에
# 이를 학습에 사용하기 위해 각각 0, 1, 2 숫자로 바꾸어주었습니다.
train

Unnamed: 0,id,species,sepal length (cm),petal length (cm),sepal width (cm),petal width (cm)
0,0,0,4.4,1.4,2.9,0.2
1,1,1,6.4,4.5,3.2,1.5
2,2,2,6.2,4.8,2.8,1.8
3,3,2,7.2,6.1,3.6,2.5
4,4,0,4.9,1.4,3.0,0.2
...,...,...,...,...,...,...
70,70,1,6.5,4.6,2.8,1.5
71,71,1,5.6,3.6,2.9,1.3
72,72,1,6.2,4.5,2.2,1.5
73,73,1,4.9,3.3,2.4,1.0


In [5]:
test = pd.read_csv('iris_test.csv') # 테스트 데이터 불러오기
test.head()

Unnamed: 0,id,species,sepal length (cm),petal length (cm)
0,0,setosa,5.4,1.7
1,1,setosa,5.7,1.5
2,2,setosa,5.3,1.5
3,3,setosa,5.1,1.9
4,4,virginica,6.0,4.8


In [6]:
test.describe()

Unnamed: 0,id,sepal length (cm),petal length (cm)
count,75.0,75.0,75.0
mean,37.0,5.642667,3.257333
std,21.794495,0.767441,1.776816
min,0.0,4.4,1.0
25%,18.5,5.1,1.5
50%,37.0,5.5,3.7
75%,55.5,6.15,5.05
max,74.0,7.9,6.4


In [7]:
test['species'] = test['species'].map({'setosa':0,
                                      'versicolor':1,
                                      'virginica':2
                                      })
# 학습 데이터와 마찬가지로 species를 숫자로 바꾸어주기
test

Unnamed: 0,id,species,sepal length (cm),petal length (cm)
0,0,0,5.4,1.7
1,1,0,5.7,1.5
2,2,0,5.3,1.5
3,3,0,5.1,1.9
4,4,2,6.0,4.8
...,...,...,...,...
70,70,1,5.8,4.0
71,71,0,4.4,1.3
72,72,2,6.3,5.6
73,73,1,5.1,3.0


# 1. LazyRegressor를 통한 여러 Regression 결과 확인하기 

In [8]:
# !pip install lazypredict

In [9]:
# import os # 중간에 pandas, numpy가 오류가 난다면 이 코드를 실행하고 처음부터 코드들 다시 실행하기
# os.kill(os.getpid(), 9)

In [13]:
from lazypredict.Supervised import LazyRegressor
from sklearn import datasets
from sklearn.utils import shuffle
import numpy as np
from sklearn.model_selection import train_test_split

X = train[['species', 'sepal length (cm)', 'petal length (cm)']] # 학습 데이터에서 학습에 사용되는 feature만 가져오기
Y = train[['sepal width (cm)', 'petal width (cm)']] # 학습 데이터에서 예측에 사용되는 target값들 가져오기

X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42) # train, test 데이터 나누기

In [14]:
def MAE(true, pred): # 우리가 사용할 평가지표 MAE 정의하기
    score = np.mean(np.abs(true-pred))
    return score

reg = LazyRegressor(verbose = 0, ignore_warnings = True, custom_metric=MAE) 
# LazyRegressor 모델을 생성하고, MAE를 custom_metric에 넣어주기
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
models

100%|██████████| 42/42 [00:00<00:00, 54.32it/s]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken,MAE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TransformedTargetRegressor,0.61,0.68,0.23,0.02,sepal width (cm) 0.22 petal width (cm) 0.1...
Lars,0.61,0.68,0.23,0.02,sepal width (cm) 0.22 petal width (cm) 0.1...
LinearRegression,0.61,0.68,0.23,0.01,sepal width (cm) 0.22 petal width (cm) 0.1...
RidgeCV,0.61,0.68,0.23,0.01,sepal width (cm) 0.22 petal width (cm) 0.1...
Ridge,0.6,0.67,0.23,0.02,sepal width (cm) 0.22 petal width (cm) 0.1...
KNeighborsRegressor,0.56,0.63,0.24,0.01,sepal width (cm) 0.23 petal width (cm) 0.1...
RANSACRegressor,0.55,0.63,0.24,0.02,sepal width (cm) 0.23 petal width (cm) 0.1...
OrthogonalMatchingPursuit,0.39,0.49,0.29,0.02,sepal width (cm) 0.25 petal width (cm) 0.1...
RandomForestRegressor,0.36,0.47,0.29,0.11,sepal width (cm) 0.27 petal width (cm) 0.1...
BaggingRegressor,0.29,0.41,0.3,0.05,sepal width (cm) 0.29 petal width (cm) 0.1...


In [17]:
# 가장 좋은 모델을 구글링해서 사용법 찾아보기
# 코드 작성자는 RANSCACRegressor의 RMSE와 MAE가 가장 낮아 선택하였다.
from sklearn.linear_model import RANSACRegressor

test = test[['species','sepal length (cm)','petal length (cm)']] # test 데이터에서 학습에 이용할 컬럼만 가져오기
reg = RANSACRegressor(random_state=0).fit(X,Y) # regressor 생성
reg.score(X, Y)

predict = reg.predict(test)

predict

array([[3.45350001, 0.2786458 ],
       [3.72760059, 0.23780715],
       [3.48679248, 0.22665065],
       [3.17939943, 0.31948445],
       [2.97092834, 1.89849883],
       [2.71028254, 1.96672963],
       [3.54699451, 0.22943977],
       [3.35293365, 0.19368026],
       [3.24598437, 0.21549415],
       [2.80847964, 1.40447736],
       [3.29273162, 0.19089114],
       [3.23252959, 0.18810202],
       [3.01129267, 1.98067525],
       [3.36681087, 2.3451404 ],
       [2.70791328, 1.31951182],
       [3.05803992, 1.95607223],
       [3.11212554, 0.18252377],
       [3.37250484, 1.99740999],
       [2.82193442, 1.43186949],
       [2.65715237, 2.09811206],
       [2.83775845, 2.10647944],
       [2.99879335, 0.31111708],
       [2.81486256, 1.29769793],
       [3.31964118, 0.24567542],
       [2.8814475 , 1.19370763],
       [3.13903509, 0.23730804],
       [3.27289393, 0.27027843],
       [3.4733377 , 0.19925851],
       [3.64048901, 0.18023375],
       [3.23226309, 2.07121903],
       [2.

In [18]:
df = pd.DataFrame(predict)
df

Unnamed: 0,0,1
0,3.45,0.28
1,3.73,0.24
2,3.49,0.23
3,3.18,0.32
4,2.97,1.90
...,...,...
70,2.92,1.28
71,3.04,0.15
72,2.78,2.10
73,2.97,1.01


In [20]:
submission = pd.read_csv('sample_submission.csv')
submission.head()

submission['sepal width (cm)'] = np.array(df[0], np.float32)
submission['petal width (cm)'] = np.array(df[1], np.float32)

submission # 예측 데이터 확인

Unnamed: 0,id,sepal width (cm),petal width (cm)
0,0,3.45,0.28
1,1,3.73,0.24
2,2,3.49,0.23
3,3,3.18,0.32
4,4,2.97,1.90
...,...,...,...
70,70,2.92,1.28
71,71,3.04,0.15
72,72,2.78,2.10
73,73,2.97,1.01


In [21]:
submission.to_csv('submission_hello.csv', index=False) # 제출 파일 저장