### 데이터 설명
* id : 샘플 아이디
* Species: 펭귄의 종을 나타내는 문자열
* Island : 샘플들이 수집된 Palmer Station 근처 섬 이름
* Clutch Completion : 관찰된 펭귄 둥지의 알이 2개인 경우 Full Clutch이며 Yes로 표기
* Culmen Length (mm) : 펭귄 옆모습 기준 부리의 가로 길이
* Culmen Depth (mm) : 펭귄 옆모습 기준 부리의 세로 길이
* Flipper Length (mm) : 펭귄의 팔(날개) 길이
* Sex : 펭귄의 성별
* Delta 15 N (o/oo)  : 토양에 따라 변화하는 안정 동위원소 15N:14N의 비율
* Delta 13 C (o/oo) : 먹이에 따라 변화하는 안정 동위원소 13C:12C의 비율
* Body Mass (g): 펭귄의 몸무게를 나타내는 숫자 (g)

## 1. 데이터 확인

In [1]:
import pandas as pd

test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')
sub = pd.read_csv('sample_submission.csv')

In [2]:
test.head()

Unnamed: 0,id,Species,Island,Clutch Completion,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo)
0,0,Chinstrap penguin (Pygoscelis antarctica),Dream,Yes,52.0,20.7,210.0,MALE,9.43146,-24.6844
1,1,Gentoo penguin (Pygoscelis papua),Biscoe,Yes,55.9,17.0,228.0,MALE,8.3118,-26.35425
2,2,Adelie Penguin (Pygoscelis adeliae),Dream,Yes,38.9,18.8,190.0,FEMALE,8.36936,-26.11199
3,3,Chinstrap penguin (Pygoscelis antarctica),Dream,Yes,45.2,16.6,191.0,FEMALE,9.62357,-24.78984
4,4,Adelie Penguin (Pygoscelis adeliae),Biscoe,No,37.9,18.6,172.0,FEMALE,8.38404,-25.19837


In [3]:
train.head()

Unnamed: 0,id,Species,Island,Clutch Completion,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Body Mass (g)
0,0,Gentoo penguin (Pygoscelis papua),Biscoe,Yes,50.0,15.3,220,MALE,8.30515,-25.19017,5550
1,1,Chinstrap penguin (Pygoscelis antarctica),Dream,No,49.5,19.0,200,MALE,9.63074,-24.34684,3800
2,2,Gentoo penguin (Pygoscelis papua),Biscoe,Yes,45.1,14.4,210,FEMALE,8.51951,-27.01854,4400
3,3,Gentoo penguin (Pygoscelis papua),Biscoe,Yes,44.5,14.7,214,FEMALE,8.20106,-26.16524,4850
4,4,Gentoo penguin (Pygoscelis papua),Biscoe,No,49.6,16.0,225,MALE,8.38324,-26.84272,5700


In [4]:
sub.head()

Unnamed: 0,id,Body Mass (g)
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [5]:
train.describe()

Unnamed: 0,id,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Delta 15 N (o/oo),Delta 13 C (o/oo),Body Mass (g)
count,114.0,114.0,114.0,114.0,111.0,111.0,114.0
mean,56.5,44.613158,17.014912,203.052632,8.737634,-25.723051,4327.850877
std,33.052988,5.321829,1.941363,14.653425,0.567698,0.859786,781.766484
min,0.0,33.5,13.2,174.0,7.6322,-27.01854,2700.0
25%,28.25,40.325,15.225,190.0,8.272585,-26.434025,3675.0
50%,56.5,45.2,17.25,199.0,8.63259,-25.95541,4250.0
75%,84.75,49.075,18.6,216.0,9.264635,-25.005945,4850.0
max,113.0,55.1,21.1,231.0,10.02544,-24.10255,6300.0


In [6]:
# 몸무게에 영향을 주는 요인 확인 : 상관계수 분석
train.corr()

# 날개 길이 (0.86), 부리 가로길이 (0.57), 토양(-0.54)

Unnamed: 0,id,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Delta 15 N (o/oo),Delta 13 C (o/oo),Body Mass (g)
id,1.0,0.078787,-0.144788,0.077909,-0.035484,0.024416,0.089186
Culmen Length (mm),0.078787,1.0,-0.251255,0.672205,-0.040205,0.114983,0.572063
Culmen Depth (mm),-0.144788,-0.251255,1.0,-0.603341,0.650629,0.477688,-0.490643
Flipper Length (mm),0.077909,0.672205,-0.603341,1.0,-0.533212,-0.454315,0.864814
Delta 15 N (o/oo),-0.035484,-0.040205,0.650629,-0.533212,1.0,0.641107,-0.548678
Delta 13 C (o/oo),0.024416,0.114983,0.477688,-0.454315,0.641107,1.0,-0.468425
Body Mass (g),0.089186,0.572063,-0.490643,0.864814,-0.548678,-0.468425,1.0


## 2. 결측치 확인
따라하기에서 사용된 함수 참고

In [7]:
def check_missing_col(df):
    missing_col = []
    cnt_missing_col = 0
    for i, col in enumerate(df.columns):
        missing_values = sum(df[col].isna())
        is_missing = True if missing_values >= 1 else False
        if is_missing:
            cnt_missing_col += 1
            print(f'{col} 컬럼에 {missing_values} 개의 결측치 존재')
            missing_col.append([col, df[col].dtype])
    if cnt_missing_col == 0:
        print('결측치 존재 x')
    return missing_col
                  
missing_col = check_missing_col(train)

Sex 컬럼에 3 개의 결측치 존재
Delta 15 N (o/oo) 컬럼에 3 개의 결측치 존재
Delta 13 C (o/oo) 컬럼에 3 개의 결측치 존재


### 3. 결측치 처리
따라하기에 사용된 함수 참고 

In [8]:
train[train.isna().sum(axis=1) > 0]

Unnamed: 0,id,Species,Island,Clutch Completion,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Body Mass (g)
6,6,Adelie Penguin (Pygoscelis adeliae),Torgersen,Yes,42.0,20.2,190,,9.13362,-25.09368,4250
8,8,Adelie Penguin (Pygoscelis adeliae),Torgersen,Yes,34.1,18.1,193,,,,3475
18,18,Adelie Penguin (Pygoscelis adeliae),Dream,No,39.8,19.1,184,MALE,,,4650
70,70,Gentoo penguin (Pygoscelis papua),Biscoe,Yes,46.2,14.4,214,,8.24253,-26.8154,4650
109,109,Adelie Penguin (Pygoscelis adeliae),Torgersen,Yes,36.6,17.8,185,FEMALE,,,3700


In [9]:
def handle_na(data, missing_col):
    temp = data.copy()
    for col, dtype in missing_col:
        if dtype == 'O':    # 데이터 타입이 object 일 때
            temp = temp.dropna(subset = [col])
        elif dtype == int or dtype == float:
            temp.loc[:,col] = temp[col].fillna(temp[col].mean())
    return temp

data = handle_na(train, missing_col)

missing_col = check_missing_col(train)

Sex 컬럼에 3 개의 결측치 존재
Delta 15 N (o/oo) 컬럼에 3 개의 결측치 존재
Delta 13 C (o/oo) 컬럼에 3 개의 결측치 존재


## 4. 데이터 설명 방정식 만들기

* case 1 : Y = aX + b
* case 2 : Y = aX1 + bX2 + c
* case 3 : Y = aX1 + bX2 + cX3 + d

### \- case 1 진행

In [10]:
X = train['Flipper Length (mm)']
Y = train['Body Mass (g)']

import numpy as np
mean_X = np.mean(X)
mean_Y = np.mean(Y)

a = mean_Y/mean_X
a

# Y = 21.3139 * X + b 식을 세울 수 있다 

21.31393640919302

In [11]:
b = mean_Y - 21.3139 * mean_X
b

# Y = 21.3139 * X + 0.0074 식 도출

0.007392982456622121

In [13]:
predict_Y = 21.3139 * X + 0.0074

## 8. 예측 결과 평가 (case1)

In [15]:
def RMSE(true, pred):
    score = np.sqrt(np.mean(np.square(true-predict_Y)))
    return score

real_answer = Y.copy()

error = RMSE(real_answer, predict_Y)
print(error)

532.8047155506056


## 9. test_data 예측

In [18]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,Species,Island,Clutch Completion,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo)
0,0,Chinstrap penguin (Pygoscelis antarctica),Dream,Yes,52.0,20.7,210.0,MALE,9.43146,-24.6844
1,1,Gentoo penguin (Pygoscelis papua),Biscoe,Yes,55.9,17.0,228.0,MALE,8.3118,-26.35425
2,2,Adelie Penguin (Pygoscelis adeliae),Dream,Yes,38.9,18.8,190.0,FEMALE,8.36936,-26.11199
3,3,Chinstrap penguin (Pygoscelis antarctica),Dream,Yes,45.2,16.6,191.0,FEMALE,9.62357,-24.78984
4,4,Adelie Penguin (Pygoscelis adeliae),Biscoe,No,37.9,18.6,172.0,FEMALE,8.38404,-25.19837


In [19]:
test_X = test['Flipper Length (mm)']

predict_test = 21.3139 * test_X + 0.0074

predict_test

0      4475.9264
1      4859.5766
2      4049.6484
3      4070.9623
4      3665.9982
         ...    
223    4326.7291
224    4625.1237
225    4475.9264
226    4284.1013
227    4134.9040
Name: Flipper Length (mm), Length: 228, dtype: float64

## 10. 제출용 파일 저장

In [21]:
submission = pd.read_csv('sample_submission.csv')
submission.head()

Unnamed: 0,id,Body Mass (g)
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [23]:
submission['Body Mass (g)'] = predict_test
submission

Unnamed: 0,id,Body Mass (g)
0,0,4475.9264
1,1,4859.5766
2,2,4049.6484
3,3,4070.9623
4,4,3665.9982
...,...,...
223,223,4326.7291
224,224,4625.1237
225,225,4475.9264
226,226,4284.1013


In [27]:
submission.to_csv('submission2.csv', index = False)

In [28]:
read = pd.read_csv('submission2.csv')
read

Unnamed: 0,id,Body Mass (g)
0,0,4475.9264
1,1,4859.5766
2,2,4049.6484
3,3,4070.9623
4,4,3665.9982
...,...,...
223,223,4326.7291
224,224,4625.1237
225,225,4475.9264
226,226,4284.1013
