## 부동산 실거래가 예측 대회

### 목적

### 라이브러리 Import

In [2]:
# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Register the Nanum TrueType font with Matplotlib and make it the default family.
# NOTE(review): the alias registered here is 'NanumBarunGothic' although the file is NanumGothic.ttf.
fe = fm.FontEntry(
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf', # path where the ttf file is stored
    name='NanumBarunGothic')                        # name under which this font is registered
fm.fontManager.ttflist.insert(0, fe)              # add the font to Matplotlib's font list
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'}) # font settings
plt.rc('font', family='NanumBarunGothic')  # sets font.family again, to the same value as the line above
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')

# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import eli5
from eli5.sklearn import PermutationImportance

In [None]:
!pip install eli5==0.13.0

# 한글 폰트 사용을 위한 라이브러리입니다.
!apt-get install -y fonts-nanum

### Stage 1. 데이터 확인

In [1]:
import os

# Show the kernel's working directory; the data paths in this notebook are relative to it.
print("Current Working Directory:", os.getcwd())

Current Working Directory: /data/ephemeral/home/ML1_DY/code


1. 데이터 불러오기

In [8]:
# Load the required data. Adjust the paths to match your environment.
train_path = '../data/train.csv'
test_path  = '../data/test.csv'
subway_feature_path = '../data/subway_feature.csv'
bus_feature_path = '../data/bus_feature.csv'
sample_submission_path = '../data/sample_submission.csv'



# Read each CSV into its own DataFrame.
dt_train = pd.read_csv(train_path)
dt_test = pd.read_csv(test_path)
dt_subway_feature = pd.read_csv(subway_feature_path)
dt_bus_feature = pd.read_csv(bus_feature_path)
dt_sample_submission = pd.read_csv(sample_submission_path)



In [7]:
# Load train_latlon_complete.csv into its own DataFrame.
# NOTE(review): presumably the training data with coordinates filled in - confirm against how the file was produced.
train_geo_path = '../data/train_latlon_complete.csv'
dt_train_geo = pd.read_csv(train_geo_path)

In [8]:
# Display dt_train_geo; the rendered output elides the middle rows and columns.
dt_train_geo

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,건축면적,주차대수,기타/의무/임대/임의=1/2/3/4,단지승인일,사용허가여부,관리비 업로드,좌표X,좌표Y,단지신청일,target
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,8,3,1987,...,4858.0,262.0,임의,2022-11-17 13:00:29.0,Y,N,127.05721,37.476763,2022-11-17 10:19:06.0,124000.0
1,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,22,4,1987,...,4858.0,262.0,임의,2022-11-17 13:00:29.0,Y,N,127.05721,37.476763,2022-11-17 10:19:06.0,123500.0
2,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,54.98,201712,28,5,1987,...,4858.0,262.0,임의,2022-11-17 13:00:29.0,Y,N,127.05721,37.476763,2022-11-17 10:19:06.0,91500.0
3,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,3,4,1987,...,4858.0,262.0,임의,2022-11-17 13:00:29.0,Y,N,127.05721,37.476763,2022-11-17 10:19:06.0,130000.0
4,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,8,2,1987,...,4858.0,262.0,임의,2022-11-17 13:00:29.0,Y,N,127.05721,37.476763,2022-11-17 10:19:06.0,117000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
988727,서울특별시 마포구 성산동,446,446.0,0.0,성산시영,50.03,200911,8,9,1986,...,0.0,1217.0,의무,2013-05-28 11:22:18.0,Y,N,126.90282,37.568465,2013-03-07 09:46:18.0,38900.0
988728,서울특별시 마포구 성산동,446,446.0,0.0,성산시영,50.03,200911,8,11,1986,...,0.0,1217.0,의무,2013-05-28 11:22:18.0,Y,N,126.90282,37.568465,2013-03-07 09:46:18.0,39000.0
988729,서울특별시 마포구 성산동,446,446.0,0.0,성산시영,50.03,200911,16,6,1986,...,0.0,1217.0,의무,2013-05-28 11:22:18.0,Y,N,126.90282,37.568465,2013-03-07 09:46:18.0,39000.0
988730,서울특별시 마포구 성산동,446,446.0,0.0,성산시영,50.03,200912,21,4,1986,...,0.0,1217.0,의무,2013-05-28 11:22:18.0,Y,N,126.90282,37.568465,2013-03-07 09:46:18.0,38000.0


2. 데이터 확인

In [9]:
# Train data와 Test data shape은 아래와 같습니다.
print('Train data shape : ', dt_train.shape, 'Test data shape : ', dt_test.shape)

Train data shape :  (1118822, 52) Test data shape :  (9272, 51)


In [10]:
# Preview the first rows of the training data.
dt_train.head()

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,건축면적,주차대수,기타/의무/임대/임의=1/2/3/4,단지승인일,사용허가여부,관리비 업로드,좌표X,좌표Y,단지신청일,target
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,8,3,1987,...,4858.0,262.0,임의,2022-11-17 13:00:29.0,Y,N,127.05721,37.476763,2022-11-17 10:19:06.0,124000
1,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,22,4,1987,...,4858.0,262.0,임의,2022-11-17 13:00:29.0,Y,N,127.05721,37.476763,2022-11-17 10:19:06.0,123500
2,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,54.98,201712,28,5,1987,...,4858.0,262.0,임의,2022-11-17 13:00:29.0,Y,N,127.05721,37.476763,2022-11-17 10:19:06.0,91500
3,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,3,4,1987,...,4858.0,262.0,임의,2022-11-17 13:00:29.0,Y,N,127.05721,37.476763,2022-11-17 10:19:06.0,130000
4,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,8,2,1987,...,4858.0,262.0,임의,2022-11-17 13:00:29.0,Y,N,127.05721,37.476763,2022-11-17 10:19:06.0,117000


3. 데이터 타입 확인

In [11]:
# Print each column's non-null count and dtype for the training data.
dt_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1118822 entries, 0 to 1118821
Data columns (total 52 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   시군구                     1118822 non-null  object 
 1   번지                      1118597 non-null  object 
 2   본번                      1118747 non-null  float64
 3   부번                      1118747 non-null  float64
 4   아파트명                    1116696 non-null  object 
 5   전용면적(㎡)                 1118822 non-null  float64
 6   계약년월                    1118822 non-null  int64  
 7   계약일                     1118822 non-null  int64  
 8   층                       1118822 non-null  int64  
 9   건축년도                    1118822 non-null  int64  
 10  도로명                     1118822 non-null  object 
 11  해제사유발생일                 5983 non-null     float64
 12  등기신청일자                  1118822 non-null  object 
 13  거래유형                    1118822 non-null  object 
 14  중개

In [16]:
# Collect the names of the numeric-dtype columns of the training data and display them.
numeric_column_names = dt_train.select_dtypes(include='number').columns
numeric_column_names

Index(['본번', '부번', '전용면적(㎡)', '계약년월', '계약일', '층', '건축년도', '해제사유발생일',
       '단지소개기존clob', 'k-전체동수', 'k-전체세대수', 'k-연면적', 'k-주거전용면적', 'k-관리비부과면적',
       'k-전용면적별세대현황(60㎡이하)', 'k-전용면적별세대현황(60㎡~85㎡이하)', 'k-85㎡~135㎡이하',
       'k-135㎡초과', '건축면적', '주차대수', '좌표X', '좌표Y', 'target'],
      dtype='object')

4. 데이터 통계값 확인

- describe() 함수 사용해서 수치형 데이터의 통계값 확인하기

In [17]:
def str_to_int(string):
    """Convert a comma-grouped numeric string such as ``'1,234'`` to ``int``.

    Args:
        string: Value to convert. Any ``str`` instance, including ``str``
            subclasses such as ``numpy.str_``, is parsed.

    Returns:
        The parsed ``int`` when ``string`` is a ``str``; otherwise ``string``
        itself, unchanged, so values that are already numeric pass through.

    Raises:
        ValueError: If the text is not a valid integer once commas are removed.
    """
    # isinstance rather than ``type(...) == str`` so that str subclasses are parsed too.
    if isinstance(string, str):
        return int(string.replace(',', ''))
    return string

# Convert any comma-formatted string targets to int; non-string values pass through unchanged.
dt_train['target'] = dt_train['target'].apply(str_to_int)

# Summary statistics for the area, floor and target columns.
columns = ['전용면적(㎡)', '층', 'target']
dt_train[columns].describe()

Unnamed: 0,전용면적(㎡),층,target
count,1118822.0,1118822.0,1118822.0
mean,77.17475,8.871968,57991.53
std,29.36423,5.982584,46426.02
min,10.02,-4.0,350.0
25%,59.65,4.0,30500.0
50%,81.88,8.0,44800.0
75%,84.96,12.0,69800.0
max,424.32,69.0,1450000.0


5. 테스트 데이터 확인

In [18]:
# Preview the first 10 rows of the test data.
dt_test.head(10)

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,청소비관리형태,건축면적,주차대수,기타/의무/임대/임의=1/2/3/4,단지승인일,사용허가여부,관리비 업로드,좌표X,좌표Y,단지신청일
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,202307,26,5,1987,...,직영,4858.0,262.0,임의,2022-11-17 13:00:29.0,Y,N,127.05721,37.476763,2022-11-17 10:19:06.0
1,서울특별시 강남구 개포동,651-1,651.0,1.0,개포더샵트리에,108.2017,202308,15,10,2021,...,위탁,2724.46,305.0,의무,2022-02-23 13:01:10.0,Y,N,127.056394,37.484892,2022-02-23 11:05:05.0
2,서울특별시 강남구 개포동,652,652.0,0.0,개포우성3차,161.0,202307,28,15,1984,...,위탁,61064.24,419.0,의무,1984-12-22 00:00:00.0,Y,N,127.05599,37.483894,2013-03-07 09:46:28.0
3,서울특별시 강남구 개포동,652,652.0,0.0,개포우성3차,133.46,202308,10,14,1984,...,위탁,61064.24,419.0,의무,1984-12-22 00:00:00.0,Y,N,127.05599,37.483894,2013-03-07 09:46:28.0
4,서울특별시 강남구 개포동,652,652.0,0.0,개포우성3차,104.43,202308,18,6,1984,...,위탁,61064.24,419.0,의무,1984-12-22 00:00:00.0,Y,N,127.05599,37.483894,2013-03-07 09:46:28.0
5,서울특별시 강남구 개포동,187,187.0,0.0,개포주공5단지,74.25,202307,28,8,1983,...,위탁,6524.0,652.0,의무,2015-03-02 11:08:24.0,Y,N,127.068028,37.487802,2013-03-07 09:46:42.0
6,서울특별시 강남구 개포동,185,185.0,0.0,개포주공6단지,83.21,202307,2,11,1983,...,위탁,0.0,1100.0,의무,2018-03-19 10:55:24.0,Y,N,127.072682,37.489122,2013-03-07 09:46:38.0
7,서울특별시 강남구 개포동,185,185.0,0.0,개포주공6단지,73.02,202308,14,12,1983,...,위탁,0.0,1100.0,의무,2018-03-19 10:55:24.0,Y,N,127.072682,37.489122,2013-03-07 09:46:38.0
8,서울특별시 강남구 개포동,185,185.0,0.0,개포주공6단지,53.06,202308,24,9,1983,...,위탁,0.0,1100.0,의무,2018-03-19 10:55:24.0,Y,N,127.072682,37.489122,2013-03-07 09:46:38.0
9,서울특별시 강남구 개포동,1280,1280.0,0.0,래미안블레스티지,126.928,202307,3,26,2019,...,위탁,16155.05,3153.0,의무,2019-03-14 11:33:30.0,Y,N,127.064151,37.480049,2019-03-09 05:56:52.0


6. 제출 데이터 확인

In [19]:
# Preview the first 10 rows of the sample submission.
dt_sample_submission.head(10)

Unnamed: 0,target
0,179048
1,84820
2,248141
3,180991
4,295430
5,229921
6,92951
7,126950
8,240472
9,280156


7. 타겟의 평균값 구하기

In [20]:
# dt_train['target'] was already passed through str_to_int in the statistics cell
# above, so it is used as-is here; str_to_int is not redefined in this cell.

# Baseline: fill every submission row with the rounded mean of the training target.
mean_apt_price = round(dt_train['target'].mean())
dt_sample_submission['target'] = mean_apt_price
dt_sample_submission.head()

Unnamed: 0,target
0,57992
1,57992
2,57992
3,57992
4,57992


8. 제출 파일 생성

In [21]:
# Write the submission frame to CSV without the index column.
dt_sample_submission.to_csv('../data/submission_average.csv', index=False)