In [33]:
import os
import platform

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm

# Korean font setup and minus-sign rendering for plots

import matplotlib
import seaborn as sns
from matplotlib import font_manager, rc
from matplotlib import pyplot as plt

# 경로 설정

In [38]:
# Directory holding the competition CSVs. The PARKINGLOT_DATA_DIR environment
# variable overrides the location; the original author's local folder stays
# the default so an existing setup keeps working unchanged.
PATH = os.environ.get('PARKINGLOT_DATA_DIR', r'C:\Users\JY\JYC\Projects\parkingLot\data')

# os.path.join uses the host OS separator; the previous hard-coded backslash
# only produced a valid path on Windows.
age_gender = pd.read_csv(os.path.join(PATH, 'age_gender_info.csv'))
train = pd.read_csv(os.path.join(PATH, 'train.csv'))
test = pd.read_csv(os.path.join(PATH, 'test.csv'))
submission = pd.read_csv(os.path.join(PATH, 'sample_submission.csv'))

# 폰트 설정

In [12]:
# Choose a font family that can render Korean labels in matplotlib figures.
system_name = platform.system()
if system_name == 'Windows':
    # Windows: resolve the family name from the Malgun Gothic font file.
    font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
    rc('font', family=font_name)
elif system_name == 'Darwin':
    # macOS ships AppleGothic.
    rc('font', family='AppleGothic')
else:
    # Other systems (e.g. Linux) do not have AppleGothic; switch to NanumGothic
    # only when matplotlib has actually discovered it, otherwise keep the
    # default font instead of selecting a family that cannot be found.
    installed_families = {entry.name for entry in font_manager.fontManager.ttflist}
    if 'NanumGothic' in installed_families:
        rc('font', family='NanumGothic')

# Draw the minus sign with an ASCII hyphen so it does not depend on the
# selected font providing the Unicode minus glyph.
matplotlib.rcParams['axes.unicode_minus'] = False

In [13]:
# Display the raw training frame as the cell output.
train

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,등록차량수
0,C2515,545,아파트,경상남도,국민임대,33.48,276,17,A,9216000,82940,0.0,3,624,205
1,C2515,545,아파트,경상남도,국민임대,39.60,60,17,A,12672000,107130,0.0,3,624,205
2,C2515,545,아파트,경상남도,국민임대,39.60,20,17,A,12672000,107130,0.0,3,624,205
3,C2515,545,아파트,경상남도,국민임대,46.90,38,17,A,18433000,149760,0.0,3,624,205
4,C2515,545,아파트,경상남도,국민임대,46.90,19,17,A,18433000,149760,0.0,3,624,205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2864,C2532,239,아파트,강원도,국민임대,49.20,19,7,A,11346000,116090,0.0,1,166,146
2865,C2532,239,아파트,강원도,국민임대,51.08,34,7,A,14005000,142310,0.0,1,166,146
2866,C2532,239,아파트,강원도,국민임대,51.73,34,7,A,14005000,142310,0.0,1,166,146
2867,C2532,239,아파트,강원도,국민임대,51.96,114,7,A,14005000,142310,0.0,1,166,146


In [14]:
# Count missing values per column in the training data.
train.isna().sum()

단지코드                              0
총세대수                              0
임대건물구분                            0
지역                                0
공급유형                              0
전용면적                              0
전용면적별세대수                          0
공가수                               0
자격유형                              0
임대보증금                           569
임대료                             569
도보 10분거리 내 지하철역 수(환승노선 수 반영)    207
도보 10분거리 내 버스정류장 수                0
단지내주차면수                           0
등록차량수                             0
dtype: int64

In [15]:
# Count missing values per column in the test data.
test.isna().sum()

단지코드                              0
총세대수                              0
임대건물구분                            0
지역                                0
공급유형                              0
전용면적                              0
전용면적별세대수                          0
공가수                               0
자격유형                              2
임대보증금                           180
임대료                             180
도보 10분거리 내 지하철역 수(환승노선 수 반영)     42
도보 10분거리 내 버스정류장 수                0
단지내주차면수                           0
dtype: int64

# 컬럼명 변경

In [16]:
# Short column names shared by both frames. The assignment is positional, so
# the order here must match the column order of the loaded CSVs. The test frame
# has no label column and therefore takes every name except the trailing
# '등록차량수'.
short_names = [
    '단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수', '신분',
    '임대보증금', '임대료', '지하철', '버스',
    '단지내주차면수', '등록차량수',
]

train.columns = short_names
test.columns = short_names[:-1]

# 지역명 숫자로 매핑

In [18]:
# Give each region name an integer code, numbered in order of first appearance
# in the training data.
local_map = {region: code for code, region in enumerate(train['지역'].unique())}

In [19]:
# Replace region names with the integer codes from local_map. Names that are
# not keys of the map (e.g. a region that appears only in test) become NaN.
# The numeric-dtype guard keeps this cell re-runnable: mapping an already
# encoded column a second time would match no keys and turn every value
# into NaN.
for frame in (train, test):
    if not pd.api.types.is_numeric_dtype(frame['지역']):
        frame['지역'] = frame['지역'].map(local_map)

# 전용면적 5의 배수로 변경

In [24]:
# Round every floor area down to a multiple of 5 (floor-divide, then scale back).
for frame in (train, test):
    frame['전용면적'] = frame['전용면적'].floordiv(5).mul(5)

# 전용면적 상/하한 적용

In [25]:
# Clamp the bucketed floor areas into [15, 100]: anything above 100 becomes
# 100 and anything below 15 becomes 15, in both frames.
min_area, max_area = 15, 100
for frame in (train, test):
    frame['전용면적'] = frame['전용면적'].clip(lower=min_area, upper=max_area)

In [26]:
# List the distinct floor-area values present in the test frame.
test['전용면적'].unique()

array([ 35.,  45.,  50.,  30.,  55.,  25.,  75., 100.,  15.,  20.,  40.,
        60.,  80.,  70.])

# 단지별 데이터 1차원으로 취합

In [27]:
# Complex-level columns carried over unchanged into the aggregated frames.
columns = ['단지코드', '총세대수', '공가수', '지역', '단지내주차면수', '지하철', '버스']
# Label column name.
target = '등록차량수'
# One feature column per distinct floor-area value seen in train, e.g. '면적_30.0'.
area_columns = [f'면적_{area}' for area in train['전용면적'].unique()]

In [28]:
# Start from two independent empty frames that will hold one row per complex.
new_train, new_test = pd.DataFrame(), pd.DataFrame()

In [29]:
def _aggregate_by_complex(frame, base_columns, area_columns, target=None):
    """Collapse row-level data into one feature row per complex code.

    Parameters
    ----------
    frame : pd.DataFrame
        Row-level data. Must contain '단지코드', '전용면적', '전용면적별세대수',
        every name in ``base_columns`` and, when given, ``target``.
    base_columns : list of str
        Columns copied from the first row of each complex.
    area_columns : list of str
        Names of the form '면적_<area>'. Each one receives the sum of
        '전용면적별세대수' over the complex's rows whose '전용면적' equals
        ``float(<area>)``.
    target : str, optional
        Label column copied from the first row of each complex and placed last.

    Returns
    -------
    pd.DataFrame
        One row per complex in order of first appearance, with a 0..n-1 index
        and columns ordered ``base_columns + area_columns (+ target)``.
        Numeric columns are float64.
    """
    ordered_columns = list(base_columns) + list(area_columns)
    if target is not None:
        ordered_columns.append(target)

    # Parse the area value out of each column name once, not once per complex.
    area_values = {col: float(col.split('_')[-1]) for col in area_columns}

    records = []
    # sort=False keeps complexes in order of first appearance (same order as
    # unique()); wrapping the groupby directly lets tqdm know the total.
    for _, rows in tqdm(frame.groupby('단지코드', sort=False)):
        record = {col: rows[col].iloc[0] for col in base_columns}
        for col, area in area_values.items():
            record[col] = rows.loc[rows['전용면적'] == area, '전용면적별세대수'].sum()
        if target is not None:
            record[target] = rows[target].iloc[0]
        records.append(record)

    # Build the frame once from the collected records instead of enlarging it
    # one cell at a time with .loc, which is far slower.
    result = pd.DataFrame(records, columns=ordered_columns)

    # Store numeric columns as float64 so the values look the same as the
    # frames produced by the previous cell-by-cell construction.
    numeric_columns = result.select_dtypes(include='number').columns
    result[numeric_columns] = result[numeric_columns].astype('float64')
    return result


new_train = _aggregate_by_complex(train, columns, area_columns, target='등록차량수')
new_test = _aggregate_by_complex(test, columns, area_columns)

411it [00:07, 52.67it/s]
150it [00:02, 51.56it/s]


In [30]:
# Display the aggregated per-complex training frame as the cell output.
new_train

Unnamed: 0,단지코드,총세대수,공가수,지역,단지내주차면수,지하철,버스,면적_30.0,면적_35.0,면적_45.0,...,면적_25.0,면적_70.0,면적_15.0,면적_20.0,면적_100.0,면적_60.0,면적_75.0,면적_80.0,면적_65.0,등록차량수
0,C2515,545.0,17.0,0.0,624.0,0.0,3.0,276.0,80.0,57.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,205.0
1,C1407,1216.0,13.0,1.0,1285.0,1.0,1.0,390.0,0.0,340.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1064.0
2,C1945,755.0,6.0,2.0,734.0,1.0,3.0,0.0,240.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,730.0
3,C1470,696.0,14.0,3.0,645.0,0.0,2.0,0.0,254.0,196.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,553.0
4,C1898,566.0,9.0,3.0,517.0,0.0,6.0,0.0,271.0,209.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,415.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406,C2586,90.0,7.0,8.0,66.0,0.0,3.0,0.0,36.0,0.0,...,42.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,57.0
407,C2035,492.0,24.0,4.0,521.0,0.0,1.0,0.0,156.0,180.0,...,156.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,246.0
408,C2020,40.0,7.0,7.0,25.0,1.0,2.0,0.0,15.0,0.0,...,5.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0
409,C2437,90.0,12.0,10.0,30.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,90.0,0.0,0.0,0.0,0.0,0.0,16.0


# 결측치 처리

In [31]:
# Replace every remaining missing value in both aggregated frames with -1.
missing_marker = -1
new_train = new_train.fillna(value=missing_marker)
new_test = new_test.fillna(value=missing_marker)

# 학습

In [32]:
# Positional split: the first column is skipped, the last training column is
# the label, and everything in between is a feature. The test frame has no
# label column, so it keeps every column after the first.
id_position, label_position = 0, -1
x_train = new_train.iloc[:, id_position + 1:label_position]
y_train = new_train.iloc[:, label_position]
x_test = new_test.iloc[:, id_position + 1:]

In [34]:
# Random forest regressor with scikit-learn defaults apart from two settings:
# n_jobs=-1 (scikit-learn: use all processors) and a fixed random_state so
# repeated runs build the same forest.
model = RandomForestRegressor(n_jobs=-1, random_state=42)

In [35]:
# Fit on the per-complex features; as the last expression of the cell, the
# call's return value is also shown as the cell output.
model.fit(x_train, y_train)

RandomForestRegressor(n_jobs=-1, random_state=42)

# 추론 및 제출

In [36]:
# Predict the label for every row of the aggregated test features.
pred = model.predict(x_test)

In [39]:
# Store the predictions in the 'num' column of the submission frame.
# NOTE(review): the assignment is positional, so it assumes the submission rows
# are in the same order as the complexes in x_test — confirm against the
# sample submission file.
submission['num'] = pred

In [40]:
# Write the submission to 'baseline.csv' (relative path) without the index column.
submission.to_csv('baseline.csv', index=False)