In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rc('font', family='Malgun Gothic')

from scipy import stats
from tqdm import tqdm

In [2]:
pd.set_option('display.max_row', 500)
pd.set_option('display.max_columns', 100)

In [3]:
PATH = r'C:\Users\JY\JYC\Projects\parkingLot\data'

rawtrain = pd.read_csv(PATH + '\\train.csv')
rawtest = pd.read_csv(PATH + '\\test.csv')
age_gender = pd.read_csv(PATH + '\\age_gender_info.csv', index_col=0)
sample_submission = pd.read_csv(PATH + '\\sample_submission.csv')
car_count = pd.read_csv(PATH + '\\car_count.csv', index_col = 0)

###
### 지역정보 join (운전가능연령 : 20~70대) : 여자운전비율과 남자운전비율을 계산하여 삽입

In [12]:
female = []
male = []
for col in age_gender.columns:
    female.append(col) if '여자' in col else male.append(col)

female_ratio = pd.DataFrame(age_gender[female].iloc[:,2:8].sum(axis=1), columns=['여자운전비율'])
male_ratio = pd.DataFrame(age_gender[male].iloc[:, 2:8].sum(axis=1), columns=['남자운전비율'])

rawtrain = pd.merge(rawtrain, female_ratio, on ='지역')
rawtrain = pd.merge(rawtrain, male_ratio, on = '지역')
rawtest = pd.merge(rawtest, female_ratio, on='지역')
rawtest = pd.merge(rawtest, male_ratio, on= '지역')

In [17]:
# rawtrain[rawtrain['임대보증금'] == '-']
_t1 = rawtrain.groupby('단지코드').sum()['전용면적별세대수']

pandas.core.series.Series

In [29]:
def exception():
    # 1. 전용면적별 세대수 합계 join
    _t1 = rawtrain.groupby('단지코드').sum()['전용면적별세대수'] ### 이건 무슨 과정인가요?
    _t2 = pd.merge(rawtrain, _t1, on = '단지코드')
    _t2['전용면적별세대수'] = _t2['전용면적별세대수_x']
    _t2['총세대수'] = _t2['전용면적별세대수_y'] 
    _final1 = _t2.drop(['전용면적별세대수_x', '전용면적별세대수_y'], axis = 'columns')

    # 2. 제외할 코드들 제외
    train = _final1[~_final1['단지코드'].isin(['C2085', 'C1397', 'C2431', 'C1649', 'C1036', 'C1095', 
                                             'C2051', 'C1218', 'C1894', 'C2483', 'C1502', 'C1988'])]
    test = rawtest[~rawtest['단지코드'].isin(['C2675', 'C2335', 'C1327'])] 

    return train, test                                            

In [19]:
train, test = exception()

###
### 결측치 처리

In [40]:
def nanprocess_v1(usetrain):

    """
        도보 10분거리내 지하철역 수: "지역별로" 결측치 제외 평균 0.5 미만이면 0, 이상이면 1 부여
        cf. 대전과 충청남도에만 결측치 있음    
        
    """
    cities = list(set(usetrain['지역']))
    aparts = list(set(usetrain['단지코드']))
    _pre1 = usetrain.copy()
    
    _col = '도보 10분거리 내 지하철역 수(환승노선 수 반영)'
    for city in tqdm(cities):
        _aparts_in_city = usetrain[usetrain['지역'] == city]
        _codes_in_city = list(set(_aparts_in_city['단지코드']))
        
        _mean = np.nanmean(_aparts_in_city[_col])
        for code in _codes_in_city:
            _idx = _pre1[_pre1['단지코드']==code].index
            if usetrain[usetrain['단지코드'].isin(_codes_in_city)][_col].isnull().sum() > 0:
                if _mean >= 0.5:
                    _pre1.loc[_idx, _col] = 1
                else:
                    _pre1.loc[_idx, _col] = 0
                    
    """
        임대보증금 및 임대료: 결측치 제외하고, 단지별로, 전용면적과 선형회귀분석 진행하여 결측치 처리
        cf. 임대보증금, 임대료에 '-' 값 있음: 결측치로 처리 
        cf. C2152 아파트의 경우 임대보증금/임대료 데이터가 아예 결측 -> 강원도평균으로 처리
        
    """        
    # 임대가치 지표 신규 설정
    _pre2 = _pre1.copy()
    _col = ['임대보증금', '임대료']

    for col in _col:
        _pre2 = _pre2.drop(_pre2[_pre2[col] == '-'].index)
        _pre2[col] = _pre2[col].astype(float)
    _pre2['임대가치'] = _pre2['임대보증금'] * _pre2['임대료']
    
    return _pre1, _pre2

In [41]:
_, prep_train = nanprocess_v1(train)
_forC2152, prep_test = nanprocess_v1(test)

# 강원도 임대가치 평균으로 C2152 결측치 처리
_forC2152['임대가치'] = np.mean(prep_test[prep_test['지역']=='강원도']['임대가치'])
prep_test = pd.concat([prep_test, _forC2152[_forC2152['단지코드'] == 'C2152']])

# 테스트 결측데이터 처리
prep_test.loc[400, '자격유형'] = 'A'
prep_test.loc[599, '자격유형'] = 'C'

prep = pd.concat([prep_train, prep_test])

100%|██████████| 16/16 [00:00<00:00, 19.95it/s]
100%|██████████| 15/15 [00:00<00:00, 53.00it/s]


###
### string 데이터 처리 / 상가 데이터 처리 (one-hot enc)

In [42]:
# 지역
local_map = {}
for i, loc in enumerate(prep['지역'].unique()):
    _arr = [0] * len(prep['지역'].unique())
    _arr[i] = 1
    local_map[loc] = _arr
    
# 공급유형
supply_map = {}
for i, loc in enumerate(prep['공급유형'].unique()):
    _arr = [0] * len(prep['공급유형'].unique())
    _arr[i] = 1
    supply_map[loc] = _arr
    
# 자격유형
qual_map = {}
for i, loc in enumerate(prep['자격유형'].unique()):
    _arr = [0] * len(prep['자격유형'].unique())
    _arr[i] = 1
    qual_map[loc] = _arr

In [43]:
def preprocess_v1(prep, type='train'):
    aparts = list(set(prep['단지코드']))
    merge_set = []
    for code in tqdm(aparts):
        final_vector = {}

        usedat = prep[prep['단지코드'] == code]
        onlyapart = usedat[usedat['임대건물구분'] == '아파트']
        
        if '상가' in set(usedat['임대건물구분']):
            sanga = 1
            sangadat = usedat[usedat['임대건물구분'] == '상가']
            apartdat = usedat[usedat['임대건물구분'] == '아파트']
            sanga_area = sum(sangadat['전용면적'] * sangadat['전용면적별세대수'])
            apart_area = sum(apartdat['전용면적'] * apartdat['전용면적별세대수'])
        else:
            sanga = 0
            sanga_area = 0.0
            apart_area = sum(usedat['전용면적'] * usedat['전용면적별세대수'])
        
        final_vector['단지코드'] = [usedat['단지코드'].iloc[0]]
        final_vector['총세대수'] = [usedat['총세대수'].iloc[0]]
        final_vector['상가'] = [sanga]
        final_vector['아파트면적'] = [apart_area]
        final_vector['상가면적'] = [sanga_area]
        
        _onehot = sum([np.array(local_map[key]) for key in usedat['지역'].unique()])    # 지역정보
        for tp in zip(list(local_map.keys()), list(_onehot)):
            final_vector[tp[0]] = tp[1]
            
        _onehot = sum([np.array(supply_map[key]) * usedat.iloc[idx]['전용면적별세대수'] for idx, key in enumerate(usedat['공급유형'])])    # 공급유형
        for tp in zip(supply_map.keys(), _onehot):
            final_vector[tp[0]] = tp[1]
            
        _onehot = sum([np.array(qual_map[key]) * usedat.iloc[idx]['전용면적별세대수'] for idx, key in enumerate(usedat['자격유형'])])    # 자격유형
        for tp in zip(qual_map.keys(), _onehot):
            final_vector[tp[0]] = tp[1]     

        final_vector['공가수'] = [usedat['공가수'].iloc[0]]            
        final_vector['임대가치'] = [usedat['임대가치'].iloc[0]]
        final_vector['지하철'] = [usedat['도보 10분거리 내 지하철역 수(환승노선 수 반영)'].iloc[0]]
        final_vector['버스'] = [usedat['도보 10분거리 내 버스정류장 수'].iloc[0]]
        final_vector['주차면수'] = [usedat['단지내주차면수'].iloc[0]]
        if type == 'train':
            final_vector['등록차량수'] = [usedat['등록차량수'].iloc[0]]
        
        del final_vector['공공분양']
        
        merge_set.append(pd.DataFrame(final_vector))
    
    return pd.concat(merge_set)

finaltrain = preprocess_v1(prep_train).dropna()
finaltest = preprocess_v1(prep_test, 'test')

100%|██████████| 410/410 [00:04<00:00, 101.66it/s]
100%|██████████| 147/147 [00:01<00:00, 106.60it/s]


In [44]:
finaltrain.head()

Unnamed: 0,단지코드,총세대수,상가,아파트면적,상가면적,경상남도,대전광역시,경기도,전라북도,강원도,광주광역시,충청남도,부산광역시,제주특별자치도,울산광역시,충청북도,전라남도,경상북도,대구광역시,서울특별시,세종특별자치시,국민임대,영구임대,임대상가,공공임대(50년),공공임대(10년),행복주택,공공임대(분납),공공임대(5년),A,C,D,E,H,I,L,K,J,B,G,N,M,O,F,공가수,임대가치,지하철,버스,주차면수,등록차량수
0,C1268,1035,0,46633.62,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1035,0,0,0,0,0,0,0,1035,0,0,0,0,0,0,0,0,0,0,0,0,0,0,21,2163645000000.0,0.0,16,911,934
0,C2536,72,0,2722.32,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,72,0,0,0,0,0,0,0,72,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,457329600000.0,0.0,1,54,47
0,C1866,338,0,13544.1,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,338,0,0,0,0,0,0,0,338,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,678538100000.0,0.0,3,235,135
0,C2289,1527,1,42397.74,1427.03,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1507,20,0,0,0,0,0,0,1507,20,0,0,0,0,0,0,0,0,0,0,0,0,2,462844300000.0,0.0,3,240,364
0,C2437,90,0,2234.7,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,90,0,0,0,0,0,0,0,90,0,0,0,0,0,0,0,0,0,0,0,0,0,12,1112505000000.0,0.0,1,30,16


In [45]:
finaltrain[finaltrain['임대가치']<5e12].shape

(357, 50)

###
### Normalization

In [47]:
means = {}
stds = {}
for col in finaltrain.columns.difference(['단지코드']):
    means[col] = np.mean(finaltrain[col])
    stds[col] = np.std(finaltrain[col])
    finaltrain[col] = (finaltrain[col] - means[col]) / stds[col]

    
for col in finaltest.columns.difference(['단지코드']):

    finaltest[col] = (finaltest[col] - means[col]) / stds[col]

###
### 학습시작

In [50]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error as MAE

import time

In [59]:
import pycaret
from pycaret.classification import *

ModuleNotFoundError: No module named 'scikitplot'

In [51]:
train_validation = {}
except_val = []

tmp_finaltrain = np.array(finaltrain[finaltrain.columns.difference(except_val)])
np.random.seed(777)
np.random.shuffle(np.array(tmp_finaltrain))
thres = int(tmp_finaltrain.shape[0] / 5)

x_train = pd.DataFrame(tmp_finaltrain[:-thres], columns=finaltrain.columns.difference(except_val))
x_validation = pd.DataFrame(tmp_finaltrain[-thres:], columns=finaltrain.columns.difference(except_val))

X = x_train[x_train.columns.difference(['등록차량수', '단지코드'])].astype(float)
y = x_train[['등록차량수']].astype(float)

def myLR():
    model = LinearRegression()
    fit = model.fit(X, y)
    print(f'Linear Regression Score: {MAE(y, fit.predict(X))}')

In [52]:
def myRegressor(regressor, param_grid):
    start = time.time()
    reg_grid = GridSearchCV(estimator=regressor,
                            param_grid=param_grid,
                            scoring='neg_mean_absolute_error',
                            n_jobs=60,
                            cv=5,
                            refit=True,
                            return_train_score=True)
    reg_grid.fit(X, y)
    result = pd.DataFrame(reg_grid.cv_results_)[
        ['params', 'mean_test_score', 'rank_test_score']
    ].sort_values(by='rank_test_score')
    print(f'소요시간: {round((time.time() - start) / 60, 2)}분')
    
    return result

In [53]:
def finalxgb(n_jobs, params):
    start = time.time()
    x_train, y_train = X.copy(), y.copy()
    xgb_clf = xgb.XGBRegressor()
    xgb_param_grid  = {
        'learning_rate': params['learning_rate'],
        'n_estimators': params['n_estimators'],
        'max_depth': [5],
        'min_child_weight': [1],
        'gamma': [0],
        'subsample': [0.8],
        'colsample_bytree': [0.8],
        'nthread': [-1],
        'scale_pos_weight': [1],
        'tree_method': ['gpu_hist'],
        'gpu_id': params['gpu_id'],
        'seed': [123]
    }

    def GridXGB(xgb_clf, xgb_param_grid):
        hr_grid = GridSearchCV(estimator=xgb_clf,
                           param_grid=xgb_param_grid,
                           scoring='neg_mean_absolute_error',
                           n_jobs=n_jobs,
                           cv=5,
                           refit=True,
                           return_train_score=True)
        hr_grid.fit(x_train, y_train)
        return hr_grid

    hr_grid = GridXGB(xgb_clf, xgb_param_grid)
    hr_grid_df = pd.DataFrame(hr_grid.cv_results_).sort_values(by='rank_test_score')
    print(f'Best MAE: {hr_grid_df.iloc[0]["mean_test_score"]}')

    s1 = xgb_clf
    s1.set_params(**hr_grid.best_params_)
    print(f'Learning Rate, n_estimators FINISHED !! Time Spent: {round((time.time() - start) / 60, 2)} mins')
    print(hr_grid.best_params_)

    xgb_param_grid  = {
        'max_depth': params['max_depth'],
        'min_child_weight': [1, 2, 3, 4, 5]
    }

    hr_grid = GridXGB(s1, xgb_param_grid)
    hr_grid_df = pd.DataFrame(hr_grid.cv_results_).sort_values(by='rank_test_score')
    print(f'Best MAE: {hr_grid_df.iloc[0]["mean_test_score"]}')

    s1 = xgb_clf
    s1.set_params(**hr_grid.best_params_)
    print(f'max_depth, min_child_weight FINISHED !! Time Spent: {round((time.time() - start) / 60, 2)} mins')
    print(hr_grid.best_params_)

    xgb_param_grid  = {
        'gamma': params['gamma']
    }

    hr_grid = GridXGB(s1, xgb_param_grid)
    hr_grid_df = pd.DataFrame(hr_grid.cv_results_).sort_values(by='rank_test_score')
    print(f'Best MAE: {hr_grid_df.iloc[0]["mean_test_score"]}')

    s1 = xgb_clf
    s1.set_params(**hr_grid.best_params_)
    print(f'gamma FINISHED !! Time Spent: {round((time.time() - start) / 60, 2)} mins')
    print(hr_grid.best_params_)


    xgb_param_grid  = {
        'subsample': params['subsample'],
        'colsample_bytree': params['colsample_bytree']
    }

    hr_grid = GridXGB(s1, xgb_param_grid)
    hr_grid_df = pd.DataFrame(hr_grid.cv_results_).sort_values(by='rank_test_score')
    print(f'Best MAE: {hr_grid_df.iloc[0]["mean_test_score"]}')

    s1 = xgb_clf
    s1.set_params(**hr_grid.best_params_)
    print(f'subsample, colsample_bytree FINISHED !! Time Spent: {round((time.time() - start) / 60, 2)} mins')
    print(hr_grid.best_params_)


    xgb_param_grid  = {
        'subsample': [i/100.0 for i in range(40,80)],
    }

    hr_grid = GridXGB(s1, xgb_param_grid)
    hr_grid_df = pd.DataFrame(hr_grid.cv_results_).sort_values(by='rank_test_score')
    print(f'Best MAE: {hr_grid_df.iloc[0]["mean_test_score"]}')

    s1 = xgb_clf
    s1.set_params(**hr_grid.best_params_)
    print(f'subsample FINISHED !! Time Spent: {round((time.time() - start) / 60, 2)} mins')
    print(hr_grid.best_params_)

    xgb_param_grid = {
     'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
    }

    hr_grid = GridXGB(s1, xgb_param_grid)
    hr_grid_df = pd.DataFrame(hr_grid.cv_results_).sort_values(by='rank_test_score')
    print(f'Best MAE: {hr_grid_df.iloc[0]["mean_test_score"]}')

    s1 = xgb_clf
    s1.set_params(**hr_grid.best_params_)
    print(f'Regularization FINISHED !! Time Spent: {round((time.time() - start) / 60, 2)} mins')
    print(hr_grid.best_params_)

    return hr_grid

In [54]:
rf_params = {
    'n_estimators': [10, 20, 30, 40, 50],
    'criterion': ['mae'],
    'max_depth': [20, 30, 40]
}

svr_params = {
    'kernel': ['rbf'],
    'C': [0.1, 1, 2, 3],
    'epsilon': [0.01, 0.1, 0.5],
}

xgb_params = {
    'max_depth': list(range(3, 11)),
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'gamma': [0, 3, 6, 9],
    'tree_method':['gpu_hist'],
    'gpu_id': [0],
    'learning_rate': [0.2, 0.05, 0.005],
    'n_estimators': [100, 200, 500]
}

In [55]:
result_rf = myRegressor(RandomForestRegressor(), rf_params)
result_svr = myRegressor(SVR(), svr_params)

  self.best_estimator_.fit(X, y, **fit_params)
소요시간: 2.56분
소요시간: 0.02분
  return f(*args, **kwargs)


In [57]:
result_xgb = finalxgb(30, xgb_params)

XGBoostError: [20:39:39] c:\ci\xgboost-split_1619728435298\work\src\common\common.h:156: XGBoost version not compiled with GPU support.

In [None]:
rf_model = RandomForestRegressor(**result_rf['params'].iloc[0])
rf_model = rf_model.fit(X, y)

svr_model = SVR(**result_svr['params'].iloc[0])
svr_model = svr_model.fit(X, y)

xgb_model = xgb.XGBRegressor(**result_xgb.best_params_)
xgb_model = xgb_model.fit(X, y)