In [1]:
from math import sin, cos, sqrt, atan2, radians
from datetime import datetime
from sklearn.cluster import KMeans

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_columns = 100

TRAIN = pd.read_csv('data/train.csv')
# drop duplicated row from train set
TRAIN = TRAIN.drop_duplicates(TRAIN.columns.difference(['key']))
TEST = pd.read_csv('data/test.csv')
subway=pd.read_csv('data/Subways.csv')
school=pd.read_csv('data/Schools.csv')

print("Train shape : ", TRAIN.shape)
print("Test shape : ", TEST.shape)
display(TRAIN.tail(2))
display(TEST.head(2))

FULL = pd.concat([TRAIN, TEST], axis = 0).reset_index(drop=True)
FULL = FULL.fillna(-999)

Train shape :  (1554911, 25)
Test shape :  (3918, 25)


Unnamed: 0,key,apartment_id,city,transaction_year_month,transaction_date,year_of_completion,exclusive_use_area,floor,latitude,longitude,address_by_law,total_parking_capacity_in_site,total_household_count_in_sites,apartment_building_count_in_sites,tallest_building_in_sites,lowest_building_in_sites,heat_type,heat_fuel,room_id,supply_area,total_household_count_of_area_type,room_count,bathroom_count,front_door_structure,transaction_real_price
1601456,1605366,3686,1,201810,21~31,1996,59.34,4,37.555215,127.131294,1174010700,111.0,107,1,19.0,11.0,individual,gas,165820,88.37,4,3.0,1.0,corridor,485000000
1601457,1605373,2937,1,201810,21~31,1999,84.88,5,37.604326,127.017168,1129013300,802.0,860,8,22.0,7.0,individual,gas,6279,108.75,209,3.0,2.0,stairway,430000000


Unnamed: 0,key,apartment_id,city,transaction_year_month,transaction_date,year_of_completion,exclusive_use_area,floor,latitude,longitude,address_by_law,total_parking_capacity_in_site,total_household_count_in_sites,apartment_building_count_in_sites,tallest_building_in_sites,lowest_building_in_sites,heat_type,heat_fuel,room_id,supply_area,total_household_count_of_area_type,room_count,bathroom_count,front_door_structure,transaction_real_price
0,462533,3751,1,200912,21~31,1984,83.58,14,37.519926,127.052515,1168010400,375.0,375,4,15.0,15.0,district,cogeneration,6648,107.19,2,3.0,1.0,corridor,0
1,764018,14029,1,201304,1~10,1968,107.17,10,37.572215,126.987643,1111013700,,149,1,15.0,8.0,individual,gas,46184,107.17,10,0.0,0.0,corridor,0


In [2]:
'''
Add encoded features for the 'apartment_id' and the 'room_id' that rank them with respect to coordinates.
'''
# rank the id features with respect to longitude
FULL = FULL.sort_values(by = ['longitude', 'supply_area', 'floor'])
FULL['long_apartment_id'] = pd.factorize(FULL['apartment_id'])[0]
FULL['long_room_id'] = pd.factorize(FULL['room_id'])[0]
FULL = FULL.loc[np.arange(0,FULL.shape[0]),:]

# rank the id features with respect to latitude
FULL = FULL.sort_values(by = ['latitude', 'supply_area', 'floor'])
FULL['lat_apartment_id'] = pd.factorize(FULL['apartment_id'])[0]
FULL['lat_room_id'] = pd.factorize(FULL['room_id'])[0]
FULL = FULL.loc[np.arange(0,FULL.shape[0]),:]

FULL.head()

Unnamed: 0,key,apartment_id,city,transaction_year_month,transaction_date,year_of_completion,exclusive_use_area,floor,latitude,longitude,address_by_law,total_parking_capacity_in_site,total_household_count_in_sites,apartment_building_count_in_sites,tallest_building_in_sites,lowest_building_in_sites,heat_type,heat_fuel,room_id,supply_area,total_household_count_of_area_type,room_count,bathroom_count,front_door_structure,transaction_real_price,long_apartment_id,long_room_id,lat_apartment_id,lat_room_id
0,0,5584,1,200601,11~20,1999,47.43,6,37.585965,127.000231,1111017100,163.0,136,1,8.0,4.0,individual,gas,91120,65.63,46,1.0,1.0,corridor,215000000,1243,6165,3251,16395
1,1,5584,1,200601,11~20,1999,44.37,8,37.585965,127.000231,1111017100,163.0,136,1,8.0,4.0,individual,gas,91119,61.39,10,2.0,1.0,corridor,200000000,1243,6164,3251,16394
2,2,5059,1,200601,11~20,1992,54.7,8,37.580511,127.014016,1111017400,902.0,585,5,14.0,9.0,individual,gas,8430,72.36,201,2.0,1.0,corridor,168000000,1357,6830,3203,16116
3,3,2816,1,200601,11~20,1993,64.66,11,37.580324,127.011788,1111017400,902.0,919,7,15.0,11.0,individual,gas,5839,87.3,284,2.0,1.0,corridor,165000000,1326,6634,3198,16089
4,4,2816,1,200601,11~20,1993,106.62,7,37.580324,127.011788,1111017400,902.0,919,7,15.0,11.0,individual,gas,5836,127.74,112,4.0,2.0,stairway,280000000,1326,6636,3198,16091


## TRAIN Set 분포 맞춰주기

## Target log scaling

In [3]:
FULL['log_target'] = np.log1p(FULL['transaction_real_price'])

## 법정동코드 쪼개기
**실행코드**

In [4]:
address_by_law = FULL['address_by_law'].astype('str')

address_by_law_1 = []
address_by_law_2 = []
address_by_law_3 = []
address_by_law_4 = []

for item in address_by_law:
    address_by_law_1.append(int(item[0:2]))
    address_by_law_2.append(int(item[2:5]))
    address_by_law_3.append(int(item[5:8]))
    address_by_law_4.append(int(item[8:10]))
    
FULL['address_by_law_1'] = address_by_law_1
FULL['address_by_law_2'] = address_by_law_2
FULL['address_by_law_3'] = address_by_law_3
FULL['address_by_law_4'] = address_by_law_4

**특이사항**

In [5]:
pd.value_counts(FULL['address_by_law'])

1135010500    45103
2629010700    44131
2635010700    32718
2632010200    30000
1171010100    28557
1132010700    25050
1135010600    24417
1147010100    22502
1159010700    21808
1162010100    21423
2638010600    20874
1135010200    18153
2635010400    17652
2635010500    16943
1147010200    16827
1153010200    16256
2647010200    16110
2635010300    15975
2647010100    14432
2632010300    13737
1129013300    13679
2644010400    13549
1135010300    13302
1150010300    12478
1130510100    12163
1126010600    12067
1150010900    11778
2632010100    11422
1129013400    11370
2623010900    11275
              ...  
1120012200      114
2620011900      111
1168011300      111
2611012000      110
1117012100       95
2671031022       84
1111011800       78
1171010600       78
2671031021       78
2611011500       74
1141010200       73
1114013200       73
1114012200       69
2635010800       66
2614011600       63
1114011800       60
1129012600       58
1114013600       56
1141010400       53


In [6]:
FULL[FULL['address_by_law_1'] == 28]

Unnamed: 0,key,apartment_id,city,transaction_year_month,transaction_date,year_of_completion,exclusive_use_area,floor,latitude,longitude,address_by_law,total_parking_capacity_in_site,total_household_count_in_sites,apartment_building_count_in_sites,tallest_building_in_sites,lowest_building_in_sites,heat_type,heat_fuel,room_id,supply_area,total_household_count_of_area_type,room_count,bathroom_count,front_door_structure,transaction_real_price,long_apartment_id,long_room_id,lat_apartment_id,lat_room_id,log_target,address_by_law_1,address_by_law_2,address_by_law_3,address_by_law_4
1554970,1503614,22247,1,201712,1~10,2011,59.97,11,37.554659,126.746463,2824510900,413.0,376,8,15.0,10.0,individual,gas,21243,80.09,85,3.0,2.0,stairway,0,0,0,2840,14322,0.0,28,245,109,0


In [7]:
FULL[FULL['address_by_law'] == 1111018000]

Unnamed: 0,key,apartment_id,city,transaction_year_month,transaction_date,year_of_completion,exclusive_use_area,floor,latitude,longitude,address_by_law,total_parking_capacity_in_site,total_household_count_in_sites,apartment_building_count_in_sites,tallest_building_in_sites,lowest_building_in_sites,heat_type,heat_fuel,room_id,supply_area,total_household_count_of_area_type,room_count,bathroom_count,front_door_structure,transaction_real_price,long_apartment_id,long_room_id,lat_apartment_id,lat_room_id,log_target,address_by_law_1,address_by_law_2,address_by_law_3,address_by_law_4
1557593,1603034,36841,1,201809,1~10,2017,45.879,13,37.571644,126.96138,1111018000,222.0,182,3,19.0,9.0,-999,-999,152279,67.94,75,2.0,1.0,stairway,0,1089,5234,3076,15486,0.0,11,110,180,0


인천 하나 있다.

## 시간 데이터

In [8]:
new_date = FULL["transaction_date"].replace({'1~10':'01', '11~20':'10', '21~28':'20', '21~29':'20', '21~30':'20',  '21~31':'20' })
new_date = FULL["transaction_year_month"].astype('str') + new_date
new_date = pd.to_datetime(new_date, format = '%Y%m%d')
new_date = (new_date - pd.to_datetime("2006-01-01")).dt.days
FULL['date'] = new_date

In [9]:
FULL['transaction_year'] = FULL['transaction_year_month'].astype(str).map(lambda x: int(x[0:4]))
FULL['transaction_month'] = FULL['transaction_year_month'].astype(str).map(lambda x: int(x[4:6]))

## 아파트 나이 : transaction_year - year_of_completion

In [10]:
FULL['age'] = FULL['transaction_year'] - FULL['year_of_completion']

## 주거공용면적 : supply_area - exclusive_use_area

In [11]:
FULL['common_area'] = FULL['supply_area'] - FULL['exclusive_use_area']

## 층 / 단지 최고층 :  floor / tallest_building_in_sites

In [12]:
FULL['floor/tallest'] = FULL['floor'] / FULL['tallest_building_in_sites']

## 층 / 단지 최저층 :  floor / lowest_building_in_sites

In [13]:
FULL['floor/lowest'] = FULL['floor'] / FULL['lowest_building_in_sites']

## 카테고리 데이터 label encoding

In [14]:
FULL['transaction_date_label_encoding'] = FULL['transaction_date'].replace({'1~10':0,'11~20':1,'21~31':2,'21~30':2,'21~29':2,'21~28':2})

label, unique=pd.factorize(FULL['heat_type'])
FULL['heat_type_label_encoding']=label

label2, unique2=pd.factorize(FULL['heat_fuel'])
FULL['heat_fuel_label_encoding']=label2

label3, unique3 = pd.factorize(FULL['front_door_structure'])
FULL['front_door_structure_label_encoding']=label3

In [15]:
sub_cnt=[]
sch_cnt=[]

A=FULL.drop_duplicates('address_by_law')
A.reset_index(inplace=True)
for i in range(A.shape[0]):
    sub_cnt.append(sum(subway['address_by_law']==A['address_by_law'][i]))
    sch_cnt.append(sum(school['address_by_law']==A['address_by_law'][i]))
A['sub_cnt']=sub_cnt
A['sch_cnt']=sch_cnt
FULL=pd.merge(FULL,A[['address_by_law','sub_cnt','sch_cnt']],left_on='address_by_law',right_on='address_by_law',how='left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


## K mean Clustering

In [16]:
X = FULL.drop_duplicates('apartment_id')
X_S = X[X['city']==1]
X_B = X[X['city']==0]
COOR_S = X_S[['apartment_id','latitude','longitude']]
COOR_B = X_B[['apartment_id','latitude','longitude']]

kmeans_S = KMeans(n_clusters=70, random_state=0).fit(COOR_S[['latitude','longitude']])
kmeans_B = KMeans(n_clusters=50, random_state=0).fit(COOR_B[['latitude','longitude']])

In [17]:
COOR_S['cluster_N']=kmeans_S.labels_
COOR_B['cluster_N']=kmeans_B.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [18]:
COOR_B['cluster_N']+=70
COOR=pd.concat([COOR_S,COOR_B])
COOR = COOR.drop(labels = ['latitude' , 'longitude'], axis = 1)
FULL=pd.merge(FULL,COOR,left_on='apartment_id',right_on='apartment_id',how='left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
FULL.head()

Unnamed: 0,key,apartment_id,city,transaction_year_month,transaction_date,year_of_completion,exclusive_use_area,floor,latitude,longitude,address_by_law,total_parking_capacity_in_site,total_household_count_in_sites,apartment_building_count_in_sites,tallest_building_in_sites,lowest_building_in_sites,heat_type,heat_fuel,room_id,supply_area,total_household_count_of_area_type,room_count,bathroom_count,front_door_structure,transaction_real_price,long_apartment_id,long_room_id,lat_apartment_id,lat_room_id,log_target,address_by_law_1,address_by_law_2,address_by_law_3,address_by_law_4,date,transaction_year,transaction_month,age,common_area,floor/tallest,floor/lowest,transaction_date_label_encoding,heat_type_label_encoding,heat_fuel_label_encoding,front_door_structure_label_encoding,sub_cnt,sch_cnt,cluster_N
0,0,5584,1,200601,11~20,1999,47.43,6,37.585965,127.000231,1111017100,163.0,136,1,8.0,4.0,individual,gas,91120,65.63,46,1.0,1.0,corridor,215000000,1243,6165,3251,16395,19.186149,11,110,171,0,9,2006,1,7,18.2,0.75,1.5,1,0,0,0,0,0,65
1,1,5584,1,200601,11~20,1999,44.37,8,37.585965,127.000231,1111017100,163.0,136,1,8.0,4.0,individual,gas,91119,61.39,10,2.0,1.0,corridor,200000000,1243,6164,3251,16394,19.113828,11,110,171,0,9,2006,1,7,17.02,1.0,2.0,1,0,0,0,0,0,65
2,2,5059,1,200601,11~20,1992,54.7,8,37.580511,127.014016,1111017400,902.0,585,5,14.0,9.0,individual,gas,8430,72.36,201,2.0,1.0,corridor,168000000,1357,6830,3203,16116,18.939475,11,110,174,0,9,2006,1,14,17.66,0.571429,0.888889,1,0,0,0,1,3,65
3,3,2816,1,200601,11~20,1993,64.66,11,37.580324,127.011788,1111017400,902.0,919,7,15.0,11.0,individual,gas,5839,87.3,284,2.0,1.0,corridor,165000000,1326,6634,3198,16089,18.921456,11,110,174,0,9,2006,1,13,22.64,0.733333,1.0,1,0,0,0,1,3,65
4,4,2816,1,200601,11~20,1993,106.62,7,37.580324,127.011788,1111017400,902.0,919,7,15.0,11.0,individual,gas,5836,127.74,112,4.0,2.0,stairway,280000000,1326,6636,3198,16091,19.4503,11,110,174,0,9,2006,1,13,21.12,0.466667,0.636364,1,0,0,1,1,3,65


## Feature Engineering for Subways and Schools data

In [20]:
'''
Feature engineering for school data
1. Label encoding for categorical data :
  - 'school_class', 'operation_type', 'highschool_type', 'gender'
  
2. Feature generation
  - foundation_data => foundation_year
  - combination of 'school_class' and 'operation_type' => class_operation, and then do label encoding
'''

# 1. label encoding for categorical data
school = school.sort_values(by = ['longitude'])
school['long_school_code'] = pd.factorize(school['school_code'])[0]
school = school.loc[np.arange(0,school.shape[0]),:]

school = school.sort_values(by = ['latitude'])
school['lat_school_code'] = pd.factorize(school['school_code'])[0]
school = school.loc[np.arange(0,school.shape[0]),:]

school['school_class'] = pd.factorize(school['school_class'])[0]
school['operation_type'] = pd.factorize(school['operation_type'])[0]
school['highschool_type'] = pd.factorize(school['highschool_type'])[0]
school['gender'] = pd.factorize(school['gender'])[0]

# 2. feature generation for school data
school['foundation_year'] = school['foundation_date'].map(lambda x : int(x[0:4]))
class_operation = []
for idx, row in school.iterrows():
    class_operation.append(str(row['school_class']) + str(row['operation_type']))
school['class_operation'] = pd.factorize(class_operation)[0]
school.head()

Unnamed: 0,school_code,latitude,longitude,school_class,operation_type,highschool_type,gender,foundation_date,address_by_law,long_school_code,lat_school_code,foundation_year,class_operation
0,S000003511,37.49088,127.015082,0,0,-1,0,1953.1.31,1165010800,682,802,1953,0
1,S000003563,37.577782,127.002915,0,0,-1,0,1946.8.22,1111016800,624,1495,1946,0
2,S010000737,37.481366,127.059055,0,1,-1,0,1982.9.20,1168010300,945,729,1982,1
3,S010000738,37.485744,127.058002,0,1,-1,0,1987.11.17,1168010300,934,757,1987,1
4,S010000741,37.480805,127.051891,0,1,-1,0,1983.11.30,1168010300,890,724,1983,1


In [21]:
# Find the nearest subway station and school for each apartment and calculate its distance.
def earth_distance(lat1, lat2, lon1, lon2):
    R=6373.0
    distance=[0]*len(lat2)
    for i in range(len(lat2)):
        d_lon= lon2[i]- lon1
        d_lat= lat2[i]- lat1 
        a = sin(d_lat / 2)**2 + cos(lat1) * cos(lat2[i]) * sin(d_lon / 2)**2 
        c = 2 * atan2(sqrt(a), sqrt(1 - a)) 
        distance[i] = R * c 
    return distance

In [22]:
# Make the meta data that map the subway stations and the schools to the apartment_ids
meta_apartment = FULL.drop_duplicates(subset = ['apartment_id']).reset_index(drop = True)
meta_apartment = meta_apartment[['apartment_id', 'latitude', 'longitude', 'address_by_law']]
meta_apartment

Unnamed: 0,apartment_id,latitude,longitude,address_by_law
0,5584,37.585965,127.000231,1111017100
1,5059,37.580511,127.014016,1111017400
2,2816,37.580324,127.011788,1111017400
3,2815,37.575381,126.960804,1111018700
4,9867,37.559200,127.019503,1114016200
5,2818,37.555060,127.014495,1114016200
6,2817,37.549828,127.009284,1114016200
7,2819,37.558170,127.017896,1114016200
8,4059,37.558116,126.965304,1114017100
9,1470,37.537397,127.097622,1121510300


In [23]:
# Check what the kinds of subway_lines are there.
lines = []
for idx, row in subway.iterrows():
    for line in row['subway_line'].split(','):
        lines.append(line)
display(pd.value_counts(lines).index.sort_values())

subway = subway.sort_values(by = ['longitude'])
subway['long_station_id'] = pd.factorize(subway['station_id'])[0]
subway = subway.loc[np.arange(0,subway.shape[0]),:]

subway = subway.sort_values(by = ['latitude'])
subway['lat_station_id'] = pd.factorize(subway['station_id'])[0]
subway = subway.loc[np.arange(0,subway.shape[0]),:]

dic = {}
for line in pd.value_counts(lines).index.sort_values():
    dic[line] = [0] * subway.shape[0]

for idx, row in subway.iterrows():
    for line in row['subway_line'].split(','):
        dic[line][idx] = 1 

subway = pd.concat([subway, pd.DataFrame(dic)], axis = 1)
subway.head()

Index(['1', '2', '3', '4', '5', '6', '7', '8', '9', 'AP', 'B1', 'B2', 'B3',
       'B4', 'BD', 'BK', 'DL', 'KC', 'KJ', 'ND', 'US'],
      dtype='object')

Unnamed: 0,station_id,latitude,longitude,subway_line,address_by_law,long_station_id,lat_station_id,1,2,3,4,5,6,7,8,9,AP,B1,B2,B3,B4,BD,BK,DL,KC,KJ,ND,US
0,1,37.555729,126.972145,"1,4,KJ,AP",1114012000.0,105,285,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
1,2,37.565624,126.976936,12,1114017000.0,110,314,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,37.570169,126.983099,1,1111013000.0,116,328,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,37.57157,126.991895,135,1111016000.0,124,331,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,37.570988,127.001921,1,1111016000.0,133,329,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
# Map the nearest subwas station and school to each apartment
# WARN : It takes long time.( about 4 minutes)
dic = {'min_school' : [], 'min_school_dist' : [],
       'min_subway' : [], 'min_subway_dist' : []}

now = datetime.now()
for apt_id, row in meta_apartment.iterrows():
    sch_dist = earth_distance(row['latitude'], school['latitude'],
                              row['longitude'], school['longitude'])
    sub_dist = earth_distance(row['latitude'], subway['latitude'],
                              row['longitude'], subway['longitude'])
    
    dic['min_school'].append(school.loc[np.argmin(sch_dist), 'school_code']) 
    dic['min_school_dist'].append(np.min(sch_dist))
    
    dic['min_subway'].append(subway.loc[np.argmin(sub_dist), 'station_id']) 
    dic['min_subway_dist'].append(np.min(sub_dist))
    
print(datetime.now() - now)

0:03:18.096648


In [25]:
meta_apartment = pd.concat([meta_apartment, pd.DataFrame(dic)], axis = 1)
meta_apartment = meta_apartment[['apartment_id', 'min_school', 'min_school_dist', 'min_subway', 'min_subway_dist']]
meta_apartment = pd.merge(left = meta_apartment, right = school, left_on = 'min_school', right_on = 'school_code', how = 'left')
meta_apartment = pd.merge(left = meta_apartment, right = subway, left_on = 'min_subway', right_on = 'station_id', how = 'left')

meta_apartment.head()

Unnamed: 0,apartment_id,min_school,min_school_dist,min_subway,min_subway_dist,school_code,latitude_x,longitude_x,school_class,operation_type,highschool_type,gender,foundation_date,address_by_law_x,long_school_code,lat_school_code,foundation_year,class_operation,station_id,latitude_y,longitude_y,subway_line,address_by_law_y,long_station_id,lat_station_id,1,2,3,4,5,6,7,8,9,AP,B1,B2,B3,B4,BD,BK,DL,KC,KJ,ND,US
0,5584,S010002342,14.901992,102,24.882336,S010002342,37.585657,127.002564,1,2,-1,1,1907.9.8,1111016900,621,1556,1907,4,102,37.582418,127.001873,4,1111017000.0,132,350,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,5059,S010002378,6.109126,381,8.067028,S010002378,37.581469,127.013964,0,1,-1,0,1971.10.6,1111017400,674,1523,1971,1,381,37.580002,127.015182,6,1111017000.0,147,345,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2816,S010000644,8.018905,381,21.581305,S010000644,37.579517,127.01276,2,2,2,2,1966.12.8,1111017400,664,1510,1966,6,381,37.580002,127.015182,6,1111017000.0,147,345,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2815,S010002376,8.679184,67,19.187394,S010002376,37.574137,126.960247,0,1,-1,0,1957.9.28,1111018700,499,1473,1957,1,67,37.574453,126.957918,3,1141011000.0,94,337,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,9867,S010002337,7.31498,343,31.254636,S010002337,37.558278,127.020193,1,1,-1,2,1968.8.6,1114016200,708,1352,1968,3,343,37.554467,127.0208,5,1120011000.0,157,281,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [26]:
FULL = pd.merge(left = FULL, right = meta_apartment, on = 'apartment_id', how = 'left')
print(FULL.shape)
FULL.head()

(1558829, 93)


Unnamed: 0,key,apartment_id,city,transaction_year_month,transaction_date,year_of_completion,exclusive_use_area,floor,latitude,longitude,address_by_law,total_parking_capacity_in_site,total_household_count_in_sites,apartment_building_count_in_sites,tallest_building_in_sites,lowest_building_in_sites,heat_type,heat_fuel,room_id,supply_area,total_household_count_of_area_type,room_count,bathroom_count,front_door_structure,transaction_real_price,long_apartment_id,long_room_id,lat_apartment_id,lat_room_id,log_target,address_by_law_1,address_by_law_2,address_by_law_3,address_by_law_4,date,transaction_year,transaction_month,age,common_area,floor/tallest,floor/lowest,transaction_date_label_encoding,heat_type_label_encoding,heat_fuel_label_encoding,front_door_structure_label_encoding,sub_cnt,sch_cnt,cluster_N,min_school,min_school_dist,min_subway,min_subway_dist,school_code,latitude_x,longitude_x,school_class,operation_type,highschool_type,gender,foundation_date,address_by_law_x,long_school_code,lat_school_code,foundation_year,class_operation,station_id,latitude_y,longitude_y,subway_line,address_by_law_y,long_station_id,lat_station_id,1,2,3,4,5,6,7,8,9,AP,B1,B2,B3,B4,BD,BK,DL,KC,KJ,ND,US
0,0,5584,1,200601,11~20,1999,47.43,6,37.585965,127.000231,1111017100,163.0,136,1,8.0,4.0,individual,gas,91120,65.63,46,1.0,1.0,corridor,215000000,1243,6165,3251,16395,19.186149,11,110,171,0,9,2006,1,7,18.2,0.75,1.5,1,0,0,0,0,0,65,S010002342,14.901992,102,24.882336,S010002342,37.585657,127.002564,1,2,-1,1,1907.9.8,1111016900,621,1556,1907,4,102,37.582418,127.001873,4,1111017000.0,132,350,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,5584,1,200601,11~20,1999,44.37,8,37.585965,127.000231,1111017100,163.0,136,1,8.0,4.0,individual,gas,91119,61.39,10,2.0,1.0,corridor,200000000,1243,6164,3251,16394,19.113828,11,110,171,0,9,2006,1,7,17.02,1.0,2.0,1,0,0,0,0,0,65,S010002342,14.901992,102,24.882336,S010002342,37.585657,127.002564,1,2,-1,1,1907.9.8,1111016900,621,1556,1907,4,102,37.582418,127.001873,4,1111017000.0,132,350,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,5059,1,200601,11~20,1992,54.7,8,37.580511,127.014016,1111017400,902.0,585,5,14.0,9.0,individual,gas,8430,72.36,201,2.0,1.0,corridor,168000000,1357,6830,3203,16116,18.939475,11,110,174,0,9,2006,1,14,17.66,0.571429,0.888889,1,0,0,0,1,3,65,S010002378,6.109126,381,8.067028,S010002378,37.581469,127.013964,0,1,-1,0,1971.10.6,1111017400,674,1523,1971,1,381,37.580002,127.015182,6,1111017000.0,147,345,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,2816,1,200601,11~20,1993,64.66,11,37.580324,127.011788,1111017400,902.0,919,7,15.0,11.0,individual,gas,5839,87.3,284,2.0,1.0,corridor,165000000,1326,6634,3198,16089,18.921456,11,110,174,0,9,2006,1,13,22.64,0.733333,1.0,1,0,0,0,1,3,65,S010000644,8.018905,381,21.581305,S010000644,37.579517,127.01276,2,2,2,2,1966.12.8,1111017400,664,1510,1966,6,381,37.580002,127.015182,6,1111017000.0,147,345,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,2816,1,200601,11~20,1993,106.62,7,37.580324,127.011788,1111017400,902.0,919,7,15.0,11.0,individual,gas,5836,127.74,112,4.0,2.0,stairway,280000000,1326,6636,3198,16091,19.4503,11,110,174,0,9,2006,1,13,21.12,0.466667,0.636364,1,0,0,1,1,3,65,S010000644,8.018905,381,21.581305,S010000644,37.579517,127.01276,2,2,2,2,1966.12.8,1111017400,664,1510,1966,6,381,37.580002,127.015182,6,1111017000.0,147,345,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Drop String Features

In [27]:
dtype_df = FULL.dtypes
str_idx = dtype_df[dtype_df.values == 'object'].index.tolist()
display(str_idx)
FULL = FULL.drop(str_idx, axis = 1)

['transaction_date',
 'heat_type',
 'heat_fuel',
 'front_door_structure',
 'min_school',
 'school_code',
 'foundation_date',
 'subway_line']

## Seperate FULL data into TRAIN, VAL, TEST

In [28]:
val_key = pd.read_csv("val_key.csv")
val_key = val_key[val_key['val_key'] > 0].val_key.tolist()

TRAIN = FULL.loc[:TRAIN.shape[0] - 1,:]
TEST = FULL.loc[TRAIN.shape[0]:,:].reset_index(drop = True)
display(TRAIN.tail(2))
display(TEST.head(2))

VAL = TRAIN[TRAIN['key'].isin(val_key)]
train = TRAIN.drop(labels = VAL.index, axis = 0)

print("TRAIN shape : ", train.shape)
print("VAL shape : ", VAL.shape)
print("TEST shape : ", TEST.shape)

# train.to_csv('x_train.csv', encoding = False, index = False)
# VAL.to_csv('x_val.csv', encoding = False, index = False)
# TEST.to_csv('x_test.csv', encoding = False, index = False)

Unnamed: 0,key,apartment_id,city,transaction_year_month,year_of_completion,exclusive_use_area,floor,latitude,longitude,address_by_law,total_parking_capacity_in_site,total_household_count_in_sites,apartment_building_count_in_sites,tallest_building_in_sites,lowest_building_in_sites,room_id,supply_area,total_household_count_of_area_type,room_count,bathroom_count,transaction_real_price,long_apartment_id,long_room_id,lat_apartment_id,lat_room_id,log_target,address_by_law_1,address_by_law_2,address_by_law_3,address_by_law_4,date,transaction_year,transaction_month,age,common_area,floor/tallest,floor/lowest,transaction_date_label_encoding,heat_type_label_encoding,heat_fuel_label_encoding,front_door_structure_label_encoding,sub_cnt,sch_cnt,cluster_N,min_school_dist,min_subway,min_subway_dist,latitude_x,longitude_x,school_class,operation_type,highschool_type,gender,address_by_law_x,long_school_code,lat_school_code,foundation_year,class_operation,station_id,latitude_y,longitude_y,address_by_law_y,long_station_id,lat_station_id,1,2,3,4,5,6,7,8,9,AP,B1,B2,B3,B4,BD,BK,DL,KC,KJ,ND,US
1554909,1605366,3686,1,201810,1996,59.34,4,37.555215,127.131294,1174010700,111.0,107,1,19.0,11.0,165820,88.37,4,3.0,1.0,485000000,2508,12380,2853,14372,19.999659,11,740,107,0,4675,2018,10,22,29.03,0.210526,0.363636,2,0,0,0,1,6,34,11.534249,430,40.11608,37.555615,127.133078,1,1,-1,0,1174010700,1232,1324,1981,3,430,37.550148,127.127519,1174011000.0,270,273,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1554910,1605373,2937,1,201810,1999,84.88,5,37.604326,127.017168,1129013300,802.0,860,8,22.0,7.0,6279,108.75,209,3.0,2.0,430000000,1401,7112,3425,17199,19.879296,11,290,133,0,4675,2018,10,19,23.87,0.227273,0.714286,2,0,0,1,0,8,49,20.766537,99,52.420484,37.604377,127.013896,0,1,-1,0,1129013300,673,1655,1945,1,99,37.604152,127.025429,1129013000.0,161,370,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,key,apartment_id,city,transaction_year_month,year_of_completion,exclusive_use_area,floor,latitude,longitude,address_by_law,total_parking_capacity_in_site,total_household_count_in_sites,apartment_building_count_in_sites,tallest_building_in_sites,lowest_building_in_sites,room_id,supply_area,total_household_count_of_area_type,room_count,bathroom_count,transaction_real_price,long_apartment_id,long_room_id,lat_apartment_id,lat_room_id,log_target,address_by_law_1,address_by_law_2,address_by_law_3,address_by_law_4,date,transaction_year,transaction_month,age,common_area,floor/tallest,floor/lowest,transaction_date_label_encoding,heat_type_label_encoding,heat_fuel_label_encoding,front_door_structure_label_encoding,sub_cnt,sch_cnt,cluster_N,min_school_dist,min_subway,min_subway_dist,latitude_x,longitude_x,school_class,operation_type,highschool_type,gender,address_by_law_x,long_school_code,lat_school_code,foundation_year,class_operation,station_id,latitude_y,longitude_y,address_by_law_y,long_station_id,lat_station_id,1,2,3,4,5,6,7,8,9,AP,B1,B2,B3,B4,BD,BK,DL,KC,KJ,ND,US
0,462533,3751,1,200912,1984,83.58,14,37.519926,127.052515,1168010400,375.0,375,4,15.0,15.0,6648,107.19,2,3.0,1.0,0,1873,9457,2211,11144,0.0,11,680,104,0,1449,2009,12,25,23.61,0.933333,0.933333,2,2,1,0,2,4,42,27.924164,405,6.288654,37.517124,127.055939,2,1,0,1,1168010500,920,1027,1900,7,405,37.51912,127.051937,1168010000.0,200,216,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,764018,14029,1,201304,1968,107.17,10,37.572215,126.987643,1111013700,-999.0,149,1,15.0,8.0,46184,107.17,10,0.0,0.0,0,1209,5970,3087,15554,0.0,11,110,137,0,2647,2013,4,45,0.0,0.666667,1.25,0,0,0,0,0,0,53,15.749152,4,27.190797,37.574683,126.987782,0,1,-1,0,1111013400,579,1479,1894,1,4,37.57157,126.991895,1111016000.0,124,331,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


TRAIN shape :  (1551005, 85)
VAL shape :  (3906, 85)
TEST shape :  (3918, 85)


## 망작들

In [29]:
''''''
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

lr_data = train[train['transaction_year'] > 2012]
lr = np.array([-999] * FULL.shape[0]).astype('float32')

apartment_id_list = lr_data.drop_duplicates(subset = ['apartment_id'])['apartment_id'].values

for apartment_id in apartment_id_list:
    # build metadata
    meta_data = lr_data[lr_data['apartment_id'] == apartment_id][['transaction_year_month', 'supply_area', 'floor', 'log_target']]
    meta_data = meta_data[meta_data['log_target'] > 0].drop_duplicates()
    if meta_data.shape[0] > 25:
        # scaling
        scaler = StandardScaler()
        scaler.fit(meta_data.values[:, 0:3])
        x = scaler.transform(meta_data.values[:, 0:3])
        y = meta_data.values[:, 3]

        # model_fit
        model = LinearRegression()
        model.fit(x, y)

        # fit
        meta_test = FULL[FULL['apartment_id'] == apartment_id][['transaction_year_month', 'supply_area', 'floor', 'log_target']] 
        idx = meta_test.index
        meta_test = scaler.transform(meta_test.values[:, 0:3])
        lr_pred = model.predict(meta_test)
        lr[idx] = lr_pred
FULL['lr'] = lr
''''''

''

In [30]:
FULL.shape

(1558829, 86)

In [31]:
TRAIN = FULL.loc[:TRAIN.shape[0] - 1,:]
TEST = FULL.loc[TRAIN.shape[0]:,:].reset_index(drop = True)
display(TRAIN.tail(2))
display(TEST.head(2))

VAL = TRAIN[TRAIN['key'].isin(val_key)]
train = TRAIN.drop(labels = VAL.index, axis = 0)

print("TRAIN shape : ", TRAIN.shape)
print("VAL shape : ", VAL.shape)
print("TEST shape : ", TEST.shape)

train.to_csv('x_train.csv', encoding = False, index = False)
VAL.to_csv('x_val.csv', encoding = False, index = False)
TEST.to_csv('x_test.csv', encoding = False, index = False)

Unnamed: 0,key,apartment_id,city,transaction_year_month,year_of_completion,exclusive_use_area,floor,latitude,longitude,address_by_law,total_parking_capacity_in_site,total_household_count_in_sites,apartment_building_count_in_sites,tallest_building_in_sites,lowest_building_in_sites,room_id,supply_area,total_household_count_of_area_type,room_count,bathroom_count,transaction_real_price,long_apartment_id,long_room_id,lat_apartment_id,lat_room_id,log_target,address_by_law_1,address_by_law_2,address_by_law_3,address_by_law_4,date,transaction_year,transaction_month,age,common_area,floor/tallest,floor/lowest,transaction_date_label_encoding,heat_type_label_encoding,heat_fuel_label_encoding,front_door_structure_label_encoding,sub_cnt,sch_cnt,cluster_N,min_school_dist,min_subway,min_subway_dist,latitude_x,longitude_x,school_class,operation_type,highschool_type,gender,address_by_law_x,long_school_code,lat_school_code,foundation_year,class_operation,station_id,latitude_y,longitude_y,address_by_law_y,long_station_id,lat_station_id,1,2,3,4,5,6,7,8,9,AP,B1,B2,B3,B4,BD,BK,DL,KC,KJ,ND,US,lr
1554909,1605366,3686,1,201810,1996,59.34,4,37.555215,127.131294,1174010700,111.0,107,1,19.0,11.0,165820,88.37,4,3.0,1.0,485000000,2508,12380,2853,14372,19.999659,11,740,107,0,4675,2018,10,22,29.03,0.210526,0.363636,2,0,0,0,1,6,34,11.534249,430,40.11608,37.555615,127.133078,1,1,-1,0,1174010700,1232,1324,1981,3,430,37.550148,127.127519,1174011000.0,270,273,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,19.754187
1554910,1605373,2937,1,201810,1999,84.88,5,37.604326,127.017168,1129013300,802.0,860,8,22.0,7.0,6279,108.75,209,3.0,2.0,430000000,1401,7112,3425,17199,19.879296,11,290,133,0,4675,2018,10,19,23.87,0.227273,0.714286,2,0,0,1,0,8,49,20.766537,99,52.420484,37.604377,127.013896,0,1,-1,0,1129013300,673,1655,1945,1,99,37.604152,127.025429,1129013000.0,161,370,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,19.837831


Unnamed: 0,key,apartment_id,city,transaction_year_month,year_of_completion,exclusive_use_area,floor,latitude,longitude,address_by_law,total_parking_capacity_in_site,total_household_count_in_sites,apartment_building_count_in_sites,tallest_building_in_sites,lowest_building_in_sites,room_id,supply_area,total_household_count_of_area_type,room_count,bathroom_count,transaction_real_price,long_apartment_id,long_room_id,lat_apartment_id,lat_room_id,log_target,address_by_law_1,address_by_law_2,address_by_law_3,address_by_law_4,date,transaction_year,transaction_month,age,common_area,floor/tallest,floor/lowest,transaction_date_label_encoding,heat_type_label_encoding,heat_fuel_label_encoding,front_door_structure_label_encoding,sub_cnt,sch_cnt,cluster_N,min_school_dist,min_subway,min_subway_dist,latitude_x,longitude_x,school_class,operation_type,highschool_type,gender,address_by_law_x,long_school_code,lat_school_code,foundation_year,class_operation,station_id,latitude_y,longitude_y,address_by_law_y,long_station_id,lat_station_id,1,2,3,4,5,6,7,8,9,AP,B1,B2,B3,B4,BD,BK,DL,KC,KJ,ND,US,lr
0,462533,3751,1,200912,1984,83.58,14,37.519926,127.052515,1168010400,375.0,375,4,15.0,15.0,6648,107.19,2,3.0,1.0,0,1873,9457,2211,11144,0.0,11,680,104,0,1449,2009,12,25,23.61,0.933333,0.933333,2,2,1,0,2,4,42,27.924164,405,6.288654,37.517124,127.055939,2,1,0,1,1168010500,920,1027,1900,7,405,37.51912,127.051937,1168010000.0,200,216,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-999.0
1,764018,14029,1,201304,1968,107.17,10,37.572215,126.987643,1111013700,-999.0,149,1,15.0,8.0,46184,107.17,10,0.0,0.0,0,1209,5970,3087,15554,0.0,11,110,137,0,2647,2013,4,45,0.0,0.666667,1.25,0,0,0,0,0,0,53,15.749152,4,27.190797,37.574683,126.987782,0,1,-1,0,1111013400,579,1479,1894,1,4,37.57157,126.991895,1111016000.0,124,331,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-999.0


TRAIN shape :  (1554911, 86)
VAL shape :  (3906, 86)
TEST shape :  (3918, 86)
