In [2]:
%matplotlib notebook
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import pygeohash as pgh
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the raw training listings and show which columns are available,
# plus the distinct values of the `door` attribute.
data = pd.read_csv("train.csv", sep=",")
print(data.columns)
print(data['door'].unique())

Index(['id', 'room_number', 'address', 'price', 'region', 'map_complex',
       'house_type', 'built_time', 'floor', 'all_space', 'living_space',
       'kitchen_space', 'state', 'bathroom', 'balcony', 'balcony_is_glazed',
       'door', 'phone', 'internet', 'parking', 'furniture', 'flooring',
       'ceiling', 'safety', 'at_the_hostel', 'description_option',
       'description_text', 'geocode_lat', 'geocode_long', 'last_update_time',
       'last_update_script'],
      dtype='object')
['бронированная' nan 'металлическая' 'деревянная']


In [4]:
def transfiguration(data):
    """Select the modelling columns and convert them to numeric form where possible.

    The input frame is not modified; all work happens on a copy.

    Steps:
      * keep only the columns used downstream;
      * collapse ``parking``, ``furniture``, ``at_the_hostel``, ``balcony`` and
        ``phone`` to numeric codes (values not listed in the mapping are kept as-is);
      * parse ``geocode_lat``, ``geocode_long`` and ``all_space`` as numbers
        (unparseable entries become missing);
      * strip non-breaking spaces from ``price`` and parse it as a number
        (an unparseable price raises, as it did before);
      * replace the literal string ``"None"`` and every missing value with 0.

    ``house_type``, ``state``, ``door``, ``region``, ``internet`` and ``bathroom``
    are deliberately left as text: ``binning`` one-hot encodes them later.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw listings; must contain every column named in ``selected_columns``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected, cleaned columns.
    """
    selected_columns = [
        'address', 'price', 'region', 'map_complex', 'house_type', 'built_time',
        'floor', 'all_space', 'state', 'bathroom', 'balcony', 'door', 'phone',
        'internet', 'parking', 'furniture', 'safety', 'at_the_hostel',
        'geocode_lat', 'geocode_long',
    ]
    category_codes = {
        'parking': {"рядом охраняемая стоянка": 1, "паркинг": 1, "гараж": 1},
        'furniture': {"полностью меблирована": 1, "частично меблирована": 0.5, "пустая": 0},
        'at_the_hostel': {"нет": 0, "да": 1},
        'balcony': {"да": 1, "балкон": 1, "лоджия": 1, "балкон и лоджия": 1,
                    "несколько балконов или лоджий": 1},
        'phone': {'отдельный': 1, 'есть возможность подключения': 0, 'нет': 0,
                  'блокиратор': 0},
    }

    # Explicit copy: the caller's frame stays untouched and later column
    # assignments are not chained assignments on a slice.
    df = data[selected_columns].copy()

    # Non-numeric columns are held as generic objects so the numeric codes and
    # the 0 placeholder can be stored next to the remaining text values.
    text_columns = df.select_dtypes(exclude='number').columns
    df = df.astype({column: object for column in text_columns})

    for column, codes in category_codes.items():
        # `codes=codes` binds the current mapping to this lambda.
        df[column] = df[column].map(lambda value, codes=codes: codes.get(value, value))

    # `Series.convert_objects` no longer exists in pandas; `to_numeric` with
    # errors='coerce' gives the same "unparseable -> NaN" result.
    df['geocode_lat'] = pd.to_numeric(df['geocode_lat'], errors='coerce')
    df['geocode_long'] = pd.to_numeric(df['geocode_long'], errors='coerce')

    area_text = df['all_space'].astype(str).str.replace('м2', '', regex=False).str.strip()
    df['all_space'] = pd.to_numeric(area_text, errors='coerce')

    # Only string prices carry the non-breaking-space separators; anything
    # else (e.g. a missing value) is passed through untouched.
    df['price'] = pd.to_numeric(
        df['price'].map(lambda value: value.replace("\xa0", "") if isinstance(value, str) else value)
    )

    df = df.replace("None", 0)
    df = df.fillna(0)
    # Columns that ended up purely numeric get a numeric dtype again.
    return df.infer_objects()

def floor_divider(df):
    """Split the textual ``floor`` column into two numeric columns.

    Each value is read from its string form as ``"<apartment floor> <separator>
    <floors in building>"`` (for example ``"9 из 9"``). A value holding a single
    number fills both outputs with that number. Anything that does not match
    (including the 0 placeholder used for missing data) yields 0 in both outputs.

    Rows are processed by position, so the frame may carry any index, and the
    numbers may have any count of digits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``floor`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with float columns ``appartments_floor`` and
        ``building_floors`` added.
    """
    floor_text = df['floor'].astype(str)
    # Group 0: first number; group 1: optional second number after a
    # non-digit separator.
    parsed = floor_text.str.extract(r'^\s*(\d+)(?:\D+(\d+))?\s*$')

    apartment = pd.to_numeric(parsed[0], errors='coerce')
    # A lone number stands for both values.
    building = pd.to_numeric(parsed[1], errors='coerce').fillna(apartment)

    # Assign plain arrays so the values are placed positionally.
    df['appartments_floor'] = apartment.fillna(0).to_numpy(dtype=float)
    df['building_floors'] = building.fillna(0).to_numpy(dtype=float)
    return df

def built_time_divider(df):
    """Convert the ``built_time`` column to a float year.

    The ``"г.п."`` marker is removed from each value before parsing. Values
    that cannot be parsed as a number become 0, which is also what the 0
    placeholder for missing data maps to.

    Non-string entries (such as that integer 0 placeholder) are handled, and
    rows are processed independently of the frame's index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``built_time`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with ``built_time`` replaced by float values.
    """
    year_text = (
        df['built_time']
        .astype(str)  # lets the string methods work on numeric placeholders too
        .str.replace("г.п.", "", regex=False)
        .str.strip()
    )
    years = pd.to_numeric(year_text, errors='coerce').fillna(0)
    df['built_time'] = years.to_numpy(dtype=float)
    return df

def calculate_price_for_sqr_meter(x):
    """Return the price per unit of area for one record.

    ``x`` must support lookup by the keys ``'price'`` and ``'all_space'``.
    """
    # Adding 0.0 promotes an integer price to float before dividing.
    price_as_float = x['price'] + 0.0
    area = x['all_space']
    return price_as_float / area

def get_price_for_sqr_meter(dataframe):
    """Add a ``price_for_sqr_meter`` column (``price`` divided by ``all_space``).

    The division is done column-wise rather than row by row. Rows whose area
    is 0 (the placeholder used for missing data) or whose result is undefined
    get 0 instead of raising or producing an infinite value. An empty frame is
    accepted.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with numeric ``price`` and ``all_space`` columns. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the float column ``price_for_sqr_meter`` added.
    """
    area = dataframe['all_space']
    # Mask zero areas so the division yields NaN there, then map NaN to 0.
    usable_area = area.where(area != 0)
    dataframe['price_for_sqr_meter'] = (dataframe['price'] / usable_area).fillna(0.0)
    return dataframe

# Base-32 alphabet used by geohash strings; a symbol's position is its value.
_GEOHASH_ALPHABET = "0123456789bcdefghjkmnpqrstuvwxyz"


def _geohash_to_int(code):
    """Interpret a geohash string as a base-32 integer."""
    value = 0
    for symbol in code:
        value = value * 32 + _GEOHASH_ALPHABET.index(symbol)
    return value


def make_geohash(data):
    """Add an integer ``geohash`` column built from the coordinates.

    Each (``geocode_lat``, ``geocode_long``) pair is encoded with
    ``pgh.encode(..., precision=5)`` and the resulting string is read as a
    base-32 number. Parsing the hash with ``pd.to_numeric`` instead would turn
    every hash that contains a letter into 0, which discards the feature.

    Rows are processed by position, so the frame may carry any index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with numeric ``geocode_lat`` and ``geocode_long`` columns. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the int64 column ``geohash`` added.
    """
    latitudes = data['geocode_lat'].to_numpy()
    longitudes = data['geocode_long'].to_numpy()
    codes = [
        pgh.encode(lat, lon, precision=5)
        for lat, lon in zip(latitudes, longitudes)
    ]
    data['geohash'] = np.array([_geohash_to_int(code) for code in codes], dtype=np.int64)
    return data

def binning(df_X):
    """One-hot encode the categorical listing attributes.

    Each of ``house_type``, ``state``, ``door``, ``region``, ``internet`` and
    ``bathroom`` is replaced by indicator columns named ``<column>_<value>``,
    appended at the end of the frame. A line is printed per encoded column.

    Parameters
    ----------
    df_X : pandas.DataFrame
        Frame containing the six columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns dropped and the 0/1 indicator
        columns added; the row index is unchanged.
    """
    for_binning = ['house_type', 'state', 'door', 'region', 'internet', 'bathroom']
    for c in for_binning:
        # An explicit integer dtype keeps the indicators as 0/1 on pandas
        # versions whose default dummy dtype is bool.
        data_label = pd.get_dummies(df_X[c], prefix=c, dtype=np.uint8)
        # get_dummies keeps the source index, so a plain column-wise concat
        # aligns row for row; `join_axes` no longer exists in pandas.
        df_X = pd.concat([df_X.drop(c, axis=1), data_label], axis=1)
        print("Binned", c,'to ',data_label.columns)
    return df_X

def triangulation(df):
    """Add three columns relating each row's coordinates to fixed reference points.

    For every reference point the value is
    ``(ref_lat - geocode_lat) + (ref_long - geocode_long)``.
    NOTE(review): this is a signed sum of coordinate differences, not a
    distance, so different locations can share a value. Confirm this is the
    intended feature.

    The arithmetic is done column-wise, so the frame may carry any index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric ``geocode_lat`` and ``geocode_long`` columns. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with ``trngl_first_point``, ``trngl_second_point`` and
        ``trngl_third_point`` added.
    """
    reference_points = (
        ('trngl_first_point', 43.340777, 76.950168),   # Almaty - 1
        ('trngl_second_point', 43.232742, 76.797475),  # Kalkaman - 1
        ('trngl_third_point', 43.196848, 76.979312),   # Tausamal
    )
    latitude = df['geocode_lat']
    longitude = df['geocode_long']
    for column, ref_lat, ref_long in reference_points:
        df[column] = (ref_lat - latitude) + (ref_long - longitude)
    return df

In [5]:
# Run the feature-engineering stages in order; every stage takes a frame
# and returns the frame that is handed to the next stage.
df = (
    transfiguration(data)
    .pipe(floor_divider)
    .pipe(built_time_divider)
    .pipe(get_price_for_sqr_meter)
    .pipe(make_geohash)
    .pipe(binning)
    .pipe(triangulation)
)

Binned house_type to  Index(['house_type_0', 'house_type_иное', 'house_type_каркасно-камышитовый',
       'house_type_кирпичный', 'house_type_монолитный',
       'house_type_панельный'],
      dtype='object')
Binned state to  Index(['state_0', 'state_евроремонт', 'state_свободная планировка',
       'state_среднее', 'state_требует ремонта', 'state_хорошее',
       'state_черновая отделка'],
      dtype='object')
Binned door to  Index(['door_0', 'door_бронированная', 'door_деревянная',
       'door_металлическая'],
      dtype='object')
Binned region to  Index(['region_Алматы', 'region_Алматы, Алатауский р-н',
       'region_Алматы, Алмалинский р-н', 'region_Алматы, Ауэзовский р-н',
       'region_Алматы, Бостандыкский р-н', 'region_Алматы, Жетысуйский р-н',
       'region_Алматы, Жетысуский р-н', 'region_Алматы, Медеуский р-н',
       'region_Алматы, Наурызбайский р-н', 'region_Алматы, Турксибский р-н',
       'region_Казахстан'],
      dtype='object')
Binned internet to  Index(['inter

In [8]:
# Preview the first five rows of the engineered feature frame.
df.head(n=5)

Unnamed: 0,address,price,map_complex,built_time,floor,all_space,balcony,phone,parking,furniture,...,internet_проводной,internet_через TV кабель,bathroom_0,bathroom_2 с/у и более,bathroom_нет,bathroom_раздельный,bathroom_совмещенный,trngl_first_point,trngl_second_point,trngl_third_point
0,Аскарова — Аль-Фараби,51975000,Аль-Фараби,2010.0,9 из 9,111.0,1.0,1.0,0.0,1.0,...,0,0,0,0,0,1,0,120.290945,120.030217,120.17616
1,Кабанбай батыра 49 — Луганского,90480000,Максима Резидентс,2014.0,2 из 9,135.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0.249555,-0.011173,0.13477
2,"мкр Кулагер, Серикова 10",8500000,0,2005.0,9 из 9,40.0,1.0,1.0,0.0,0.5,...,0,0,0,0,0,0,1,0.070514,-0.190214,-0.044271
3,"мкр Орбита-2, Навои 10 — Аль-Фараби",25000000,0,1976.0,4 из 8,70.1,1.0,1.0,1.0,0.5,...,1,0,0,0,0,1,0,0.190548,-0.07018,0.075763
4,Аль-Фараби — Мира,46590000,Нурлы Тау,2008.0,21 из 25,90.0,0.0,0.0,1.0,0.0,...,0,0,0,1,0,0,0,120.290945,120.030217,120.17616


In [9]:
# Write the engineered features to disk without the row index.
df.to_csv(path_or_buf="dtrain.csv", index=False)