In [257]:
%matplotlib notebook
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
import seaborn as sns
from sklearn.model_selection import train_test_split
import sys
import math
import warnings
warnings.filterwarnings('ignore')

In [259]:
df = pd.read_csv("new_dtrain.csv", delimiter=",")
zeros = df[df['latitude'] == 0]
df = df[df['latitude'] != 0]
subset = list(df.columns.values)
subset = subset.remove('price')
data = df.drop_duplicates(subset = subset)
data = data[data.price < 1000000000]
data = data[data.price < 800000000]
data = shuffle(data)
data.replace(-1, 0, inplace=True)
data = data.reset_index(drop=True)

In [260]:
#choosing data
X = data[['room_number', 'house_type', 'district',
       'built_time', 'appartments_floor', 'all_space', 'state', 'bathroom',
       'balcony', 'balcony_glassed', 'door', 'phone', 'ceiling', 'safety',
       'at_the_hotel', 'internet', 'furniture', 'parking', 'latitude',
       'longitude', 'building_floors', 'map_complex', 'floor',
          'trngl_first_point', 'trngl_second_point', 'trngl_third_point']]
y = data['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

In [261]:
out = X_test[-1:]

In [262]:
out.index[0]

29177

In [263]:
out.values

array([[3, 2, 4, 1995, 5.0, 70.0, 6, 2, 2, 0, 0, 0, 0.0, 0, 0, 0, 0.0, 0,
        43.25780200000001, 76.930571, 9.0, 0.0, 0, 10.25720000000021,
        -15.81559999999982, -1.2213000000009802]], dtype=object)

In [264]:
data[data.index == out.index[0]]

Unnamed: 0,Index,district,address,room_number,price,map_complex,house_type,built_time,appartments_floor,all_space,...,internet,furniture,floor,parking,latitude,longitude,building_floors,trngl_first_point,trngl_second_point,trngl_third_point
29177,18077,4,Айтеке би (Октябрьская) 99 — Байтурсынова (К...,3,26500000,0.0,2,1995,5.0,70.0,...,0,0.0,0,0,43.257802,76.930571,9.0,10.2572,-15.8156,-1.2213


In [265]:
from math import sin, cos, sqrt, atan2, radians
lat = X_train['latitude'].values
lon = X_train['longitude'].values
out_lat = out['latitude'].values
out_lon = out['longitude'].values

indx = X_train.index

answer = np.zeros(len(lat))

for i in range(len(lat)):
    R = 6373.0
    lat1 = radians(lat[i])
    lon1 = radians(lon[i])
    lat2 = radians(out_lat)
    lon2 = radians(out_lon)
    
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    
    distance = R * c
    
    answer[i] = distance

In [266]:
newDf = pd.DataFrame()

In [267]:
newDf['distance'] = answer
newDf['indx'] = indx

In [268]:
newDf = newDf.sort_values(['distance'], ascending=True)
newDf = newDf[100:]

In [269]:
knn = KNeighborsRegressor(n_neighbors=15, n_jobs=-1)

In [270]:
mod_df = pd.DataFrame()
ind = newDf.indx

In [271]:
ind.values

array([25672,  6238, 15510, ..., 23322, 18184, 15825])

In [272]:
tofit = data.loc[ind.values]

In [273]:
data[data.index == ind.values[0]]

Unnamed: 0,Index,district,address,room_number,price,map_complex,house_type,built_time,appartments_floor,all_space,...,internet,furniture,floor,parking,latitude,longitude,building_floors,trngl_first_point,trngl_second_point,trngl_third_point
25672,32139,4,Гоголя 20 — Байтурсынова (Космонавтов),3,17800000,0.0,2,1980,3.0,63.0,...,0,0.5,0,0,43.25819,76.925945,5.0,10.681,-15.3918,-0.7975


In [279]:
X_fit = tofit[['room_number', 'house_type', 'district',
       'built_time', 'appartments_floor', 'all_space', 'state', 'bathroom',
       'balcony', 'balcony_glassed', 'door', 'phone', 'ceiling', 'safety',
       'at_the_hotel', 'internet', 'furniture', 'parking', 'latitude',
       'longitude', 'building_floors', 'map_complex', 'floor',
          'trngl_first_point', 'trngl_second_point', 'trngl_third_point']]
y_fit = tofit['price']

X_fit.replace(' С ', '0', inplace=True)

In [280]:
knn.fit(X_fit, y_fit)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=15, p=2,
          weights='uniform')

In [281]:
knn.predict(out)

array([ 25679720.])

In [282]:
y_test[-1:]

29177    26500000
Name: price, dtype: int64

In [283]:
abs(knn.predict(out) - y_test[-1:])

29177    820280.0
Name: price, dtype: float64

In [284]:
out

Unnamed: 0,room_number,house_type,district,built_time,appartments_floor,all_space,state,bathroom,balcony,balcony_glassed,...,furniture,parking,latitude,longitude,building_floors,map_complex,floor,trngl_first_point,trngl_second_point,trngl_third_point
29177,3,2,4,1995,5.0,70.0,6,2,2,0,...,0.0,0,43.257802,76.930571,9.0,0.0,0,10.2572,-15.8156,-1.2213


In [285]:
knn.kneighbors(out)

(array([[ 2.61350346,  5.40648686,  5.59127843,  6.30261621,  6.50223085,
          6.53340046,  6.56966125,  6.77307381,  6.77307381,  6.77307381,
          6.77307381,  7.38325094,  7.4876235 ,  7.50143189,  7.58335234]]),
 array([[  176,  6070,  3970,    27,  1935,  2069,  4285,  1819,  1821,
          1822,  1823,  3968,  2606, 10658,   165]]))

In [288]:
data[data.index == 176]

Unnamed: 0,Index,district,address,room_number,price,map_complex,house_type,built_time,appartments_floor,all_space,...,internet,furniture,floor,parking,latitude,longitude,building_floors,trngl_first_point,trngl_second_point,trngl_third_point
176,17045,4,Наурызбай батыра 28 — проспект Жибек Жолы,1,11000000,0.0,2,1985,3.0,33.0,...,2,0.5,0,1,43.26253,76.935622,5.0,9.27932,-16.79348,-2.19918


In [289]:
data[data.index == 6070]

Unnamed: 0,Index,district,address,room_number,price,map_complex,house_type,built_time,appartments_floor,all_space,...,internet,furniture,floor,parking,latitude,longitude,building_floors,trngl_first_point,trngl_second_point,trngl_third_point
6070,26631,1,"мкр №1, Саина 74",1,9500000,0.0,2,1970,3.0,33.0,...,2,0.0,0,1,43.230339,76.844881,4.0,21.5725,-4.5003,10.094


In [290]:
data[data.index == 3970]

Unnamed: 0,Index,district,address,room_number,price,map_complex,house_type,built_time,appartments_floor,all_space,...,internet,furniture,floor,parking,latitude,longitude,building_floors,trngl_first_point,trngl_second_point,trngl_third_point
3970,2068,1,мкр Таугуль 3 — Шаляпина,3,24900000,0.0,4,1979,2.0,59.0,...,2,1.0,0,1,43.205531,76.836383,2.0,24.9031,-1.1697,13.4246


In [291]:
data[data.index == 27]

Unnamed: 0,Index,district,address,room_number,price,map_complex,house_type,built_time,appartments_floor,all_space,...,internet,furniture,floor,parking,latitude,longitude,building_floors,trngl_first_point,trngl_second_point,trngl_third_point
27,12166,6,"мкр Кокжиек, Мкр. Кокжиек 31",1,8400000,0.0,0,2010,5.0,40.0,...,2,0.5,0,1,43.358533,76.923007,6.0,0.9405,-25.1323,-10.538
