In [None]:
import pandas as pd
import numpy as np
from math import radians, sin, cos, sqrt, atan2
import requests
import geopy.distance
from tqdm import tqdm

def get_relevant_facilities_near_coords(lat, lon, radius_m=500, timeout=60):
    """Count points of interest around a coordinate using the Overpass API.

    Queries OpenStreetMap nodes tagged as cafes, schools, hospitals, fitness
    centres or bus stops that lie within ``radius_m`` metres of ``(lat, lon)``.

    Args:
        lat: Latitude of the search centre, in decimal degrees.
        lon: Longitude of the search centre, in decimal degrees.
        radius_m: Search radius in metres (defaults to 500).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The number of matching nodes. ``0`` is returned (after printing a
        message) when the request fails or the server answers with a
        non-200 status, so a failed lookup is indistinguishable from
        "nothing nearby" in the returned value.
    """
    # (key, value) pairs of the OSM tags to count. Bus stops are tagged
    # highway=bus_stop and gyms leisure=fitness_centre; the amenity=bus_stop /
    # amenity=fitness_centre spellings are not the documented OSM tags.
    facility_tags = (
        ("amenity", "cafe"),
        ("amenity", "school"),
        ("amenity", "hospital"),
        ("leisure", "fitness_centre"),
        ("highway", "bus_stop"),
    )

    overpass_url = "http://overpass-api.de/api/interpreter"
    selectors = "\n".join(
        f'      node["{key}"="{value}"](around:{radius_m},{lat},{lon});'
        for key, value in facility_tags
    )
    overpass_query = f"""
    [out:json];
    (
{selectors}
    );
    out body;
    """

    # A timeout keeps one stalled request from hanging a long row-by-row run;
    # transport errors are reported the same way as HTTP errors below.
    try:
        response = requests.get(
            overpass_url, params={'data': overpass_query}, timeout=timeout
        )
    except requests.RequestException as exc:
        print(f"Ошибка запроса: {exc}")
        return 0

    if response.status_code != 200:
        print(f"Ошибка запроса: {response.status_code}")
        return 0

    # Only the number of returned elements is needed, so their coordinates
    # are never read.
    return len(response.json().get('elements', []))

def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    All four arguments are latitudes/longitudes in decimal degrees. The
    haversine formula is evaluated on a sphere of radius 6371 km.
    """
    earth_radius_km = 6371

    # Half of the angular differences, in radians.
    half_dlat = radians(lat2 - lat1) / 2
    half_dlon = radians(lon2 - lon1) / 2

    lat_term = sin(half_dlat) ** 2
    lon_term = cos(radians(lat1)) * cos(radians(lat2)) * sin(half_dlon) ** 2
    hav = lat_term + lon_term

    # Central angle between the two points.
    central_angle = 2 * atan2(sqrt(hav), sqrt(1 - hav))
    return earth_radius_km * central_angle


In [None]:
# Load the training set, drop the 'Unnamed: 0' column and discard every row
# that has a missing value.
data = (
    pd.read_csv('prices_train.csv')
    .drop(columns=['Unnamed: 0'])
    .dropna()
)

# Hard-coded reference point that the notebook uses as the city centre.
CENTER_LAT, CENTER_LON = 24.97687750605377, 121.53868248906382

# Great-circle distance (km) from each row's coordinates to that point.
data['distance_to_city_center'] = data.apply(
    lambda row: haversine(row['X5 latitude'], row['X6 longitude'], CENTER_LAT, CENTER_LON),
    axis=1,
)

In [None]:
# Register tqdm's progress_apply on pandas objects; desc labels the progress bar.
tqdm.pandas(desc="Поиск мест")


def _count_facilities_for_row(row):
    """Look up the facility count for one row's latitude/longitude."""
    return get_relevant_facilities_near_coords(row['X5 latitude'], row['X6 longitude'])


# The lookup runs once per row, so this cell is slow.
data['relevant_facilities'] = data.progress_apply(_count_facilities_for_row, axis=1)

Поиск мест: 100%|██████████| 303/303 [11:47<00:00,  2.33s/it]


In [None]:
def _whole_year(fractional_year):
    """Integer part of a fractional-year value."""
    return int(fractional_year)


def _month_number(fractional_year):
    """Fractional part of the year scaled to twelfths and rounded."""
    return round((fractional_year - int(fractional_year)) * 12)


# Split the fractional-year transaction date into two integer features,
# then drop the original column.
transaction_date = data['X1 transaction date']
data['Year'] = transaction_date.apply(_whole_year)
data['Month'] = transaction_date.apply(_month_number)

data = data.drop(columns=['X1 transaction date'])

In [None]:
# Preview the first rows of the training frame, including the derived columns.
data.head()

Unnamed: 0,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area,distance_to_city_center,relevant_facilities,Year,Month
0,34.0,157.6052,7.0,24.96628,121.54196,39.1,1.223825,7,2013,1
1,13.3,561.9845,5.0,24.98746,121.54391,54.8,1.289295,10,2013,6
2,13.7,1236.564,1.0,24.97694,121.55391,30.6,1.534884,2,2012,11
3,8.5,104.8101,5.0,24.96674,121.54067,55.5,1.144904,6,2013,6
5,8.0,104.8101,5.0,24.96674,121.54067,51.8,1.144904,6,2012,11


In [None]:
from scipy.stats import zscore

def remove_outliers_zscore(df, threshold=3):
    """Drop rows that contain an outlier in any numeric column.

    A value counts as an outlier when the absolute value of its z-score
    (computed per column with the population standard deviation, ddof=0)
    is greater than or equal to ``threshold``.

    Args:
        df: Input DataFrame. Non-numeric columns are ignored when scoring
            but are kept in the result.
        threshold: Z-score magnitude at or above which a value is an outlier.

    Returns:
        A DataFrame with the outlier rows removed; columns are unchanged.

    Notes:
        Z-scores that cannot be computed (missing values, or a constant
        column whose standard deviation is zero) are treated as "not an
        outlier", so such columns no longer cause every row to be dropped.
    """
    # np.number covers every numeric width, not only float64/int64.
    numeric = df.select_dtypes(include=[np.number])

    # mean()/std() skip NaN; a zero standard deviation yields NaN scores.
    z_scores = (numeric - numeric.mean()) / numeric.std(ddof=0)

    # NaN >= threshold is False, so undefined scores never flag a row.
    is_outlier = (z_scores.abs() >= threshold).any(axis=1)

    return df[~is_outlier]

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# data[['X2 house age', 'X3 distance to the nearest MRT station', 'X4 number of convenience stores', 'relevant_facilities', 'distance_to_city_center']] = scaler.fit_transform(data[['X2 house age', 'X3 distance to the nearest MRT station', 'X4 number of convenience stores', 'relevant_facilities', 'distance_to_city_center']])

TARGET_COLUMN = 'Y house price of unit area'

# Every column except the target is used as a feature.
X = data.drop(columns=[TARGET_COLUMN])
y = data[TARGET_COLUMN]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training portion.
model = LinearRegression()
model.fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 41.90977276560005


In [None]:
# Load the rows to predict for.
data_test = pd.read_csv('prices_test.csv')

# Preview the first rows of the raw test frame.
data_test.head()

Unnamed: 0.1,Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude
0,0,2012.833,3.4,,,24.95744,121.53711
1,1,2013.083,34.8,405.2134,1.0,24.97349,121.53372
2,2,2013.5,4.1,2147.376,3.0,24.96299,121.51284
3,3,2012.917,18.9,1009.235,0.0,24.96357,121.54951
4,4,2013.417,3.9,2147.376,3.0,24.96299,121.51284


In [None]:
# Drop the 'Unnamed: 0' column, as is done for the training frame.
data_test = data_test.drop(columns=['Unnamed: 0'])

# Fill missing values with the per-column mean of the test set.
data_test = data_test.fillna(data_test.mean())

# Compute the distance from the TEST rows' own coordinates. Applying over the
# training frame `data` here would attach training distances to test rows by
# index alignment.
data_test['distance_to_city_center'] = data_test.apply(
    lambda row: haversine(
        row['X5 latitude'],
        row['X6 longitude'],
        24.97687750605377,
        121.53868248906382,
    ),
    axis=1,
)

In [None]:
# Re-register progress_apply with a label for the test-set lookup.
tqdm.pandas(desc="Поиск relevant_facilities")


def _facilities_near_test_row(row):
    """Look up the facility count for one test row's latitude/longitude."""
    return get_relevant_facilities_near_coords(row['X5 latitude'], row['X6 longitude'])


# The lookup runs once per test row.
data_test['relevant_facilities'] = data_test.progress_apply(_facilities_near_test_row, axis=1)

Поиск relevant_facilities: 100%|██████████| 83/83 [03:04<00:00,  2.23s/it]


In [None]:
# Split the fractional-year transaction date into integer Year and Month
# features (appended in that order), then drop the original column.
data_test = data_test.assign(
    Year=data_test['X1 transaction date'].apply(lambda value: int(value)),
    Month=data_test['X1 transaction date'].apply(
        lambda value: round((value - int(value)) * 12)
    ),
).drop(columns=['X1 transaction date'])

In [None]:
# Fill any values that are still missing with the per-column mean.
data_test = data_test.fillna(data_test.mean())

# Predict from exactly the training feature columns, in training order.
# Passing the whole frame would fail once this cell has added the prediction
# column (e.g. when the cell is re-run) or if any other extra column exists.
data_test['Y house price of unit area'] = model.predict(data_test[X.columns])

In [None]:
# Sequential 0-based row id written next to each prediction.
data_test['index'] = range(len(data_test))

# Write only the id and the predicted price; the DataFrame index is not written.
submission = data_test[['index', 'Y house price of unit area']]
submission.to_csv('result.csv', index=False)