In [11]:
import os
import kaggle
import zipfile
import requests
import numpy as np
import pandas as pd

import warnings

warnings.filterwarnings('ignore')

# Constants

In [None]:
MIN_AREA = 20  # Outlier range for floor area
MAX_AREA = 200

MIN_KITCHEN = 6  # Outlier range for kitchen area
MAX_KITCHEN = 30

MIN_PRICE = 1_500_000  # Outlier range for price
MAX_PRICE = 50_000_000

MIN_SQM_PRICE = 75_000  # Outlier range for price per sq. meter
MAX_SQM_PRICE = 250_000

TEST_SIZE = 0.1

# Features to use in Nearest Neighbours model.
# Every entry must be the exact name of a column in the frame returned by
# clean_data(); the floor-ratio column is created there as 'levels_to_levels'
# (the previous entry 'level_to_levels' matched no column).
FEATURES = ['geo_lat', 'geo_lon', 'building_type', 'level', 'levels',
            'area', 'kitchen_area', 'object_type', 'year', 'month',
            'levels_to_levels', 'area_to_rooms']

In [None]:
# Make the parent of the current working directory the new working directory,
# so that the relative imports and data paths below resolve from there.
# NOTE: this cell is not idempotent - each additional run climbs one more level.
os.chdir(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

In [None]:
# Data root: the "data" folder directly under the current working directory.
directory = os.getcwd() + "/data"
print(directory)

# Get raw datasets

In [None]:
from src.data import make_datasets

In [None]:
# Bare reference to the imported module: evaluating it only echoes the module's repr.
# NOTE(review): if the intent is to build the raw datasets here, an explicit call
# into the module is presumably needed - confirm against src/data/make_datasets.py.
make_datasets

# Get interim datasets

In [None]:
def get_subways(path: str) -> pd.DataFrame:
    """
    Return a DataFrame with metro stations' data, using a CSV cache.

    If ``{path}/external/spb_subways.csv`` exists it is read and returned.
    Otherwise the stations are requested from the Overpass API, stored in that
    CSV (the ``external`` folder is created when missing) and returned.

    :param path: data directory that holds the ``external`` sub-folder.
    :return df_subway: DataFrame with the columns StationName, lat, lon.
    :raises requests.HTTPError: when the Overpass endpoint answers with an
        error status (download branch only).
    """
    csv_path = f"{path}/external/spb_subways.csv"

    if os.path.isfile(csv_path):
        print("You already have the spb_subways dataset!")
        df_subway = pd.read_csv(csv_path)
        print(df_subway.head(5))

    else:
        overpass_url = "https://maps.mail.ru/osm/tools/overpass/api//interpreter"
        overpass_query = """
        [out:json];
        area["ISO3166-2"="RU-SPE"][admin_level=4];
        (node["station"="subway"](area);
         way["station"="subway"](area);
         rel["station"="subway"](area);
        );
        out center;
        """
        # A timeout keeps the notebook from hanging forever on a stalled
        # connection; raise_for_status() surfaces HTTP errors instead of
        # failing later with an opaque JSON decoding error.
        response = requests.get(
            overpass_url, params={"data": overpass_query}, timeout=180
        )
        response.raise_for_status()
        elements = response.json()["elements"]

        # Only "node" elements carry lat/lon directly. Collect all rows first
        # and build the frame once instead of concatenating inside the loop.
        records = [
            {
                "StationName": element["tags"]["name"],
                "lat": element["lat"],
                "lon": element["lon"],
            }
            for element in elements
            if element["type"] == "node"
        ]
        df_subway = pd.DataFrame(records, columns=["StationName", "lat", "lon"])

        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        df_subway.to_csv(csv_path, index=False)

    # Single exit point: the cached branch used to fall through and return None.
    return df_subway

In [None]:
# Make sure the metro-station CSV exists under `directory` (downloaded on first run,
# read from the cache afterwards).
get_subways(directory)

# Clean data

In [None]:
# Read the raw df_spb.csv table and preview its first five rows.
df = pd.read_csv("{}/raw/df_spb.csv".format(directory))
df.head(n=5)

# Get raw datasets

In [15]:
from src.data import make_datasets

In [16]:
# Bare reference to the imported module: evaluating it only echoes the module's repr.
# NOTE(review): if the intent is to build the raw datasets here, an explicit call
# into the module is presumably needed - confirm against src/data/make_datasets.py.
make_datasets

<module 'src.data.make_datasets' from '/media/jayokocha/7ac94045-6379-4189-aecc-24cea88b8aa5/aleksandr/PycharmProjects/house-prediction/src/data/make_datasets.py'>

# Get interim datasets

In [7]:
def get_subways(path: str) -> pd.DataFrame:
    """
    Return a DataFrame with metro stations' data, using a CSV cache.

    If ``{path}/external/spb_subways.csv`` exists it is read and returned.
    Otherwise the stations are requested from the Overpass API, stored in that
    CSV (the ``external`` folder is created when missing) and returned.

    :param path: data directory that holds the ``external`` sub-folder.
    :return df_subway: DataFrame with the columns StationName, lat, lon.
    :raises requests.HTTPError: when the Overpass endpoint answers with an
        error status (download branch only).
    """
    csv_path = f"{path}/external/spb_subways.csv"

    if os.path.isfile(csv_path):
        print("You already have the spb_subways dataset!")
        df_subway = pd.read_csv(csv_path)
        print(df_subway.head(5))

    else:
        overpass_url = "https://maps.mail.ru/osm/tools/overpass/api//interpreter"
        overpass_query = """
        [out:json];
        area["ISO3166-2"="RU-SPE"][admin_level=4];
        (node["station"="subway"](area);
         way["station"="subway"](area);
         rel["station"="subway"](area);
        );
        out center;
        """
        # A timeout keeps the notebook from hanging forever on a stalled
        # connection; raise_for_status() surfaces HTTP errors instead of
        # failing later with an opaque JSON decoding error.
        response = requests.get(
            overpass_url, params={"data": overpass_query}, timeout=180
        )
        response.raise_for_status()
        elements = response.json()["elements"]

        # Only "node" elements carry lat/lon directly. Collect all rows first
        # and build the frame once instead of concatenating inside the loop.
        records = [
            {
                "StationName": element["tags"]["name"],
                "lat": element["lat"],
                "lon": element["lon"],
            }
            for element in elements
            if element["type"] == "node"
        ]
        df_subway = pd.DataFrame(records, columns=["StationName", "lat", "lon"])

        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        df_subway.to_csv(csv_path, index=False)

    # Single exit point: the cached branch used to fall through and return None.
    return df_subway

In [8]:
# Make sure the metro-station CSV exists under `directory` (downloaded on first run,
# read from the cache afterwards).
get_subways(directory)

You already have the spb_subways dataset!
     StationName        lat        lon
0  Петроградская  59.964915  30.312830
1    Горьковская  59.955156  30.319457
2     Спортивная  59.950128  30.289065
3     Чкаловская  59.959334  30.292420
4       Удельная  60.017931  30.318159


# Clean data

In [9]:
# Read the raw df_spb.csv table and preview its first five rows.
df = pd.read_csv("{}/raw/df_spb.csv".format(directory))
df.head(n=5)

Unnamed: 0,price,date,time,geo_lat,geo_lon,region,building_type,level,levels,rooms,area,kitchen_area,object_type
0,6050000,2018-02-19,20:00:21,59.805808,30.376141,2661,1,8,10,3,82.6,10.8,1
1,3600000,2018-03-04,20:52:38,59.875526,30.395457,2661,1,2,5,1,31.1,6.0,1
2,3200000,2018-04-10,16:10:12,59.827465,30.201827,2661,1,7,9,1,31.0,7.0,1
3,6500000,2018-04-23,11:47:58,59.988334,29.786928,2661,3,3,3,3,89.0,10.0,1
4,6300000,2018-04-28,17:47:24,59.911622,30.284556,2661,3,5,5,2,99.9,14.5,1


In [10]:
# Summary statistics of the numeric columns, to inspect value ranges before cleaning.
df.describe()

Unnamed: 0,price,geo_lat,geo_lon,region,building_type,level,levels,rooms,area,kitchen_area,object_type
count,461820.0,461820.0,461820.0,461820.0,461820.0,461820.0,461820.0,461820.0,461820.0,461820.0,461820.0
mean,8209775.0,59.932848,30.319601,2661.0,1.948281,7.582619,14.336625,1.778953,58.582237,12.749967,5.271231
std,14160360.0,0.084615,0.12012,0.0,0.954559,5.694418,7.073714,1.124851,42.138205,7.641011,4.94661
min,-1719935000.0,59.647383,29.509349,2661.0,0.0,1.0,1.0,-2.0,1.4,0.02,1.0
25%,4400000.0,59.863518,30.256876,2661.0,1.0,3.0,9.0,1.0,37.6,8.5,1.0
50%,5950000.0,59.939084,30.324338,2661.0,2.0,6.0,13.0,2.0,51.0,11.1,1.0
75%,8800000.0,59.999287,30.397938,2661.0,3.0,11.0,20.0,2.0,69.0,15.15,11.0
max,1945383000.0,60.241984,30.711428,2661.0,5.0,35.0,38.0,9.0,7856.0,1272.0,11.0


In [11]:
# Column dtypes, non-null counts and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 461820 entries, 0 to 461819
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   price          461820 non-null  int64  
 1   date           461820 non-null  object 
 2   time           461820 non-null  object 
 3   geo_lat        461820 non-null  float64
 4   geo_lon        461820 non-null  float64
 5   region         461820 non-null  int64  
 6   building_type  461820 non-null  int64  
 7   level          461820 non-null  int64  
 8   levels         461820 non-null  int64  
 9   rooms          461820 non-null  int64  
 10  area           461820 non-null  float64
 11  kitchen_area   461820 non-null  float64
 12  object_type    461820 non-null  int64  
dtypes: float64(4), int64(7), object(2)
memory usage: 45.8+ MB


In [12]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Remove outliers and add temporal features and ratios for the
    "area" and "level" parameters.

    The input frame is left untouched; every step works on a new frame.

    Steps:
      * negative ``rooms`` become 0 and ``price`` is replaced by its absolute value;
      * rows outside [MIN_AREA, MAX_AREA], [MIN_PRICE, MAX_PRICE] or
        [MIN_SQM_PRICE, MAX_SQM_PRICE] (price / area) are dropped;
      * ``kitchen_area`` is set to 0 where it is >= MAX_KITCHEN or where
        ``area`` <= MIN_AREA;
      * ``year`` and ``month`` are derived from ``date``; the columns ``date``,
        ``time`` and ``region`` are dropped;
      * ``levels_to_levels`` (level / levels) and ``area_to_rooms`` are added.

    :param df: raw listings; must contain the columns price, date, time,
        region, level, levels, rooms, area and kitchen_area.
    :return: cleaned frame; kept rows retain their original index labels.
    """
    # Fix negative values. assign() returns a new frame, so the caller's
    # DataFrame is never modified.
    cleaned = df.assign(
        rooms=df['rooms'].clip(lower=0),
        price=df['price'].abs(),
    )
    # Drop prices and area outliers (both bounds inclusive)
    cleaned = cleaned[cleaned['area'].between(MIN_AREA, MAX_AREA)]
    cleaned = cleaned[cleaned['price'].between(MIN_PRICE, MAX_PRICE)]
    # Drop outliers based on price per square meter
    cleaned = cleaned.assign(sqm_price=cleaned['price'] / cleaned['area'])
    cleaned = cleaned[cleaned['sqm_price'].between(MIN_SQM_PRICE, MAX_SQM_PRICE)]
    # Take an explicit copy before writing into the filtered frame, so the
    # assignments below never target a slice of another frame.
    cleaned = cleaned.copy()
    # Fix kitchen area
    bad_kitchen = (cleaned['kitchen_area'] >= MAX_KITCHEN) | (cleaned['area'] <= MIN_AREA)
    cleaned.loc[bad_kitchen, 'kitchen_area'] = 0

    # Delete region and change data format
    dates = pd.to_datetime(cleaned['date'])
    cleaned = cleaned.assign(year=dates.dt.year, month=dates.dt.month)
    cleaned = cleaned.drop(columns=['date', 'time', 'region'])
    # Apartment floor in relation to total number of floors.
    # The column name 'levels_to_levels' is kept as-is so existing consumers
    # of the cleaned frame keep working.
    cleaned = cleaned.assign(levels_to_levels=cleaned['level'] / cleaned['levels'])
    area_per_room = (cleaned['area'] / cleaned['rooms']).abs()
    # Fix division by zero: rooms == 0 yields inf, fall back to the full area.
    cleaned['area_to_rooms'] = area_per_room.where(area_per_room != np.inf, cleaned['area'])

    return cleaned

In [13]:
# Replace the raw frame with its cleaned version.
# NOTE: re-running this cell on the already-cleaned frame fails, because
# clean_data() reads the 'date' column that it drops; reload df first.
df = clean_data(df)

In [14]:
# Preview the first five rows of the cleaned frame, including the added columns.
df.head(5)

Unnamed: 0,price,geo_lat,geo_lon,building_type,level,levels,rooms,area,kitchen_area,object_type,sqm_price,year,month,levels_to_levels,area_to_rooms
1,3600000,59.875526,30.395457,1,2,5,1,31.1,6.0,1,115755.62701,2018,3,0.4,31.1
2,3200000,59.827465,30.201827,1,7,9,1,31.0,7.0,1,103225.806452,2018,4,0.777778,31.0
5,7100000,60.011172,30.243795,3,7,12,3,69.0,11.0,1,102898.550725,2018,5,0.583333,23.0
6,8500000,59.849104,30.314112,3,14,14,2,60.0,11.0,1,141666.666667,2018,5,1.0,30.0
7,3830000,59.98149,30.37173,2,2,17,1,44.9,16.5,11,85300.668151,2018,6,0.117647,44.9


# Add features

In [15]:
# Load the cached metro-station table from the external data folder.
subway = pd.read_csv("{}/external/spb_subways.csv".format(directory))

In [16]:
# Preview the first rows of the station table.
subway.head()

Unnamed: 0,StationName,lat,lon
0,Петроградская,59.964915,30.31283
1,Горьковская,59.955156,30.319457
2,Спортивная,59.950128,30.289065
3,Чкаловская,59.959334,30.29242
4,Удельная,60.017931,30.318159
