https://www.kaggle.com/mrdaniilak/russia-real-estate-20182021

# 1. Importing Library and Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('/Users/danielbeltsazar/DS-ML-DL Mini Projects/Real Estate Price/all_v2.csv')
df.head()

Unnamed: 0,price,date,time,geo_lat,geo_lon,region,building_type,level,levels,rooms,area,kitchen_area,object_type
0,6050000,2018-02-19,20:00:21,59.805808,30.376141,2661,1,8,10,3,82.6,10.8,1
1,8650000,2018-02-27,12:04:54,55.683807,37.297405,81,3,5,24,2,69.1,12.0,1
2,4000000,2018-02-28,15:44:00,56.29525,44.061637,2871,1,5,9,3,66.0,10.0,1
3,1850000,2018-03-01,11:24:52,44.996132,39.074783,2843,4,12,16,2,38.0,5.0,11
4,5450000,2018-03-01,17:42:43,55.918767,37.984642,81,3,13,14,2,60.0,10.0,1


The dataset has 13 fields.

1. date - date of publication of the announcement;
2. time - the time when the ad was published;
3. geo_lat - Latitude
4. geo_lon - Longitude
5. region - Region of Russia. There are 85 subjects in the country in total.
6. building_type - Facade type. 0 - Other. 1 - Panel. 2 - Monolithic. 3 - Brick. 4 - Blocky. 5 - Wooden
7. object_type - Apartment type. 1 - Secondary real estate market; 2 - New building;
8. level - Apartment floor
9. levels - Number of storeys
10. rooms - the number of living rooms. If the value is "-1", then it means "studio apartment"
11. area - the total area of the apartment
12. kitchen_area - Kitchen area
13. price - Price. in rubles

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5477006 entries, 0 to 5477005
Data columns (total 13 columns):
 #   Column         Dtype  
---  ------         -----  
 0   price          int64  
 1   date           object 
 2   time           object 
 3   geo_lat        float64
 4   geo_lon        float64
 5   region         int64  
 6   building_type  int64  
 7   level          int64  
 8   levels         int64  
 9   rooms          int64  
 10  area           float64
 11  kitchen_area   float64
 12  object_type    int64  
dtypes: float64(4), int64(7), object(2)
memory usage: 543.2+ MB


In [4]:
df.describe()

Unnamed: 0,price,geo_lat,geo_lon,region,building_type,level,levels,rooms,area,kitchen_area,object_type
count,5477006.0,5477006.0,5477006.0,5477006.0,5477006.0,5477006.0,5477006.0,5477006.0,5477006.0,5477006.0,5477006.0
mean,4422029.0,54.03826,53.24433,4307.141,1.948966,6.21453,11.39892,1.726173,53.91825,10.6284,3.945399
std,21507520.0,4.622758,20.74763,3308.05,1.038537,4.957419,6.535734,1.082133,33.35293,9.79238,4.558357
min,-2144967000.0,41.45906,19.8902,3.0,0.0,1.0,1.0,-2.0,0.07,0.01,1.0
25%,1950000.0,53.37768,37.7779,2661.0,1.0,2.0,5.0,1.0,38.0,7.0,1.0
50%,2990000.0,55.17139,43.06774,2922.0,2.0,5.0,10.0,2.0,48.02,9.7,1.0
75%,4802000.0,56.22613,65.64895,6171.0,3.0,9.0,16.0,2.0,63.13,12.7,11.0
max,2147484000.0,71.9804,162.5361,61888.0,5.0,39.0,39.0,10.0,7856.0,9999.0,11.0


# 2. Data Cleaning

## 2.1 Missing Value

In [5]:
df.isnull().sum()

price            0
date             0
time             0
geo_lat          0
geo_lon          0
region           0
building_type    0
level            0
levels           0
rooms            0
area             0
kitchen_area     0
object_type      0
dtype: int64

## 2.2. Price Values Checking

In [6]:
df[df['price']<=0]

Unnamed: 0,price,date,time,geo_lat,geo_lon,region,building_type,level,levels,rooms,area,kitchen_area,object_type
43821,-1633967296,2018-09-14,10:48:27,57.983337,56.216044,5520,1,13,22,1,38.92,17.26,11
43822,-1633967296,2018-09-14,10:48:27,57.983337,56.216044,5520,1,14,22,1,38.92,17.26,11
179212,0,2018-10-01,08:59:36,53.327778,83.668570,6817,3,5,10,3,104.00,15.00,1
199895,-429242296,2018-10-04,09:21:35,55.038734,82.985600,9654,2,5,25,2,57.27,9.31,11
208483,-1744967296,2018-10-06,05:06:46,54.991330,82.882575,9654,3,4,5,2,45.00,6.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5382188,-2041757296,2021-04-23,07:00:09,57.636265,39.973962,2604,3,2,4,1,41.08,9.31,11
5382595,-1794967296,2021-04-23,07:37:00,56.115619,38.416266,81,1,4,5,2,56.00,9.00,1
5398253,-1494967296,2021-04-26,05:04:58,45.111303,36.881901,2843,3,3,3,1,45.00,17.00,11
5419426,-2094967296,2021-04-27,09:05:31,58.223183,68.305352,3991,3,14,19,1,30.00,8.00,1


### Apparently we have 388 rows data where our price values are non-positive. It doesn't make sense. So i will drop it anyway.

In [7]:
df = df[df['price']>0]
df.shape

(5476618, 13)

## 2.3 Re-encoding our dataset

### Our dataset consists categorical values that have been encoded. I want to make some analysis and to do so i need to re-encode or invers encode our dataset so we can easily make further analysis and visualization.

In [8]:
df_real = df.copy()

In [9]:
df_real['building_type']= df_real['building_type'].replace([0,1,2,3,4,5],['Other','Panel','Monolithic','Brick','Blocky','Wooden'])
df_real['object_type']= df_real['object_type'].replace([1,2,11],['Secondary Real Estate','New Building','Other(Unknown)'])


In [10]:
df_real.head()

Unnamed: 0,price,date,time,geo_lat,geo_lon,region,building_type,level,levels,rooms,area,kitchen_area,object_type
0,6050000,2018-02-19,20:00:21,59.805808,30.376141,2661,Panel,8,10,3,82.6,10.8,Secondary Real Estate
1,8650000,2018-02-27,12:04:54,55.683807,37.297405,81,Brick,5,24,2,69.1,12.0,Secondary Real Estate
2,4000000,2018-02-28,15:44:00,56.29525,44.061637,2871,Panel,5,9,3,66.0,10.0,Secondary Real Estate
3,1850000,2018-03-01,11:24:52,44.996132,39.074783,2843,Blocky,12,16,2,38.0,5.0,Other(Unknown)
4,5450000,2018-03-01,17:42:43,55.918767,37.984642,81,Brick,13,14,2,60.0,10.0,Secondary Real Estate


## 2.4. Converting time and date to datetime data type

In [11]:
df_real['date']=pd.to_datetime(df_real['date'])
df_real['time']=pd.to_datetime(df_real['time'])

In [12]:
df_real.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5476618 entries, 0 to 5477005
Data columns (total 13 columns):
 #   Column         Dtype         
---  ------         -----         
 0   price          int64         
 1   date           datetime64[ns]
 2   time           datetime64[ns]
 3   geo_lat        float64       
 4   geo_lon        float64       
 5   region         int64         
 6   building_type  object        
 7   level          int64         
 8   levels         int64         
 9   rooms          int64         
 10  area           float64       
 11  kitchen_area   float64       
 12  object_type    object        
dtypes: datetime64[ns](2), float64(4), int64(5), object(2)
memory usage: 585.0+ MB


### I will divide our price value by 1 million so we can visualize it easier. It is not necessary. I do it because it will help me to visualize the price when i do some explorations.

In [13]:
df_real['price']=df_real['price']/1000000
df_real = df_real.rename(columns={'price':'price (Million)'})


In [14]:
df_real.head()

Unnamed: 0,price (Million),date,time,geo_lat,geo_lon,region,building_type,level,levels,rooms,area,kitchen_area,object_type
0,6.05,2018-02-19,2022-03-05 20:00:21,59.805808,30.376141,2661,Panel,8,10,3,82.6,10.8,Secondary Real Estate
1,8.65,2018-02-27,2022-03-05 12:04:54,55.683807,37.297405,81,Brick,5,24,2,69.1,12.0,Secondary Real Estate
2,4.0,2018-02-28,2022-03-05 15:44:00,56.29525,44.061637,2871,Panel,5,9,3,66.0,10.0,Secondary Real Estate
3,1.85,2018-03-01,2022-03-05 11:24:52,44.996132,39.074783,2843,Blocky,12,16,2,38.0,5.0,Other(Unknown)
4,5.45,2018-03-01,2022-03-05 17:42:43,55.918767,37.984642,81,Brick,13,14,2,60.0,10.0,Secondary Real Estate


## 2.5. Outlier Removing

### I usually put outlier removing process in feature preprocessing section before modelling. But this time i will do this process before we continue exploring our data. We have large dataset, it should be our concern before we do exploration and modelling.

### We will check our price data

In [15]:
df_real[df_real['price (Million)']>1000]

Unnamed: 0,price (Million),date,time,geo_lat,geo_lon,region,building_type,level,levels,rooms,area,kitchen_area,object_type
900,1451.892000,2018-09-08,2022-03-05 01:54:34,55.013994,83.003319,9654,Panel,9,10,3,59.02,45.50,Other(Unknown)
908,1003.425000,2018-09-08,2022-03-05 01:54:43,55.013994,83.003319,9654,Panel,9,10,1,39.35,4.39,Other(Unknown)
5215,1003.425000,2018-09-09,2022-03-05 04:00:07,55.013994,83.003319,9654,Panel,9,10,1,39.35,4.39,Other(Unknown)
5224,1451.892000,2018-09-09,2022-03-05 04:00:16,55.013994,83.003319,9654,Panel,9,10,3,59.02,45.50,Other(Unknown)
9890,1451.892000,2018-09-10,2022-03-05 04:35:26,55.013994,83.003319,9654,Panel,9,10,3,59.02,45.50,Other(Unknown)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5131302,1500.000000,2021-03-11,2022-03-05 09:49:14,56.121198,47.180265,3019,Brick,8,9,1,31.00,5.60,Secondary Real Estate
5217895,1096.745000,2021-03-25,2022-03-05 14:25:21,55.758177,37.592653,3,Brick,2,7,5,420.00,52.00,Secondary Real Estate
5232384,1555.032704,2021-03-29,2022-03-05 12:22:05,56.319472,44.054239,2871,Brick,2,9,3,61.50,7.30,Secondary Real Estate
5389809,1600.000000,2021-04-24,2022-03-05 09:29:19,44.839809,38.494939,2843,Brick,3,3,2,53.00,13.00,Secondary Real Estate


### Those are are really expensive apartments. They cost more than 1 billion rubels. If we look at the area or rooms or the building type of the apartments in the dataframe above, we will see that they are not so different with other apartements which have lower prices. I will check if they are outliers.

### I will use IQR method.

In [16]:
Qp11 = df_real['price (Million)'].quantile(0.25)
Qp31 = df_real['price (Million)'].quantile(0.75)
IQR1 = Qp31-Qp11

In [17]:
df_real[df_real['price (Million)']>(Qp31+(1.5*IQR1))]

Unnamed: 0,price (Million),date,time,geo_lat,geo_lon,region,building_type,level,levels,rooms,area,kitchen_area,object_type
29,11.50000,2018-04-03,2022-03-05 17:40:53,55.798921,37.738090,3,Panel,14,16,3,64.9,8.0,Secondary Real Estate
34,10.20000,2018-04-07,2022-03-05 15:23:20,55.655307,37.614605,3,Monolithic,7,14,2,52.8,10.0,Secondary Real Estate
44,11.50000,2018-04-18,2022-03-05 21:52:41,55.701330,37.507412,3,Panel,14,17,1,42.0,12.0,Secondary Real Estate
56,20.90000,2018-05-01,2022-03-05 13:33:09,55.730785,37.631424,3,Brick,6,10,3,80.0,8.2,Secondary Real Estate
59,12.00000,2018-05-12,2022-03-05 21:08:19,55.627136,37.591736,3,Panel,4,9,3,64.9,7.0,Secondary Real Estate
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5476999,12.85000,2021-05-01,2022-03-05 20:13:47,55.701280,37.642654,3,Monolithic,12,24,1,41.0,9.0,Secondary Real Estate
5477001,19.73976,2021-05-01,2022-03-05 20:13:58,55.804736,37.750898,3,Panel,8,17,4,93.2,13.8,Other(Unknown)
5477002,12.50316,2021-05-01,2022-03-05 20:14:01,55.841415,37.489624,3,Monolithic,17,32,2,45.9,6.6,Other(Unknown)
5477004,11.83191,2021-05-01,2022-03-05 20:14:12,55.804736,37.750898,3,Panel,8,33,2,52.1,18.9,Other(Unknown)


In [18]:
df_real[df_real['price (Million)']<(Qp11-(1.5*IQR1))]

Unnamed: 0,price (Million),date,time,geo_lat,geo_lon,region,building_type,level,levels,rooms,area,kitchen_area,object_type


### According to IQR Method, there are more than 400 thousand outliers data. 

### Because we have many outliers data, i think it will be good if i drop only the outliers which have price values more than their average. I will see what is their price average and i will drop the data which have price values more than that average.

In [19]:
df_outlier = df_real[df_real['price (Million)']>(Qp31+(1.5*IQR1))]
df_outlier['price (Million)'].mean()

19.274886411002015

In [28]:
df_outlier.groupby(['building_type'])['building_type'].agg('count')

building_type
Blocky          6777
Brick         114653
Monolithic    211144
Other          26738
Panel          72759
Wooden           380
Name: building_type, dtype: int64

### The average price value of our outliers is 19.27 million rubels. From our outliers. monolithic building type dominates the outliers. I will drop the data which have price values more than around 19 million rubels.

In [20]:
df_real = df_real[df_real['price (Million)']<19]
df_real.shape

(5388250, 13)

### But after checking in the internet, the average cost of apartment of real estate in Russia is about 5000-100000 Poundsterling or more than 500 thousand rubels.

In [40]:
df_real[df_real['price (Million)']<0.5]

Unnamed: 0,price (Million),date,time,geo_lat,geo_lon,region,building_type,level,levels,rooms,area,kitchen_area,object_type
1514,0.3000,2018-09-08,2022-03-05 04:59:20,52.561070,85.234409,6817,Wooden,1,1,1,26.0,7.0,Secondary Real Estate
1596,0.4800,2018-09-08,2022-03-05 06:43:33,51.523402,81.224751,6817,Panel,1,5,1,31.0,6.0,Secondary Real Estate
1916,0.2500,2018-09-08,2022-03-05 09:08:31,48.046666,40.125644,3230,Panel,1,5,1,21.0,6.0,Secondary Real Estate
2234,0.4550,2018-09-08,2022-03-05 10:21:48,58.270120,59.706687,6171,Brick,1,2,2,49.9,3.0,Secondary Real Estate
2241,0.4500,2018-09-08,2022-03-05 10:23:33,45.508422,38.083928,2843,Panel,2,2,1,45.0,12.0,Secondary Real Estate
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5474635,0.3200,2021-05-01,2022-03-05 16:18:49,55.784830,42.628058,2871,Brick,2,2,2,45.0,6.0,Secondary Real Estate
5476102,0.0063,2021-05-01,2022-03-05 18:35:48,55.926270,37.723278,81,Monolithic,15,17,1,33.0,9.4,Secondary Real Estate
5476156,0.0110,2021-05-01,2022-03-05 18:42:38,48.747428,44.509038,4695,Monolithic,18,18,1,38.0,11.0,Secondary Real Estate
5476168,0.3500,2021-05-01,2022-03-05 18:44:43,63.845328,57.299018,4417,Panel,2,5,3,59.0,6.0,Secondary Real Estate


### There more than 17 thousand data which have price values less than 500 K rubels. They even have prices around 10 rubels, which don't make sense. I will drop them. 

# 3. Exploratory Data Analysis

## 3.1 Basic Exploration

### Here we see average, maximum, and minimum prices of our data in each building type

In [21]:
df_real.groupby(['building_type'])['price (Million)'].agg(['mean','max','min'])

Unnamed: 0_level_0,mean,max,min
building_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Blocky,3.366753,18.999,1e-06
Brick,3.569716,18.999999,1e-06
Monolithic,5.542573,18.999999,1e-06
Other,4.438209,18.999,0.004474
Panel,3.15195,18.999999,1e-06
Wooden,1.775368,18.999,1e-06


In [32]:
df_real[df_real['price (Million)']<0.00005]

Unnamed: 0,price (Million),date,time,geo_lat,geo_lon,region,building_type,level,levels,rooms,area,kitchen_area,object_type
278888,1e-06,2018-10-16,2022-03-05 08:45:44,67.496464,63.729965,4417,Panel,3,5,2,46.0,6.0,Secondary Real Estate
1151032,1e-06,2019-02-14,2022-03-05 19:56:25,57.649913,63.068436,6171,Panel,1,5,1,37.0,5.0,Secondary Real Estate
1203872,1e-06,2019-02-22,2022-03-05 10:38:34,44.912524,40.604708,2843,Brick,1,2,3,70.0,15.0,Secondary Real Estate
1247135,1e-05,2019-02-27,2022-03-05 17:15:30,58.373542,58.332911,5520,Blocky,4,5,3,54.0,6.0,Secondary Real Estate
1378030,1e-06,2019-03-16,2022-03-05 02:47:55,45.015628,37.756383,2843,Brick,2,2,3,120.0,10.0,Secondary Real Estate
1555136,1e-06,2019-04-07,2022-03-05 19:24:11,54.270975,42.885338,5241,Wooden,1,1,4,90.0,10.0,Secondary Real Estate
1569167,1e-05,2019-04-09,2022-03-05 09:36:09,56.137265,38.431157,81,Monolithic,5,7,2,58.0,9.0,Secondary Real Estate
1635596,1e-06,2019-04-18,2022-03-05 10:01:36,56.179027,50.398585,2922,Panel,1,3,2,37.3,6.0,Secondary Real Estate
1763286,1e-06,2019-05-09,2022-03-05 06:50:18,51.959486,116.585325,10160,Brick,1,2,2,54.0,9.0,Secondary Real Estate
1969455,1e-06,2019-06-10,2022-03-05 15:59:19,43.7618,43.984623,9648,Panel,2,5,2,52.0,9.0,Secondary Real Estate


In [29]:
0.00001*1000000

10.0

In [23]:
df_real.groupby(['object_type'])['price (Million)'].mean()

object_type
Other(Unknown)           4.096926
Secondary Real Estate    3.747408
Name: price (Million), dtype: float64

## Next task : Remove negative values for price feature