# <center> Модель прогнозирования стоимости жилья для агентства недвижимости </center>

Импорт библиотек

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, SGDRegressor

import warnings
# NOTE(review): this silences every warning category for the rest of the
# notebook, library deprecation/FutureWarning messages included — consider
# narrowing the filter to the specific warnings that are known to be noise.
warnings.filterwarnings('ignore')

# Use the 'bmh' matplotlib style sheet for all figures drawn below.
plt.style.use('bmh')

## Данные

Читаем данные

In [2]:
# Load the raw listings table from the project-relative CSV file.
data = pd.read_csv('data/data.csv')

# Preview the first rows, then report the table dimensions.
row_count, column_count = data.shape
display(data.head())
print(f'{row_count} rows, {column_count} columns')

Unnamed: 0,status,private pool,propertyType,street,baths,homeFacts,fireplace,city,schools,sqft,zipcode,beds,state,stories,mls-id,PrivatePool,MlsId,target
0,Active,,Single Family Home,240 Heather Ln,3.5,"{'atAGlanceFacts': [{'factValue': '2019', 'fac...",Gas Logs,Southern Pines,"[{'rating': ['4', '4', '7', 'NR', '4', '7', 'N...",2900,28387,4,NC,,,,611019,"$418,000"
1,for sale,,single-family home,12911 E Heroy Ave,3 Baths,"{'atAGlanceFacts': [{'factValue': '2019', 'fac...",,Spokane Valley,"[{'rating': ['4/10', 'None/10', '4/10'], 'data...","1,947 sqft",99216,3 Beds,WA,2.0,,,201916904,"$310,000"
2,for sale,,single-family home,2005 Westridge Rd,2 Baths,"{'atAGlanceFacts': [{'factValue': '1961', 'fac...",yes,Los Angeles,"[{'rating': ['8/10', '4/10', '8/10'], 'data': ...","3,000 sqft",90049,3 Beds,CA,1.0,,yes,FR19221027,"$2,895,000"
3,for sale,,single-family home,4311 Livingston Ave,8 Baths,"{'atAGlanceFacts': [{'factValue': '2006', 'fac...",yes,Dallas,"[{'rating': ['9/10', '9/10', '10/10', '9/10'],...","6,457 sqft",75205,5 Beds,TX,3.0,,,14191809,"$2,395,000"
4,for sale,,lot/land,1524 Kiscoe St,,"{'atAGlanceFacts': [{'factValue': '', 'factLab...",,Palm Bay,"[{'rating': ['4/10', '5/10', '5/10'], 'data': ...",,32908,,FL,,,,861745,"$5,000"


377185 rows, 18 columns


Основная информация о данных

In [3]:
def get_data_info(data):
    """Build a per-column summary of a DataFrame.

    Parameters
    ----------
    data : pd.DataFrame
        Table to describe.

    Returns
    -------
    pd.DataFrame
        One row per column of ``data`` with three columns:
        ``Nulls`` - percentage of missing values, rounded to 2 decimals;
        ``Uniques`` - number of distinct values as reported by ``nunique()``;
        ``Dtypes`` - dtype of the column.
    """
    row_count = data.shape[0]
    null_percent = (data.isna().sum() * 100 / row_count).round(2)
    unique_counts = data.nunique()
    column_dtypes = data.dtypes

    summary = pd.DataFrame({
        'Nulls': null_percent,
        'Uniques': unique_counts,
        'Dtypes': column_dtypes,
    })
    return summary

# Show the null percentage, unique count and dtype of every raw column.
get_data_info(data)

Unnamed: 0,Nulls,Uniques,Dtypes
status,10.58,159,object
private pool,98.89,1,object
propertyType,9.21,1280,object
street,0.0,337076,object
baths,28.19,229,object
homeFacts,0.0,321009,object
fireplace,72.66,1652,object
city,0.01,2026,object
schools,0.0,297365,object
sqft,10.76,25405,object


In [4]:
# Combine the two MLS id columns: missing values become '' so that the
# strings can be concatenated into a single 'MlsId' value.
data['MlsId'] = data['MlsId'].fillna('') + data['mls-id'].fillna('')
# An empty result means neither column had an id — restore it to NaN.
data['MlsId'] = data['MlsId'].apply(lambda x: np.nan if x == '' else x)

# Drop columns that have fewer non-null values than 70% of the row count.
t_col = data.shape[0] * 0.7
data.dropna(axis=1, thresh=t_col, inplace=True)

# Drop rows that have fewer non-null values than 70% of the remaining column count.
t_row = data.shape[1] * 0.7
data.dropna(axis=0, thresh=t_row, inplace=True)

# NOTE(review): without drop=True the previous row labels are kept as a new
# 'index' column in `data` — confirm this column is wanted downstream.
data.reset_index(inplace=True)

# Re-check the per-column summary after the clean-up.
get_data_info(data)

Unnamed: 0,Nulls,Uniques,Dtypes
index,0.0,360192,int64
status,10.18,158,object
propertyType,5.13,1280,object
street,0.0,321389,object
baths,24.81,229,object
homeFacts,0.0,317304,object
city,0.01,1999,object
schools,0.0,286152,object
sqft,6.62,25396,object
zipcode,0.0,4499,object


## Обработка признаков

### Целевой признак 

In [5]:
# Encode missing prices as '$0' so that every value can be parsed as a string.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: with pandas Copy-on-Write the
# chained in-place call no longer updates `data`, and the accompanying warning
# would be hidden by the notebook-wide warnings filter.
data['target'] = data['target'].fillna('$0')

# Turn price strings such as '$418,000' into integers, step by step:
#   1. strip a leading '$';
#   2. keep only the part before the first '/';
#   3. strip a trailing '+';
#   4. remove thousands separators;
#   5. map the price range '1215 - $1437', which is not a single number, to 0.
# startswith/endswith are used instead of x[0]/x[-1] so that an empty string
# does not raise IndexError in steps 1 and 3.
y = (data['target']
     .apply(lambda x: x[1:] if x.startswith('$') else x)
     .apply(lambda x: x.split('/')[0])
     .apply(lambda x: x[:-1] if x.endswith('+') else x)
     .apply(lambda x: x.replace(',', ''))
     .apply(lambda x: '0' if x == '1215 - $1437' else x)
     .astype(int))

y

0          418000
1          310000
2         2895000
3         2395000
4            5000
           ...   
360187     799000
360188    1249000
360189     674999
360190     528000
360191     204900
Name: target, Length: 360192, dtype: int32

In [6]:
# Start an empty feature table that shares the row index of `data`;
# the engineered features are added to it column by column below.
better_data = pd.DataFrame(index=data.index)

better_data

0
1
2
3
4
...
360187
360188
360189
360190
360191


### Признак статуса продажи

In [7]:
# Fill missing statuses with 'Unknown' and lower-case every label.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: with pandas Copy-on-Write the
# chained in-place call no longer updates `data`.
data['status'] = data['status'].fillna('Unknown').str.lower()

# The 50 most frequent status labels, to decide which keywords to encode.
data['status'].value_counts().head(50)

status
for sale                             199477
active                                92092
unknown                               36678
foreclosure                            6578
new construction                       5465
pending                                4791
pre-foreclosure                        2067
p                                      1488
pre-foreclosure / auction              1462
under contract show                    1183
 / auction                              929
under contract   showing                793
active under contract                   718
under contract                          692
new                                     688
contingent                              579
price change                            558
auction                                 526
a active                                443
for rent                                398
foreclosed                              380
under contract backups                  252
contingent finance and in

In [8]:
# Keywords looked up as substrings of the lower-cased status label;
# each keyword becomes its own 0/1 column 'status <keyword>'.
statuses = ['for sale', 'coming soon', 'new', 
            'foreclosure', 'pre-foreclosure', 
            'auction', 'active', 'show', 
            'for rent', 'sold', 'contract', 
            'contingent', 'back', 'unknown']

status_text = data['status']

# 'pending' is flagged when the label contains the word, is exactly 'p',
# or is a two-character label containing 'p'.
is_pending = (status_text.str.contains('pending', regex=False)
              | (status_text == 'p')
              | ((status_text.str.len() == 2)
                 & status_text.str.contains('p', regex=False)))
better_data['status pending'] = is_pending.astype('uint8')

# regex=False keeps this a plain substring test, same as `keyword in label`.
for status in statuses:
    better_data['status ' + status] = (status_text
                                       .str.contains(status, regex=False)
                                       .astype('uint8'))

# 'other' marks rows that matched none of the flags above. The sum is taken
# over the status flag columns only (not over the whole of `better_data`), so
# re-running this cell, or running it after more features have been added,
# gives the same result.
status_flag_columns = ['status pending'] + ['status ' + status for status in statuses]
better_data['status other'] = (better_data[status_flag_columns].sum(axis=1)
                               .eq(0)
                               .astype('uint8'))

# Number of rows carrying each status flag.
better_data[status_flag_columns + ['status other']].sum()

status pending              6878
status for sale           199477
status coming soon           110
status new                  6153
status foreclosure         10109
status pre-foreclosure      3529
status auction              2954
status active              93439
status show                 2329
status for rent              412
status sold                  240
status contract             3802
status contingent           1023
status back                  512
status unknown             36678
status other                1059
dtype: int64

### Признак типа недвижимости

In [9]:
# Fill missing property types with 'Unknown' and lower-case every label.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: with pandas Copy-on-Write the
# chained in-place call no longer updates `data`.
data['propertyType'] = data['propertyType'].fillna('Unknown').str.lower()

# The 50 most frequent property type labels, to decide which keywords to encode.
data['propertyType'].value_counts().head(50)

propertyType
single-family home                     92033
single family                          62585
condo                                  42470
single family home                     31728
lot/land                               20513
unknown                                18493
townhouse                              18396
land                                   10933
multi-family                            7912
condo/townhome/row home/co-op           7701
traditional                             5913
coop                                    3216
multi family                            2776
high rise                               1823
ranch                                   1781
mobile/manufactured                     1618
detached, one story                     1614
single detached, traditional            1581
contemporary                            1557
multi-family home                       1501
1 story                                 1235
colonial                                12

In [10]:
# Keywords looked up as substrings of the lower-cased property type label;
# each keyword becomes its own 0/1 column 'property type <keyword>'.
proptypes = ['single', 'family', 'condo', 
             'land', 'town', 'multi', 
             'coop', 'traditional', 'ranch', 
             'mobile', 'detached', 'contemporary', 
             'modern', 'apartment', 'story', 
             'stories', 'mediterranean', 'transitional']

# regex=False keeps this a plain substring test, same as `keyword in label`.
for prop in proptypes:
    better_data['property type ' + prop] = (data['propertyType']
                                            .str.contains(prop, regex=False)
                                            .astype('uint8'))

# Labels that carry no information: the fill value 'unknown' or a single space.
better_data['property type unknown'] = (data['propertyType']
                                        .isin(['unknown', ' '])
                                        .astype('uint8'))

# 'other' marks rows that matched no keyword and are not 'unknown'. The
# columns are selected by name rather than by a positional offset into
# `better_data`, so the result does not depend on how many columns precede
# them and the cell can be re-run safely.
property_flag_columns = (['property type ' + prop for prop in proptypes]
                         + ['property type unknown'])
better_data['property type other'] = (better_data[property_flag_columns].sum(axis=1)
                                      .eq(0)
                                      .astype('uint8'))

# Column totals of the feature table built so far.
better_data.sum()

status pending                   6878
status for sale                199477
status coming soon                110
status new                       6153
status foreclosure              10109
status pre-foreclosure           3529
status auction                   2954
status active                   93439
status show                      2329
status for rent                   412
status sold                       240
status contract                  3802
status contingent                1023
status back                       512
status unknown                  36678
status other                     1059
property type single           189952
property type family           198719
property type condo             51422
property type land              31458
property type town              27104
property type multi             12485
property type coop               3751
property type traditional        9679
property type ranch              3193
property type mobile             3491
property typ

### Признак количества ванных

In [11]:
# Mark missing bath counts with the placeholder 'No data'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: with pandas Copy-on-Write the
# chained in-place call no longer updates `data`.
data['baths'] = data['baths'].fillna('No data')

# The 50 most frequent raw values, to see which formats need parsing.
data['baths'].value_counts().head(50)

baths
No data         89357
2 Baths         52466
3 Baths         35506
2               20452
2.0             16576
4 Baths         14764
3.0             10869
3               10113
Bathrooms: 2     9538
2.5              8113
Bathrooms: 3     6613
1                6583
1.0              5930
5 Baths          5370
4.0              4593
2.5 Baths        4486
0                3811
4                3567
3.5              3455
Bathrooms: 1     3234
2 ba             2686
6 Baths          2604
Bathrooms: 4     2384
1,500            2208
3 ba             1893
1.5              1857
3.5 Baths        1821
2,000            1792
1,750            1721
3,000            1686
5.0              1557
2,250            1440
1,000            1279
5                1224
7 Baths          1210
1,250            1206
4.5              1016
2,750             965
2,500             894
Bathrooms: 5      823
~                 782
1 ba              756
6.0               737
4 ba              735
3,500             653
8 Ba

In [12]:
def get_baths(x):
    """Extract the whole number of bathrooms from a raw text value.

    Handled formats (integer part only is kept):
      * '3 Baths', '2 ba', '2', '2.5'  -> first token;
      * 'Bathrooms: 2'                 -> token after the 'Bathrooms:' label;
      * '1,500'                        -> digits before the comma.

    Parameters
    ----------
    x : str
        Raw bath description.

    Returns
    -------
    str
        The integer part as a string of decimal digits, or '-1' when no
        number can be extracted (e.g. 'No data', '~', blank input).
    """
    tokens = x.split()
    if not tokens:
        # Blank or whitespace-only value.
        return '-1'

    if tokens[0] == 'Bathrooms:':
        if len(tokens) < 2:
            return '-1'
        number = tokens[1]
    else:
        number = tokens[0]

    # Keep only the part before a ',' or '.' separator. Previously the results
    # of the 'Bathrooms:' and ',' branches were overwritten by an unconditional
    # assignment, so those formats were always reported as -1.
    whole_part = number.split(',')[0].split('.')[0]

    # isdecimal() accepts only characters that int() can parse.
    return whole_part if whole_part.isdecimal() else '-1'


# Parse every raw bath description and store the result as an integer
# feature; get_baths returns '-1' for values it cannot parse.
baths_parsed = data['baths'].map(get_baths)
better_data['baths'] = baths_parsed.astype(int)

better_data['baths']

0         3
1         3
2         2
3         8
4        -1
         ..
360187    3
360188    6
360189    3
360190    3
360191    2
Name: baths, Length: 360192, dtype: int32

In [13]:
# Mark missing bedroom counts with the placeholder 'No data'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: with pandas Copy-on-Write the
# chained in-place call no longer updates `data`.
data['beds'] = data['beds'].fillna('No data')

# The 50 most frequent raw values, to see which formats need parsing.
data['beds'].value_counts().head(50)

beds
No data    77690
3 Beds     53458
4 Beds     35417
3          31399
2 Beds     26360
4          20022
2          16107
Baths      15270
3 bd       12832
5 Beds     11270
4 bd        8249
3.0         8088
5           6424
4.0         5231
2 bd        5226
1           4609
6 Beds      3810
5 bd        2633
2.0         2430
6           1806
5.0         1372
0           1207
7 Beds      1100
1 bd        1080
-- bd        905
8 Beds       759
Bath         738
6 bd         653
7            511
1.0          429
9 Beds       347
             327
8            321
6.0          311
0.0          223
10 Beds      176
7 bd         147
12 Beds      130
8 bd          96
9             95
11 Beds       85
7.0           67
10            66
64 Beds       44
16 Beds       41
8.0           41
11            37
9 bd          36
12            34
13 Beds       30
Name: count, dtype: int64

In [15]:
def get_beds(x):
    """Extract the whole number of bedrooms from a raw text value.

    The first whitespace-separated token is taken and cut at the first '.',
    so '3 Beds', '3 bd', '3' and '3.0' all yield '3'.

    Parameters
    ----------
    x : str
        Raw bedroom description.

    Returns
    -------
    str
        The integer part as a string of decimal digits, or '-1' when the
        first token is not a number (e.g. 'Baths', '-- bd', 'No data') or
        the value is blank.
    """
    # A local variable is used for the parsed token; the function no longer
    # writes to a module-level name as a side effect.
    tokens = x.split()
    if not tokens:
        return '-1'

    whole_part = tokens[0].split('.')[0]

    # isdecimal() accepts only characters that int() can parse, unlike
    # isnumeric(), which also accepts e.g. vulgar fractions.
    return whole_part if whole_part.isdecimal() else '-1'

# Parse every raw bedroom description and store the result as an integer
# feature; get_beds returns '-1' for values it cannot parse.
beds_parsed = data['beds'].map(get_beds)
better_data['beds'] = beds_parsed.astype(int)

better_data['beds']

0         4
1         3
2         3
3         5
4        -1
         ..
360187    2
360188    5
360189    3
360190    3
360191    3
Name: beds, Length: 360192, dtype: int32

In [19]:
# Display the feature table assembled so far.
better_data

Unnamed: 0,status pending,status for sale,status coming soon,status new,status foreclosure,status pre-foreclosure,status auction,status active,status show,status for rent,...,property type modern,property type apartment,property type story,property type stories,property type mediterranean,property type transitional,property type unknown,property type other,baths,beds
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,3,4
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,3
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,3
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,8,5
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360187,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,2
360188,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6,5
360189,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,3
360190,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,3
