# <center> Модель прогнозирования стоимости жилья для агентства недвижимости

Импорт библиотек

In [1]:
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LinearRegression, SGDRegressor
warnings.filterwarnings('ignore')

plt.style.use('bmh')

## Данные

Читаем данные

In [2]:
# Load the raw listings; the path is relative to the notebook's working directory.
data = pd.read_csv('data/data.csv')

# Preview the first rows and report the size of the raw table.
display(data.head())
print(f'{data.shape[0]} rows, {data.shape[1]} columns')

Unnamed: 0,status,private pool,propertyType,street,baths,homeFacts,fireplace,city,schools,sqft,zipcode,beds,state,stories,mls-id,PrivatePool,MlsId,target
0,Active,,Single Family Home,240 Heather Ln,3.5,"{'atAGlanceFacts': [{'factValue': '2019', 'fac...",Gas Logs,Southern Pines,"[{'rating': ['4', '4', '7', 'NR', '4', '7', 'N...",2900,28387,4,NC,,,,611019,"$418,000"
1,for sale,,single-family home,12911 E Heroy Ave,3 Baths,"{'atAGlanceFacts': [{'factValue': '2019', 'fac...",,Spokane Valley,"[{'rating': ['4/10', 'None/10', '4/10'], 'data...","1,947 sqft",99216,3 Beds,WA,2.0,,,201916904,"$310,000"
2,for sale,,single-family home,2005 Westridge Rd,2 Baths,"{'atAGlanceFacts': [{'factValue': '1961', 'fac...",yes,Los Angeles,"[{'rating': ['8/10', '4/10', '8/10'], 'data': ...","3,000 sqft",90049,3 Beds,CA,1.0,,yes,FR19221027,"$2,895,000"
3,for sale,,single-family home,4311 Livingston Ave,8 Baths,"{'atAGlanceFacts': [{'factValue': '2006', 'fac...",yes,Dallas,"[{'rating': ['9/10', '9/10', '10/10', '9/10'],...","6,457 sqft",75205,5 Beds,TX,3.0,,,14191809,"$2,395,000"
4,for sale,,lot/land,1524 Kiscoe St,,"{'atAGlanceFacts': [{'factValue': '', 'factLab...",,Palm Bay,"[{'rating': ['4/10', '5/10', '5/10'], 'data': ...",,32908,,FL,,,,861745,"$5,000"


377185 rows, 18 columns


Основная информация о данных

In [3]:
def get_data_info(data):
    """Summarise every column of a DataFrame.

    Args:
        data: DataFrame to describe.

    Returns:
        DataFrame indexed by column name with three columns:
        'Nulls' (percentage of missing values, rounded to 2 decimals),
        'Uniques' (number of distinct values, NaN counted as a value)
        and 'Dtypes' (column dtype).
    """
    n_rows = data.shape[0]
    missing_counts = data.isna().sum()
    null_percent = missing_counts.mul(100).div(n_rows).round(2)

    summary = {
        'Nulls': null_percent,
        'Uniques': data.nunique(dropna=False),
        'Dtypes': data.dtypes,
    }

    return pd.DataFrame(summary)

# Show the share of missing values, unique counts and dtypes of the raw columns.
get_data_info(data)

Unnamed: 0,Nulls,Uniques,Dtypes
status,10.58,160,object
private pool,98.89,2,object
propertyType,9.21,1281,object
street,0.0,337077,object
baths,28.19,230,object
homeFacts,0.0,321009,object
fireplace,72.66,1653,object
city,0.01,2027,object
schools,0.0,297365,object
sqft,10.76,25406,object


In [4]:
# Merge the two MLS id columns: concatenate their text and treat an empty
# result (both ids missing) as a missing value.
merged_ids = data['MlsId'].fillna('') + data['mls-id'].fillna('')
data['MlsId'] = merged_ids.where(merged_ids != '', np.nan)

# Keep only the columns that have non-missing values in at least 70% of the rows.
t_col = data.shape[0] * 0.7
data = data.dropna(axis=1, thresh=t_col)

# Then keep only the rows that have at least 70% of the remaining columns
# filled, and renumber the rows from zero.
t_row = data.shape[1] * 0.7
data = data.dropna(axis=0, thresh=t_row).reset_index(drop=True)

get_data_info(data)

Unnamed: 0,Nulls,Uniques,Dtypes
status,10.18,159,object
propertyType,5.13,1281,object
street,0.0,321390,object
baths,24.81,230,object
homeFacts,0.0,317304,object
city,0.01,2000,object
schools,0.0,286152,object
sqft,6.62,25397,object
zipcode,0.0,4499,object
beds,21.57,193,object


## Обработка признаков

### Цена недвижимости (целевой признак) 

In [5]:
# Look at the first 50 raw price strings to see which formats occur.
data['target'][:50]

0       $418,000
1       $310,000
2     $2,895,000
3     $2,395,000
4         $5,000
5       $209,000
6        181,500
7       $244,900
8       $311,995
9       $669,000
10       260,000
11      $525,000
12      $499,900
13      $168,800
14     1,650,000
15       335,000
16     2,650,000
17      $365,000
18      $626,000
19      $375,000
20    $3,500,000
21       579,000
22      $499,007
23      $182,000
24    $3,749,000
25       799,000
26      $499,900
27      $559,000
28      $830,000
29    $1,195,000
30      $262,000
31      $204,800
32      $179,000
33      $105,000
34      $260,000
35      $284,900
36      $495,000
37      $117,900
38      $385,000
39    $1,100,000
40      $620,000
41      $125,000
42      $499,000
43    $1,429,000
44     $233,990+
45      $275,000
46       $27,000
47      $598,000
48    $1,780,000
49      $490,000
Name: target, dtype: object

In [6]:
def _parse_price(raw):
    """Convert one raw price string (e.g. '$233,990+' or '1,650,000') to an int.

    A leading '$', everything after the first '/', a trailing '+' and the
    thousands separators are removed. Anything that is still not a plain
    integer afterwards (a price range such as '1215 - $1437', an empty
    string, ...) becomes 0, the same placeholder used for missing prices.
    """
    text = raw[1:] if raw.startswith('$') else raw
    text = text.split('/')[0]
    if text.endswith('+'):
        text = text[:-1]
    text = text.replace(',', '')

    try:
        return int(text)
    except ValueError:
        return 0


# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is not guaranteed to update `data`
# (it is a no-op under pandas Copy-on-Write).
data['target'] = data['target'].fillna('$0')

y = data['target'].apply(_parse_price).astype(int)

y

0          418000
1          310000
2         2895000
3         2395000
4            5000
           ...   
360187     799000
360188    1249000
360189     674999
360190     528000
360191     204900
Name: target, Length: 360192, dtype: int32

In [7]:
# Start an empty feature table that shares the row index of `data`;
# the engineered features are added to it column by column below.
better_data = pd.DataFrame(index=data.index)

better_data

0
1
2
3
4
...
360187
360188
360189
360190
360191


### Статус продажи

In [8]:
def fix_feature(feature,
                filling='Unknown',
                lower=True):
    """Fill missing values of a Series and optionally lower-case its text.

    Args:
        feature: Series to clean.
        filling: replacement for missing values; converted with str().
        lower: when True, every value is converted to text and lower-cased.

    Returns:
        A new Series; the input is not modified.
    """
    filled = feature.fillna(str(filling))

    if lower:
        # Cast to str first so that non-string cells (numbers in a mixed-type
        # object column) are lower-cased as text instead of raising.
        filled = filled.astype(str).str.lower()

    return filled

# Keep a handle on the original homeFacts column BEFORE the loop below
# replaces every column: the untouched text is parsed later in the notebook.
homefacts = data['homeFacts']

# Replace each column with its cleaned version (missing values filled with
# 'Unknown', then everything lower-cased by fix_feature's defaults).
for col in data.columns:
    data[col] = fix_feature(data[col])

In [9]:
def get_populars(data, first=100):
    """Return the most frequent values of a Series, most common first.

    Args:
        data: Series whose values are counted (missing values are counted too).
        first: how many of the top values to return.

    Returns:
        List with at most `first` values.
    """
    frequencies = data.value_counts(dropna=False)
    top = frequencies[:first]
    return top.index.to_list()

# List the (up to 100) most frequent cleaned status values.
get_populars(data['status'])

['for sale',
 'active',
 'unknown',
 'foreclosure',
 'new construction',
 'pending',
 'pre-foreclosure',
 'p',
 'pre-foreclosure / auction',
 'under contract show',
 ' / auction',
 'under contract   showing',
 'active under contract',
 'under contract',
 'new',
 'contingent',
 'price change',
 'auction',
 'a active',
 'for rent',
 'foreclosed',
 'under contract backups',
 'contingent finance and inspection',
 'recently sold',
 'pending continue to show',
 'option pending',
 'back on market',
 'contingent show',
 'pending taking backups',
 'option contract',
 'active with offer',
 'pending with contingencies',
 'active backup',
 'c',
 'contract p',
 'pi',
 'p pending sale',
 'listing extended',
 'auction - active',
 'due diligence period',
 'contract contingent on buyer sale',
 'c continue show',
 'pending - taking backups',
 'offer pending signature',
 'active/contingent',
 'pending inspection',
 'contingent take backup',
 'contingent   show',
 'pending in',
 'active with contingencies

In [10]:
statuses = ['for sale', 'coming soon', 'new',
            'foreclosure', 'pre-foreclosure',
            'auction', 'active', 'show',
            'for rent', 'sold', 'contract',
            'contingent', 'back', 'unknown']

# A listing counts as pending when its status mentions 'pending', is exactly
# 'p', or is a two-character code containing 'p' (such as 'pi').
better_data['status pending'] = (data['status']
                                 .apply(lambda x: 1 if (('pending' in x) or
                                                        (x == 'p') or
                                                        (len(x) == 2 and 'p' in x))
                                                    else 0)
                                 .astype('uint8'))

# One 0/1 flag per keyword: set when the keyword occurs anywhere in the status.
for status in statuses:
    better_data['status '+status] = (data['status']
                                     .apply(lambda x: 1 if status in x else 0)
                                     .astype('uint8'))

# "Other" = none of the status flags above is set. Only the columns created in
# this cell are summed, so re-running the cell after further features have been
# added to better_data still gives the same result.
status_cols = ['status pending'] + ['status '+name for name in statuses]

better_data['status other'] = (better_data[status_cols]
                               .sum(axis=1)
                               .apply(lambda x: 1 if x == 0 else 0)
                               .astype('uint8'))

better_data.sum()

status pending              6878
status for sale           199477
status coming soon           110
status new                  6153
status foreclosure         10109
status pre-foreclosure      3529
status auction              2954
status active              93439
status show                 2329
status for rent              412
status sold                  240
status contract             3802
status contingent           1023
status back                  512
status unknown             36678
status other                1059
dtype: int64

### Тип недвижимости

In [11]:
# List the (up to 100) most frequent cleaned property type values.
get_populars(data['propertyType'])

['single-family home',
 'single family',
 'condo',
 'single family home',
 'lot/land',
 'unknown',
 'townhouse',
 'land',
 'multi-family',
 'condo/townhome/row home/co-op',
 'traditional',
 'coop',
 'multi family',
 'high rise',
 'ranch',
 'mobile/manufactured',
 'detached, one story',
 'single detached, traditional',
 'contemporary',
 'multi-family home',
 '1 story',
 'colonial',
 'mobile / manufactured',
 'contemporary/modern',
 '2 stories',
 'apartment',
 'mfd/mobile home',
 'single detached',
 'detached, two story',
 'one story',
 'transitional',
 'cooperative',
 'florida',
 'bungalow',
 'farms/ranches',
 'two story',
 'garden home',
 '2 story',
 'cape cod',
 '2 stories, traditional',
 'spanish/mediterranean',
 'other style',
 '1 story, traditional',
 'other',
 'condo/townhome, traditional',
 'craftsman',
 'contemporary/modern, traditional',
 'condo/townhome',
 'singlefamilyresidence',
 'condo/townhome, contemporary/modern',
 'single detached, contemporary/modern',
 'multiple occup

In [12]:
proptypes = ['single', 'family', 'condo',
             'land', 'town', 'multi',
             'coop', 'traditional', 'ranch',
             'mobile', 'detached', 'contemporary',
             'modern', 'apartment', 'story',
             'stories', 'mediterranean', 'transitional']

property_text = data['propertyType']

# One 0/1 flag per keyword: set when the keyword occurs anywhere in the text.
for keyword in proptypes:
    keyword_found = property_text.str.contains(keyword, regex=False)
    better_data['property type '+keyword] = keyword_found.astype('uint8')

# Explicit flag for listings whose type is the fill value or a single blank.
is_unknown = property_text.isin(['unknown', ' '])
better_data['property type unknown'] = is_unknown.astype('uint8')


def get_other(data, col_name, return_cols=False):
    """Flag the rows in which none of a group of indicator columns is set.

    The group consists of every column whose name contains `col_name`.

    Args:
        data: DataFrame holding the indicator columns.
        col_name: substring that identifies the columns of the group.
        return_cols: when True, return the list of matching column names
            instead of the flag.

    Returns:
        uint8 Series with 1 where the matching columns sum to zero in a row
        and 0 otherwise, or the list of column names when `return_cols` is set.
    """
    matching = [column for column in data.columns if col_name in column]

    if return_cols:
        return matching

    row_totals = data.loc[:, matching].sum(axis=1)
    return row_totals.eq(0).astype('uint8')

# Flag rows where none of the columns whose name contains 'property' is set.
better_data['property type other'] = get_other(better_data, 'property')


def get_sums(data, name):
    """Return the column sums of every column whose name contains `name`.

    Args:
        data: DataFrame to summarise.
        name: substring that selects the columns.

    Returns:
        Series of sums indexed by the matching column names.
    """
    # Select the columns from the frame that was passed in; the previous
    # version always looked the names up in the global `better_data`.
    matching = [column for column in data.columns if name in column]
    return data.loc[:, matching].sum()

# Count how many listings carry each flag whose column name contains 'property'.
get_sums(better_data, 'property')

property type single           189952
property type family           198719
property type condo             51422
property type land              31458
property type town              27104
property type multi             12485
property type coop               3751
property type traditional        9679
property type ranch              3193
property type mobile             3491
property type detached           5822
property type contemporary       4436
property type modern             2336
property type apartment           910
property type story              5913
property type stories            1745
property type mediterranean       875
property type transitional        780
property type unknown           18617
property type other              8367
dtype: int64

### Количество ванных

In [13]:
# List the (up to 100) most frequent cleaned baths values.
get_populars(data['baths'])

['unknown',
 '2 baths',
 '3 baths',
 '2',
 '2.0',
 '4 baths',
 '3.0',
 '3',
 'bathrooms: 2',
 '2.5',
 'bathrooms: 3',
 '1',
 '1.0',
 '5 baths',
 '4.0',
 '2.5 baths',
 '0',
 '4',
 '3.5',
 'bathrooms: 1',
 '2 ba',
 '6 baths',
 'bathrooms: 4',
 '1,500',
 '3 ba',
 '1.5',
 '3.5 baths',
 '2,000',
 '1,750',
 '3,000',
 '5.0',
 '2,250',
 '1,000',
 '5',
 '7 baths',
 '1,250',
 '4.5',
 '2,750',
 '2,500',
 'bathrooms: 5',
 '~',
 '1 ba',
 '6.0',
 '4 ba',
 '3,500',
 '8 baths',
 '-- baths',
 '6',
 '4.5 baths',
 '7.0',
 'bathrooms: 6',
 '9 baths',
 '5.5',
 '7',
 '4,000',
 '5 ba',
 '2.5+',
 '1.5 baths',
 '750',
 '5,000',
 '8.0',
 '3.5+',
 'sq. ft. ',
 '10 baths',
 '8',
 '4.5+',
 'bathrooms: 7',
 '5.5+',
 '6.5',
 '9.0',
 '6 ba',
 '11 baths',
 '1.75 baths',
 'bathrooms: 8',
 '5.5 baths',
 '12 baths',
 '9',
 '0 / 0',
 '6.5+',
 '7 ba',
 '1.5+',
 '10.0',
 '10',
 '2.1 baths',
 '2.5 ba',
 '13 baths',
 '11',
 '8 ba',
 '2.75 baths',
 'bathrooms: 9',
 '16 baths',
 '7.5',
 '7.5+',
 '11.0',
 '14 baths',
 'bathrooms

In [14]:
def get_baths(x):
    """Extract the whole number of bathrooms from a raw baths string.

    Handled formats: '3 baths', '2.5', '2.5+', 'bathrooms: 2' and
    comma-formatted values such as '1,500' (the part before the comma is kept).

    Args:
        x: raw baths text.

    Returns:
        The number as a string of digits, or '-1' when none can be extracted.
    """
    tokens = x.split()
    if not tokens:
        return '-1'

    if tokens[0].lower() == 'bathrooms:':
        # 'bathrooms: 2' -> the number is the token after the label.
        if len(tokens) < 2:
            return '-1'
        candidate = tokens[1]
    else:
        candidate = tokens[0]

    # Keep the integer part only: text before a comma, then before a dot.
    # These branches used to be overwritten by an unconditional assignment,
    # so 'bathrooms: N' and 'N,NNN' values always ended up as '-1'.
    whole = candidate.split(',')[0].split('.')[0]

    # isdecimal() accepts exactly the digits that int() can parse.
    return whole if whole.isdecimal() else '-1'


# Parse every cleaned baths string with get_baths and store the result as integers.
better_data['baths'] = data['baths'].apply(get_baths).astype(int)

better_data['baths']

0         3
1         3
2         2
3         8
4        -1
         ..
360187    3
360188    6
360189    3
360190    3
360191    2
Name: baths, Length: 360192, dtype: int32

### Количество спален

In [15]:
# List the (up to 100) most frequent cleaned beds values.
get_populars(data['beds'])

['unknown',
 '3 beds',
 '4 beds',
 '3',
 '2 beds',
 '4',
 '2',
 'baths',
 '3 bd',
 '5 beds',
 '4 bd',
 '3.0',
 '5',
 '4.0',
 '2 bd',
 '1',
 '6 beds',
 '5 bd',
 '2.0',
 '6',
 '5.0',
 '0',
 '7 beds',
 '1 bd',
 '-- bd',
 '8 beds',
 'bath',
 '6 bd',
 '7',
 '1.0',
 '9 beds',
 ' ',
 '8',
 '6.0',
 '0.0',
 '10 beds',
 '7 bd',
 '12 beds',
 '8 bd',
 '9',
 '11 beds',
 '7.0',
 '10',
 '64 beds',
 '16 beds',
 '8.0',
 '11',
 '9 bd',
 '12',
 '13 beds',
 '14 beds',
 '18 beds',
 '10 bd',
 '15 beds',
 '16',
 '12 bd',
 '24 beds',
 '14',
 '13',
 '20 beds',
 '10.0',
 '24',
 '11 bd',
 '15',
 '20',
 '9.0',
 '17 beds',
 '18',
 '32 beds',
 '28 beds',
 '16 bd',
 '22 beds',
 '26 beds',
 '24 bd',
 '13 bd',
 '34 beds',
 '21 beds',
 '19 beds',
 '14 bd',
 '17',
 '28',
 '8,276 sqft',
 '36 beds',
 '11.0',
 '22',
 '25 beds',
 '23 beds',
 '32',
 '5,227 sqft',
 '16.0',
 '42 beds',
 '18 bd',
 '15.0',
 '29',
 '12.0',
 '44 beds',
 '47 beds',
 '40 beds',
 '34',
 '40']

In [16]:
def get_beds(x):
    """Extract the whole number of bedrooms from a raw beds string.

    Only the first whitespace-separated token is inspected, and only its part
    before a dot is kept ('3 beds' -> '3', '4.0' -> '4').

    Args:
        x: raw beds text.

    Returns:
        The number as a string of digits, or '-1' when the first token is not
        a plain number (e.g. 'baths', '-- bd', '8,276 sqft', blank text).
    """
    tokens = x.split()

    if not tokens:
        return '-1'

    # A local variable replaces the former `global q`, which leaked parser
    # state into the notebook namespace.
    whole = tokens[0].split('.')[0]

    # isdecimal() accepts exactly the digits that int() can parse.
    return whole if whole.isdecimal() else '-1'

# Parse every cleaned beds string with get_beds and store the result as integers.
better_data['beds'] = data['beds'].apply(get_beds).astype(int)

better_data['beds'] 

0         4
1         3
2         3
3         5
4        -1
         ..
360187    2
360188    5
360189    3
360190    3
360191    3
Name: beds, Length: 360192, dtype: int32

### Площадь недвижимости

In [17]:
# List the last 100 entries of the frequency table, i.e. the rarest sqft values.
data['sqft'].value_counts(dropna=False)[-100:].index.tolist()

['total interior livable area: 4,881 sqft',
 '1,916,640 sqft',
 'total interior livable area: 3,585 sqft',
 'total interior livable area: 5,012 sqft',
 '12,426 sqft',
 '60,692 sqft',
 '6,871 sqft',
 'total interior livable area: 5,793 sqft',
 '5029',
 '7,689 sqft',
 '155,155 sqft',
 '6,072 sqft',
 '5568',
 '9,093',
 '7,417',
 '13,461 sqft',
 '52,708',
 '17,181 sqft',
 '4782',
 '9,807 sqft',
 '5,472 sqft',
 'total interior livable area: 6,784 sqft',
 'total interior livable area: 6,842 sqft',
 '163,080 sqft',
 '6650',
 '31,927 sqft',
 '8262',
 '6,009 sqft',
 '7,471',
 '6435',
 '10,544 sqft',
 '5093',
 '7,750 sqft',
 '7,254',
 '6,792 sqft',
 '47,272 sqft',
 '107,988 sqft',
 '10,268 sqft',
 '6,112 sqft',
 '256,750 sqft',
 'total interior livable area: 3,900 sqft',
 'total interior livable area: 7,328 sqft',
 'total interior livable area: 3,551 sqft',
 '12,031 sqft',
 '7240',
 'total interior livable area: 5,409 sqft',
 '9,722',
 '5,938',
 '5724',
 '7,131',
 '4433',
 '21,360',
 '11,392',
 

In [18]:
def get_sqft_area(x):
    """Extract the living area in square feet from a raw sqft string.

    Handled formats: '1,947 sqft', '2900' and
    'total interior livable area: 4,881 sqft'.

    Args:
        x: raw sqft text.

    Returns:
        The area as a string of digits, or '-1' when no whole number is found.
    """
    tokens = x.replace(',', '').split()

    if not tokens:
        return '-1'

    if tokens[0] == 'total':
        # 'total interior livable area: N sqft' -> N is the second-to-last token.
        candidate = tokens[-2] if len(tokens) >= 2 else ''
    else:
        candidate = tokens[0]

    # Validate both branches, so placeholders inside the long form are
    # reported as '-1' instead of breaking the float conversion later.
    return candidate if candidate.isdecimal() else '-1'
    
# Parse the area and express positive values in thousands of square feet;
# the -1 "unknown" marker is left unchanged.
sqft_parsed = data['sqft'].apply(get_sqft_area).astype(float)
better_data['sqft area'] = sqft_parsed.mask(sqft_parsed > 0, sqft_parsed / 1000)

better_data['sqft area']

0         2.900
1         1.947
2         3.000
3         6.457
4        -1.000
          ...  
360187    1.417
360188    4.017
360189    2.000
360190    1.152
360191    1.462
Name: sqft area, Length: 360192, dtype: float64

### Факты о недвижимости

In [19]:
# Look at the first 50 raw homeFacts strings (dict-like text with an 'atAGlanceFacts' list).
homefacts[:50] 

0     {'atAGlanceFacts': [{'factValue': '2019', 'fac...
1     {'atAGlanceFacts': [{'factValue': '2019', 'fac...
2     {'atAGlanceFacts': [{'factValue': '1961', 'fac...
3     {'atAGlanceFacts': [{'factValue': '2006', 'fac...
4     {'atAGlanceFacts': [{'factValue': '', 'factLab...
5     {'atAGlanceFacts': [{'factValue': '1920', 'fac...
6     {'atAGlanceFacts': [{'factValue': '2006', 'fac...
7     {'atAGlanceFacts': [{'factValue': '1970', 'fac...
8     {'atAGlanceFacts': [{'factValue': '2019', 'fac...
9     {'atAGlanceFacts': [{'factValue': '1965', 'fac...
10    {'atAGlanceFacts': [{'factValue': '2015', 'fac...
11    {'atAGlanceFacts': [{'factValue': '1996', 'fac...
12    {'atAGlanceFacts': [{'factValue': '2019', 'fac...
13    {'atAGlanceFacts': [{'factValue': '1982', 'fac...
14    {'atAGlanceFacts': [{'factValue': '1905', 'fac...
15    {'atAGlanceFacts': [{'factValue': '2008', 'fac...
16    {'atAGlanceFacts': [{'factValue': '1899', 'fac...
17    {'atAGlanceFacts': [{'factValue': '2016', 

In [20]:
import json

# Rewrite the dict-like homeFacts text so that json.loads can parse it.
# The replacements are applied strictly in the order written and depend on
# each other:
#   1. every single quote becomes a double quote;
#   2. empty strings and the bare word None become the "No Data" placeholder
#      (the doubled quotes this can produce are collapsed right after);
#   3. the remaining replacements patch specific values damaged by the steps
#      above: abbreviations and possessives whose apostrophe was turned into a
#      quote (Attch"d, Req"d, Addtn"l, "s), a quoted "closet", and 'None'
#      occurring inside comma-separated text values.
# NOTE(review): the patch list looks tailored to this particular dataset -
# confirm it still covers every record if the data file changes.
homefacts = (homefacts.apply(lambda x: x
                             .replace("'", '"')
                             .replace('""', '"No Data"')
                             .replace('None', '"No Data"')
                             .replace('""No Data""', '"No Data"')
                             .replace('Attch"d', 'Attached')
                             .replace(', "No Data"', ', None')
                             .replace('Req"d', 'Required')
                             .replace('"s', "'s")
                             .replace('"No Data", O', 'None, O')
                             .replace('"closet"', 'Closet')
                             .replace('Addtn"l', 'Additional')
                             .replace(" \'st", ' "st')
                             .replace('"No Data", C', 'None, C')
                             .replace('"No Data", D', 'None, D')))


# Turn every homeFacts JSON string into one flat record.
hf_list = []

for row_label in data.index:
    facts = json.loads(homefacts[row_label])['atAGlanceFacts']

    # For each fact the second stored value becomes the key and the first one
    # the value (presumably factLabel -> factValue; verify against the data).
    fact_values = (list(fact.values()) for fact in facts)
    hf_list.append({values[1]: values[0] for values in fact_values})

# One row per listing, one column per distinct key.
hf_df = pd.DataFrame(hf_list)

# Lower-case every fact column (missing cells are filled with 'None' first)
# and turn the 'no data' placeholder into a real missing value.
for column in hf_df.columns:
    cleaned = fix_feature(hf_df[column], filling='None')
    hf_df[column] = cleaned.where(cleaned != 'no data', np.nan)


display(hf_df)
display(get_data_info(hf_df))

Unnamed: 0,Year built,Remodeled year,Heating,Cooling,Parking,lotsize,Price/sqft
0,2019,,"central a/c, heat pump",,,,$144
1,2019,,,,,5828 sqft,$159/sqft
2,1961,1967,forced air,central,attached garage,"8,626 sqft",$965/sqft
3,2006,2006,forced air,central,detached garage,"8,220 sqft",$371/sqft
4,,,,,,"10,019 sqft",
...,...,...,...,...,...,...,...
360187,2010,,forced air,central,1 space,,$564
360188,1990,1990,other,central,2 spaces,"8,500 sqft",$311
360189,1924,,radiant,,,,$337/sqft
360190,1950,1950,other,,2,"1,600 sqft",$458/sqft


Unnamed: 0,Nulls,Uniques,Dtypes
Year built,13.45,229,object
Remodeled year,58.47,152,object
Heating,28.18,1913,object
Cooling,34.83,1432,object
Parking,48.39,3319,object
lotsize,17.62,36184,object
Price/sqft,13.54,6500,object


### Возраст недвижимости

In [21]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is not guaranteed to update hf_df
# (it is a no-op under pandas Copy-on-Write). '-1' marks an unknown year.
hf_df['Year built'] = hf_df['Year built'].fillna('-1').astype(float)

year_built = hf_df['Year built']

# Age relative to the hard-coded reference year 2023; listings with an unknown
# construction year keep the -1 marker instead of receiving an age.
better_data['property age'] = ((2023 - year_built)
                               .where(year_built != -1, -1)
                               .astype(int))

better_data['property age']

0          4
1          4
2         62
3         17
4         -1
          ..
360187    13
360188    33
360189    99
360190    73
360191     4
Name: property age, Length: 360192, dtype: int32

### Год реконструкции

In [22]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column (the chained form is a no-op under pandas Copy-on-Write).
# A missing remodel year is replaced by the reference year 2023.
hf_df['Remodeled year'] = hf_df['Remodeled year'].fillna('2023').astype(int)

# 1 when the remodel year lies before 2023, 0 otherwise. Comparing directly
# keeps the flag strictly 0/1: previously a year after 2023 produced a negative
# difference that was passed unchanged into the uint8 cast.
better_data['property remodeled'] = ((2023 - hf_df['Remodeled year'])
                                     .gt(0)
                                     .astype('uint8'))

better_data['property remodeled']

0         0
1         0
2         1
3         1
4         0
         ..
360187    0
360188    1
360189    0
360190    1
360191    0
Name: property remodeled, Length: 360192, dtype: uint8

### Отопление

In [23]:
# List the (up to 100) most frequent heating descriptions.
get_populars(hf_df['Heating'])

['forced air',
 nan,
 'other',
 'electric',
 'gas',
 'heat pump',
 'central air',
 'central electric',
 'central',
 'central, electric',
 'baseboard',
 'wall',
 'electric heat',
 'heating system',
 'forced air, heat pump',
 'radiant',
 'central air, ceiling fan(s)',
 'natural gas heat',
 'central furnace',
 'forced air, gas',
 'central electric, zoned',
 'forced air heating',
 ', gas hot air/furnace',
 'gas heat',
 'forced air, natural gas',
 'central heating',
 'central gas',
 'central electric, heat pump',
 ', heat pump - heat',
 'central air conditioning',
 'electric, gas',
 'forced air, other',
 'central, gas',
 'refrigeration',
 'central, electric, heat pump',
 'refrigeration, ceiling fan(s)',
 'oil',
 'central, heat pump',
 'forced air, stove',
 'central air, central heat',
 'natural gas',
 'heat pump(s)',
 ', gas hot air/furnace, multizone heat',
 'central, gas, two or more units',
 'electric, heat pump',
 'baseboard, forced air',
 'central, electric, two or more units',
 'force

In [24]:
heatings = ['forced', 'air', 'none',
            'electric', 'gas', 'heat pump',
            'central', 'baseboard', 'wall',
            'system', 'radiant', 'stove',
            'zoned', 'refrigeration', 'oil']

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is not guaranteed to update hf_df
# (it is a no-op under pandas Copy-on-Write), which would leave NaN values
# that break the substring test below.
hf_df['Heating'] = hf_df['Heating'].fillna('none')

# One 0/1 flag per keyword: set when the keyword occurs anywhere in the text.
for label in heatings:
    better_data['heating '+label] = (hf_df['Heating']
                                       .apply(lambda x: 1 if label in x else 0)
                                       .astype('uint8'))

# Rows in which none of the heating flags is set.
better_data['heating other'] = get_other(better_data, 'heating')


get_sums(better_data, 'heating')

heating forced           140903
heating air              152932
heating none             101504
heating electric          29190
heating gas               18283
heating heat pump         13424
heating central           34542
heating baseboard          4611
heating wall               4510
heating system             2776
heating radiant            1916
heating stove               552
heating zoned              1557
heating refrigeration       586
heating oil                 363
heating other             31204
dtype: int64

### Охлаждение

In [25]:
# List the (up to 100) most frequent cooling descriptions.
get_populars(hf_df['Cooling'])

['central',
 nan,
 'central air',
 'has cooling',
 'central electric',
 'wall',
 'central gas',
 'central heating',
 'cooling system',
 'central a/c',
 'other',
 'central a/c (electric), central heat (gas)',
 'central a/c (electric), central heat (electric)',
 'refrigeration',
 'central, electric',
 'electric',
 'evaporative',
 'central air, gas hot air/furnace',
 'refrigeration, ceiling fan(s)',
 'central gas, zoned',
 'central, wall',
 'air conditioning-central',
 '2 or more units, central, electric',
 'has heating',
 'central air, gas (hot air)',
 'central electric, zoned',
 'central air, zoned',
 'solar, refrigeration',
 'central, other',
 'ceiling fan, central electric',
 'electric heating',
 'central cooling',
 'heat pump',
 'gas heating',
 'central a/c (electric), central heat (gas), zoned',
 'central electric, heat pump',
 'gas heating, forced air heating',
 'heat pump - ac',
 'central air, gas hot air/furnace, multizone a/c',
 'central electric, central gas',
 'forced air heat

In [26]:
# Cooling descriptions reuse the heating keywords plus a few cooling-specific ones.
coolings = heatings + ['has', 'a/c', 'evaporative',
                       'fan', 'conditioning', 'solar']

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is not guaranteed to update hf_df
# (it is a no-op under pandas Copy-on-Write), which would leave NaN values
# that break the substring test below.
hf_df['Cooling'] = hf_df['Cooling'].fillna('none')

# One 0/1 flag per keyword: set when the keyword occurs anywhere in the text.
for label in coolings:
    better_data['cooling '+label] = (hf_df['Cooling']
                                     .apply(lambda x: 1 if label in x else 0)
                                     .astype('uint8'))

# Rows in which none of the cooling flags is set.
better_data['cooling other'] = get_other(better_data, 'cooling')


get_sums(better_data, 'cooling')

cooling forced              806
cooling air               20840
cooling none             125496
cooling electric          15514
cooling gas               10413
cooling heat pump          2670
cooling central          204383
cooling baseboard           292
cooling wall               5941
cooling system             2794
cooling radiant             166
cooling stove                51
cooling zoned              2523
cooling refrigeration      2764
cooling oil                  16
cooling has               10081
cooling a/c                8274
cooling evaporative        1317
cooling fan                2673
cooling conditioning       1482
cooling solar               691
cooling other              3583
dtype: int64

### Парковка

In [27]:
# List the (up to 100) most frequent parking descriptions.
get_populars(hf_df['Parking'])

[nan,
 'attached garage',
 '2 spaces',
 '1 space',
 'detached garage',
 'carport',
 'off street',
 '3 spaces',
 'carport, attached garage',
 '1',
 '4 spaces',
 '2',
 'on street',
 'attached garage, detached garage',
 '0',
 'attached garage, carport',
 'parking desc',
 '6 spaces',
 'detached garage, attached garage',
 'driveway',
 '5 spaces',
 '4',
 'off street parking',
 'off street, attached garage',
 'parking type',
 '3',
 'carport, detached garage',
 'attached garage, garage - 2 car',
 'garage type',
 'parking yn',
 'off street, on street',
 'driveway, garage door opener',
 'garage - 2 car',
 'garage door opener',
 '888',
 'attached garage, off street',
 'off street, detached garage',
 'attached - front',
 'slab parking spaces',
 'detached garage, carport',
 '8 spaces',
 '6',
 'garage-attached',
 'electric door opener',
 'attached - side',
 'attached',
 'on street, off street',
 '7 spaces',
 'electric door opener, slab parking spaces',
 'carport spaces',
 'garage attached',
 'garage

In [28]:
def get_spaces(x):
    """Estimate the number of parking spaces from a parking description.

    Args:
        x: parking text; the exact text 'none' marks a missing description.

    Returns:
        int: 0 for 'none', the first whole number found in the text
        ('10 spaces' -> 10, 'garage - 2 car' -> 2), or 1 when the text
        describes parking without giving a number.
    """
    if x.split() == ['none']:
        return 0

    # Take the first complete run of digits. The previous version inspected the
    # text character by character, so multi-digit counts were cut to their
    # first digit ('10 spaces' gave 1) and the result was a str, not an int.
    number = re.search(r'\d+', x)

    if number is not None:
        return int(number.group())

    return 1


# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is not guaranteed to update hf_df
# (it is a no-op under pandas Copy-on-Write), and get_spaces needs text input.
hf_df['Parking'] = hf_df['Parking'].fillna('none')

better_data['p. spaces'] = hf_df['Parking'].apply(get_spaces).astype(int)

better_data['p. spaces']

0         0
1         0
2         1
3         1
4         0
         ..
360187    1
360188    2
360189    0
360190    2
360191    0
Name: p. spaces, Length: 360192, dtype: int32

In [29]:
parkings = ['garage', 'attached', 'detached',
            'parking', 'carport', 'off street',
            'on street', 'door opener', 'driveway',
            'slab', 'assigned', 'open', 'none']

parking_text = hf_df['Parking']


def _mentions_spaces(text):
    """Return 1 when the text mentions 'space' or starts with a number, else 0."""
    return 1 if (('space' in text) or (text[0].isnumeric())) else 0


# Temporary flag: it only exists so that descriptions giving a space count are
# not classified as "other" below; it is removed again afterwards.
better_data['parking with spaces'] = parking_text.apply(_mentions_spaces)

# One 0/1 flag per keyword: set when the keyword occurs anywhere in the text.
for keyword in parkings:
    keyword_found = parking_text.str.contains(keyword, regex=False)
    better_data['parking '+keyword] = keyword_found.astype('uint8')

# Rows in which no parking column (including the temporary one) is set.
better_data['parking other'] = get_other(better_data, 'parking')

better_data = better_data.drop(columns='parking with spaces')


get_sums(better_data, 'parking')

parking garage          99320
parking attached        82977
parking detached        17445
parking parking          6119
parking carport         13812
parking off street       9131
parking on street        3381
parking door opener      2899
parking driveway         3308
parking slab              979
parking assigned         1026
parking open             3324
parking none           174348
parking other             872
dtype: int64

### Площадь участка

In [30]:
# List the (up to 100) most frequent lot size values.
get_populars(hf_df['lotsize'])

[nan,
 '—',
 '-- sqft lot',
 '0.26 acres',
 '0.25 acres',
 '0.28 acres',
 '0.27 acres',
 '0.29 acres',
 '0.34 acres',
 '0.31 acres',
 '0.32 acres',
 '6,098 sqft',
 '0.3 acres',
 '0.33 acres',
 '7,405 sqft',
 '0.35 acres',
 '6,534 sqft',
 '4,356 sqft',
 '10,000 sqft',
 '0.46 acres',
 '0.36 acres',
 '5,227 sqft',
 '1 acre',
 '0.37 acres',
 '5,000 sqft',
 '8,712 sqft',
 '3,920 sqft',
 '8,276 sqft',
 '0.38 acres',
 '0.39 acres',
 '0.41 acres',
 '0.5 acres',
 '10019',
 '10,019 sqft',
 '10000',
 '0.42 acres',
 '4,792 sqft',
 '0.43 acres',
 '0.44 acres',
 '0.4 acres',
 '9,583 sqft',
 '4,791 sqft',
 '10,454 sqft',
 '2,500 sqft',
 '0.45 acres',
 '7,840 sqft',
 '3,049 sqft',
 '6,969 sqft',
 '0.48 acres',
 '0.47 acres',
 '7,500 sqft',
 '6,000 sqft',
 '0.30 acres',
 '0.52 acres',
 '2,000 sqft',
 '0.51 acres',
 '6,970 sqft',
 '5,662 sqft',
 '871 sqft',
 '10018 sqft',
 '7405',
 '3,485 sqft',
 '4,000 sqft',
 '0.49 acres',
 '7,841 sqft',
 '0.53 acres',
 '5,663 sqft',
 '10890',
 '7,501 sqft',
 '0.55 ac

In [31]:
# Treat missing values and dash placeholders ('--', '—') as 'none'.
lot_text = hf_df['lotsize'].fillna('--')
is_placeholder = (lot_text.str.contains('--', regex=False) |
                  lot_text.str.contains('—', regex=False))

hf_df['lotsize'] = lot_text.mask(is_placeholder, 'none')


SQFT_PER_ACRE = 43560


def get_lotsize(x):
    """Convert a lot size description to square feet.

    Handled formats: '6,098 sqft', '5,000 sq. ft.', a bare number ('10019'),
    and '0.26 acres' / '1 acre' (converted with SQFT_PER_ACRE).

    Args:
        x: lot size text; 'none' marks a missing value.

    Returns:
        The size in square feet as a float, or -1 when the text is empty,
        'none', does not start with a number, or uses an unknown unit.
    """
    tokens = x.replace(',', '').split()

    if not tokens or tokens[0] == 'none':
        return -1

    try:
        amount = float(tokens[0])
    except ValueError:
        return -1

    units = tokens[1:]

    # A bare number is taken as square feet, exactly like an explicit sqft unit.
    if not units or ('sqft' in units) or ('sq.' in units):
        return amount

    # Accept the singular form too: '1 acre' used to fall through and return
    # None, which turned into NaN in the feature column.
    if ('acres' in units) or ('acre' in units):
        return amount * SQFT_PER_ACRE

    # Unknown unit: report the same "unknown" marker instead of returning None.
    return -1
    
# Parse the lot size and express positive values in thousands of square feet;
# the -1 "unknown" marker is left unchanged.
lot_sqft = hf_df['lotsize'].apply(get_lotsize).astype(float)
better_data['lotsize'] = lot_sqft.mask(lot_sqft > 0, lot_sqft / 1000)


better_data['lotsize']

0         -1.000
1          5.828
2          8.626
3          8.220
4         10.019
           ...  
360187    -1.000
360188     8.500
360189    -1.000
360190     1.600
360191     6.969
Name: lotsize, Length: 360192, dtype: float64

### Почтовый индекс

In [32]:
# Look at the first 50 cleaned zipcode values.
data['zipcode'][:50]

0     28387
1     99216
2     90049
3     75205
4     32908
5     19145
6     34759
7     50401
8     77080
9     11354
10    77068
11    33028
12    97401
13    77084
14    11219
15    33311
16    10027
17    77375
18    33304
19    93552
20    97702
21    33139
22    77386
23    76542
24    20009
25    33183
26    77018
27    33328
28    11238
29    90016
30    33614
31    34952
32    33160
33    32210
34    28202
35    93705
36    33133
37    33610
38    75214
39    92127
40    33332
41    89108
42    91342
43    75709
44    76179
45    85310
46    32164
47    11357
48    33137
49    97221
Name: zipcode, dtype: object

In [33]:
import category_encoders as ce

def _short_zip(code):
    """Keep the first five characters of a zipcode; map the '--' placeholder to -1."""
    prefix = code[:5]
    return -1 if prefix == '--' else prefix


data['zipcode'] = data['zipcode'].apply(_short_zip)

# Binary-encode the zipcode (cast to int first); only this column is encoded.
bin_encoder = ce.BinaryEncoder(cols=['zipcode'])
type_bin = bin_encoder.fit_transform(data['zipcode'].astype(int))

# Append the encoded columns to the feature table.
better_data = pd.concat([better_data, type_bin], axis=1)

get_sums(better_data, 'zipcode')

zipcode_0        169
zipcode_1      43782
zipcode_2     109228
zipcode_3     144152
zipcode_4     158018
zipcode_5     165527
zipcode_6     173669
zipcode_7     175105
zipcode_8     177460
zipcode_9     180316
zipcode_10    181564
zipcode_11    181189
zipcode_12    183522
dtype: int64

In [34]:
# Display the feature table assembled so far.
better_data

Unnamed: 0,status pending,status for sale,status coming soon,status new,status foreclosure,status pre-foreclosure,status auction,status active,status show,status for rent,...,zipcode_3,zipcode_4,zipcode_5,zipcode_6,zipcode_7,zipcode_8,zipcode_9,zipcode_10,zipcode_11,zipcode_12
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360187,0,1,0,0,0,0,0,0,0,0,...,1,1,1,1,0,0,1,0,0,0
360188,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,1,0,0,0,1
360189,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,0,0
360190,0,1,0,0,0,0,0,0,0,0,...,0,1,1,0,1,0,0,0,0,1
