In [1]:
from pymongo import MongoClient

In [2]:
import numpy as np

In [3]:
import pandas as pd

In [4]:
from collections import defaultdict

In [5]:
# Open a MongoDB client using pymongo's default connection settings
# (no host/port/URI is supplied).
mongo = MongoClient()

In [6]:
# Select the `declarator` database; the cells below query its `declarations` collection.
db = mongo.declarator

In [7]:
# Show which collections exist in the database.
# `Database.collection_names()` is deprecated since PyMongo 3.7 and removed in
# PyMongo 4.0; `list_collection_names()` (available since 3.6) is its replacement.
db.list_collection_names()

['declarations']

In [8]:
from flat_price import FlatPricing

In [9]:
# Pricing helper from the project-local `flat_price` module. estimate_item()
# reads its `price_coef`, `median_sqrs` and `get_price(region, year)` members.
flat_pricing = FlatPricing()

In [10]:
# Load into a list every declaration whose person name matches the regex
# "Милонов" and whose document type id equals 1.
milonov = list(
    db.declarations.find(
        {
            'main.person.name': {'$regex': "Милонов"},
            'main.document_type.id': 1,
        }
    )
)

In [11]:
from collections import abc
def nested_dict_iter(nested, path):
    """Walk a nested mapping depth-first, yielding ``(key, value, path)`` triples.

    Every entry is yielded, including entries whose value is itself a mapping;
    such a value is yielded first and then descended into.

    Args:
        nested: mapping to traverse.
        path: list of parent keys leading to ``nested`` (``[]`` for the root).
            The same list object is yielded for all entries of one level.

    Yields:
        Tuples ``(key, value, path)`` where ``path`` lists the ancestor keys.
    """
    for name, node in nested.items():
        yield name, node, path
        if isinstance(node, abc.Mapping):
            # Children get a fresh list so sibling paths are not affected.
            yield from nested_dict_iter(node, path + [name])

In [12]:
def get_possible_regions(person):
    """Yield every non-empty ``region`` value found in a sequence of documents.

    Args:
        person: iterable of (nested) mappings, e.g. all declarations of one person.

    Yields:
        ``(region_value, path)`` for each key named ``'region'`` whose value is
        truthy, at any nesting depth; ``path`` is the list of parent keys.
    """
    for document in person:
        for name, node, trail in nested_dict_iter(document, []):
            if name != 'region' or not node:
                continue
            yield node, trail

In [13]:
def estimate_item(item, possible_regions, year=None):
    """Estimate the price of one real-estate item, averaged over candidate regions.

    Args:
        item: real-estate record. ``item['type']['name']`` is required;
            ``region`` (a mapping with ``name``), ``square`` and ``share`` are
            optional.
        possible_regions: iterable of region names used when the item has no
            region of its own.
        year: pricing year passed to ``flat_pricing.get_price``. When omitted,
            the notebook-level ``year`` variable is used, which is how the
            function behaved before this parameter existed. Prefer passing it
            explicitly so the result does not depend on cell execution order.

    Returns:
        Mean (``np.mean``) of the per-region estimates, each truncated to int.

    Raises:
        NameError: if ``year`` is neither passed nor defined at notebook level.
        KeyError: if the item type is missing from ``flat_pricing.price_coef``.
    """
    if year is None:
        # Backward compatibility: older call sites rely on a global `year`.
        try:
            year = globals()['year']
        except KeyError:
            raise NameError(
                "estimate_item() needs a year: pass `year=` or define a "
                "notebook-level `year` variable"
            ) from None

    own_region = item.get('region')
    if own_region:
        regions = [own_region['name']]
    else:
        regions = possible_regions

    item_type = item['type']['name']
    coef, depend_on_square = flat_pricing.price_coef[item_type]

    # Fallback area for items that do not state their own square.
    mean_square = flat_pricing.median_sqrs.get(item_type, 1)

    if depend_on_square:
        square = item.get('square') or mean_square
    else:
        # Price of this item type is not scaled by area.
        square = 1

    # A missing/zero share is treated as full ownership.
    share = item.get('share') or 1.0

    prices = [
        int(flat_pricing.get_price(region, year) * square * coef * share)
        for region in regions
    ]
    return np.mean(prices)

In [14]:
# Values of own_type['name'] for which an item is skipped by the counting and
# price-estimation helpers in this notebook.
skip_own_types = {
    'В пользовании',               # "in use"
    'Фактическое предоставление',  # "de facto provision"
    'Наём (аренда)',               # "rent (lease)"
    'Служебное жилье',             # "service housing"
    'Безвозмездное пользование',   # "gratuitous use"
}

In [15]:
def estimate_all_real_estates(person, possible_regions):
    """Sum the estimated real-estate prices of one declaration, split by owner.

    Args:
        person: declaration document; reads ``person['main']['year']`` and the
            optional ``person['real_estates']`` list.
        possible_regions: candidate region names for items that carry no region
            of their own; an empty or None value is replaced by ``[None]``.

    Returns:
        dict with ``real_estates_self_price`` (items without a ``relative``)
        and ``real_estates_rel_price`` (items that have one). Items whose
        ownership type is listed in ``skip_own_types`` are ignored.
    """
    # NOTE(review): `year` is looked up (a missing key still raises KeyError)
    # but is not forwarded below, so the pricing year comes from wherever
    # estimate_item() resolves it -- confirm this is intended.
    year = person['main']['year']

    if not possible_regions:
        possible_regions = [None]

    prop = 'real_estates'
    self_key = prop + '_self_price'
    rel_key = prop + '_rel_price'
    res = {self_key: 0, rel_key: 0}

    for item in person.get(prop) or []:
        # `own_type` may be absent *or* explicitly null in the document;
        # `.get('own_type', {})` alone would return None for the latter.
        own_type = item.get('own_type') or {}
        if own_type.get('name') in skip_own_types:
            continue

        price = estimate_item(item, possible_regions)
        res[rel_key if item.get('relative') else self_key] += price

    return res

In [16]:
def count_things(person, prop='real_estates'):
    """Count the items of ``person[prop]``, split into own and relatives' items.

    Args:
        person: declaration document (mapping).
        prop: name of the list-valued field to count, e.g. ``'real_estates'``
            or ``'vehicles'``. A missing or null field counts as empty.

    Returns:
        dict with ``<prop>_self_count`` (items without a ``relative``) and
        ``<prop>_rel_count`` (items that have one). Items whose ownership type
        is listed in ``skip_own_types`` are not counted.
    """
    self_key = prop + '_self_count'
    rel_key = prop + '_rel_count'
    res = {self_key: 0, rel_key: 0}

    for item in person.get(prop) or []:
        # `own_type` may be absent *or* explicitly null in the document;
        # `.get('own_type', {})` alone would return None for the latter.
        own_type = item.get('own_type') or {}
        if own_type.get('name') in skip_own_types:
            continue

        res[rel_key if item.get('relative') else self_key] += 1
    return res

In [45]:
def count_field(person, prop='incomes', field='size'):
    """Sum ``item[field]`` over ``person[prop]``, split into own and relatives' items.

    Args:
        person: declaration document (mapping).
        prop: name of the list-valued field to scan. A missing or null field
            counts as empty.
        field: numeric item attribute to add up; missing or null values count
            as 0.

    Returns:
        dict with ``<prop>_self_<field>`` (items without a ``relative``) and
        ``<prop>_rel_<field>`` (items that have one). Items whose ownership
        type is listed in ``skip_own_types`` are ignored.
    """
    self_key = prop + '_self_' + field
    rel_key = prop + '_rel_' + field
    res = {self_key: 0, rel_key: 0}

    for item in person.get(prop) or []:
        # `own_type` may be absent *or* explicitly null in the document;
        # `.get('own_type', {})` alone would return None for the latter.
        own_type = item.get('own_type') or {}
        if own_type.get('name') in skip_own_types:
            continue

        amount = item.get(field) or 0
        res[rel_key if item.get('relative') else self_key] += amount

    return res

In [46]:
# Distinct region names found anywhere inside the selected declarations.
possible_regions = {region['name'] for region, _path in get_possible_regions(milonov)}

In [47]:
# Order the declarations chronologically so that consecutive entries of
# `trends` correspond to consecutive years.
declarations_by_year = sorted(milonov, key=lambda x: x['main']['year'])

# One feature dict per declaration: price estimates, item counts, income and
# square totals, plus the declaration year.
trends = []
for declaration in declarations_by_year:
    # NOTE: `year` has to stay a notebook-level name: estimate_item() (called
    # through estimate_all_real_estates) picks it up from module scope because
    # it is not passed down as an argument.
    year = declaration['main']['year']
    price = estimate_all_real_estates(declaration, possible_regions)
    counts_estates = count_things(declaration, 'real_estates')
    counts_vehicles = count_things(declaration, 'vehicles')
    incomes = count_field(declaration)  # defaults: prop='incomes', field='size'
    
    sqrs = count_field(declaration, 'real_estates', 'square')
    
    # The partial dicts use distinct key names, so merging them overwrites nothing.
    features = {**price, **counts_estates, **counts_vehicles, **incomes, **sqrs, "year": year}
    
    trends.append(features)
    # Progress/sanity output: year and document type id of each declaration.
    print(year, declaration['main']['document_type']['id'])

2010 1
2011 1
2012 1
2013 1
2014 1
2015 1
2016 1


In [48]:
# Spot check: `declaration` is the loop variable left over from the loop above,
# i.e. the last (latest-year) entry of `declarations_by_year`.
count_field(declaration, 'real_estates', 'square')

{'real_estates_rel_square': 72.7, 'real_estates_self_square': 2003.01}

In [49]:
# Year-over-year change of every feature: later declaration minus earlier one.
# A feature missing on either side is treated as 0.
diffs = []
for earlier, later in zip(trends, trends[1:]):
    feature_names = set(list(earlier) + list(later))
    diffs.append(
        {name: later.get(name, 0) - earlier.get(name, 0) for name in feature_names}
    )
        

In [51]:
# One row per pair of consecutive declarations, one column per feature.
df_diffs = pd.DataFrame(diffs)

In [53]:
# Keep only the smallest and largest change per feature; the result is a
# nested dict of the form {column: {'min': ..., 'max': ...}}.
timeline_features = df_diffs.describe().loc[['min', 'max']].to_dict()

In [54]:
# Flatten {column: {stat: value}} into {"<column>_<stat>": value}.
timeline_features_flatten = {}
for stat, number, parents in nested_dict_iter(timeline_features, []):
    # The walker also yields the intermediate per-column dicts; keep leaves only.
    if not isinstance(number, dict):
        timeline_features_flatten["_".join(parents) + "_" + stat] = number

In [55]:
# Display the flattened min/max features.
timeline_features_flatten

{'incomes_rel_size_max': 8100000.0,
 'incomes_rel_size_min': -8126363.609999999,
 'incomes_self_size_max': 1500720.51,
 'incomes_self_size_min': -1496937.69,
 'real_estates_rel_count_max': 1.0,
 'real_estates_rel_count_min': -3.0,
 'real_estates_rel_price_max': 19586823.0,
 'real_estates_rel_price_min': -19849245.0,
 'real_estates_rel_square_max': 107.60000000000001,
 'real_estates_rel_square_min': -958.5999999999999,
 'real_estates_self_count_max': 1.0,
 'real_estates_self_count_min': -1.0,
 'real_estates_self_price_max': 15819453.0,
 'real_estates_self_price_min': -14990669.0,
 'real_estates_self_square_max': 107.59999999999991,
 'real_estates_self_square_min': -107.59999999999991,
 'vehicles_rel_count_max': 0.0,
 'vehicles_rel_count_min': 0.0,
 'vehicles_self_count_max': 2.0,
 'vehicles_self_count_min': -1.0,
 'year_max': 1.0,
 'year_min': 1.0}

In [57]:
# Attach the candidate regions. This stores the `possible_regions` set object
# itself (not a copy) and mutates the dict that was displayed in an earlier cell.
timeline_features_flatten['regions'] = possible_regions

In [58]:
# Display the features again, now including the 'regions' entry.
timeline_features_flatten

{'incomes_rel_size_max': 8100000.0,
 'incomes_rel_size_min': -8126363.609999999,
 'incomes_self_size_max': 1500720.51,
 'incomes_self_size_min': -1496937.69,
 'real_estates_rel_count_max': 1.0,
 'real_estates_rel_count_min': -3.0,
 'real_estates_rel_price_max': 19586823.0,
 'real_estates_rel_price_min': -19849245.0,
 'real_estates_rel_square_max': 107.60000000000001,
 'real_estates_rel_square_min': -958.5999999999999,
 'real_estates_self_count_max': 1.0,
 'real_estates_self_count_min': -1.0,
 'real_estates_self_price_max': 15819453.0,
 'real_estates_self_price_min': -14990669.0,
 'real_estates_self_square_max': 107.59999999999991,
 'real_estates_self_square_min': -107.59999999999991,
 'regions': {'Санкт-Петербург'},
 'vehicles_rel_count_max': 0.0,
 'vehicles_rel_count_min': 0.0,
 'vehicles_self_count_max': 2.0,
 'vehicles_self_count_min': -1.0,
 'year_max': 1.0,
 'year_min': 1.0}