# Initialization

In [None]:
%matplotlib inline

In [None]:
# data access
import io
import sqlalchemy as sa

# data handling
import json

# internet
import requests

# data analysis
import numpy as np
import pandas as pd

import scipy
from scipy import stats
import statsmodels.api as sm
#import scikit-learn as sk

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# system packages
# `imp` is deprecated and was removed in Python 3.12; prefer importlib and keep
# the old import only as a fallback.
try:
    from importlib import reload
except ImportError:
    from imp import reload

In [None]:
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'
# (the old names were removed in 3.8), so pick whichever name is available.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# Set the figure size AFTER applying the style sheet: a style sheet may carry
# its own 'figure.figsize' and would otherwise override this value.
plt.rcParams['figure.figsize'] = [8, 5]

In [None]:
# my own libs
from libs.gov_eneff import rt

# Initialise the project runtime from the JSON config file; rt.config is read
# further down for the DaData API token.
rt.init('config/config.json')
# Keep a short alias to the logger exposed by the runtime module.
logger = rt.logger

In [None]:
# Import the project's db module and reload it so that the current version of
# the module is used even if it was already imported in this kernel.
from libs.gov_eneff import db
reload(db)

# Load Data

In [None]:
# The name `db` is rebound here from the module to a DB instance, so the
# original `db = db.DB()` failed when the cell was run a second time.
# Importing the module under an alias keeps this cell re-runnable.
from libs.gov_eneff import db as db_module

db = db_module.DB()

In [None]:
# Load the data through the project's DB helper using the 'pg' source;
# presumably this populates db.df_bld / db.df_declr / db.md_cols used below — TODO confirm.
db.load(source='pg')

# 2019-04-28 21:24:41 - DEBUG - Have fetched 44708 records of declarations of buildings.

In [None]:
# Work on a copy of the buildings frame; the enriched frame is written back to
# db.df_bld only in the export section.
df_bld = db.df_bld.copy()

## Metadata

In [None]:
def _read_field_sequence(path):
    """Return the first column of a header-less CSV file as a pandas Index."""
    first_column = pd.read_csv(path, header=None)[0]
    return pd.Index(first_column)


# Reference column orderings for the declarations and the buildings tables
declrs_f_seq = _read_field_sequence('data/declrs_fields_sequence.txt')
declr_buildings_f_seq = _read_field_sequence('data/declr_buildings_fields_sequence.txt')

# Attributes

## Declarations

In [None]:
# Work on a copy of the declarations frame so db.df_declr itself is not modified.
df_declr = db.df_declr.copy()

In [None]:
# Columns of df_declr that the reference sequence does not mention are appended
# after it, so the reordering below keeps every column.
extra_declr_cols = df_declr.columns[~df_declr.columns.isin(declrs_f_seq)]
declrs_f_seq_complemented = declrs_f_seq.append(extra_declr_cols)

print(df_declr.shape)
dft = df_declr.reindex(columns=declrs_f_seq_complemented).copy()
print(dft.shape)

In [None]:
# Column comments of the "declrs" table, keyed by column name
declr_col_comments = (
    db.md_cols
    .query('table_name == "declrs"')[['column_name', 'column_comment']]
    .set_index('column_name')
)

# One row per column of dft (in dft's order) with its comment attached;
# the helper column 0 produced by to_frame() is dropped afterwards.
dft2 = (
    dft.columns.to_frame()
    .merge(declr_col_comments, left_index=True, right_index=True, how='left')
    .drop(0, axis=1)
)

In [None]:
# Save the declaration attribute list (column name -> comment) as a spreadsheet.
dft2.to_excel('data/local_db/declr_attrs.xlsx')

## Buildings

In [None]:
# Columns of df_bld that the reference sequence does not mention are appended
# after it, so the reordering below keeps every column.
extra_bld_cols = df_bld.columns[~df_bld.columns.isin(declr_buildings_f_seq)]
declr_buildings_f_seq_complemented = declr_buildings_f_seq.append(extra_bld_cols)

print(df_bld.shape)
print(len(declr_buildings_f_seq_complemented))

dft = df_bld.reindex(columns=declr_buildings_f_seq_complemented).copy()
print(dft.shape)

In [None]:
# Column comments of the "declr_buildings" table, keyed by column name
bld_col_comments = (
    db.md_cols
    .query('table_name == "declr_buildings"')[['column_name', 'column_comment']]
    .set_index('column_name')
)

# One row per column of dft (in dft's order) with its comment attached;
# the helper column 0 produced by to_frame() is dropped afterwards.
dft2 = (
    dft.columns.to_frame()
    .merge(bld_col_comments, left_index=True, right_index=True, how='left')
    .drop(0, axis=1)
)

In [None]:
# Save the building attribute list (column name -> comment) as a spreadsheet.
dft2.to_excel('data/local_db/bld_attrs.xlsx')

# Basic Statistics

# Adding Features to Buildings

## Buildings Coordinates

In [None]:
import ast

df_bld_geo = pd.read_parquet('data/df_bld_geo.parquet')

# The geocoder responses are stored in the parquet file as text (the repr of a
# Python object); parse each one back into the object it represents.
raw_responses = df_bld_geo['geocoder_response']
df_bld_geo['geocoder_response'] = raw_responses.apply(ast.literal_eval)

In [None]:
#j = df_bld_geo['geocoder_response'].iloc[0]
#print(json.dumps(j, ensure_ascii=False, indent=4))

#print(j['response']['GeoObjectCollection']['featureMember'][0]['GeoObject']['Point']['pos'])

In [None]:
# print(j['response']['GeoObjectCollection']['featureMember'][0]['GeoObject']['metaDataProperty']['GeocoderMetaData']['text'])
# print(j['response']['GeoObjectCollection']['featureMember'][0]['GeoObject']['metaDataProperty']['GeocoderMetaData']['kind'])
# print(j['response']['GeoObjectCollection']['featureMember'][0]['GeoObject']['metaDataProperty']['GeocoderMetaData']['precision'])

In [None]:
def get_ya_geo_data(x):
    """Extract the fields of interest from one parsed Yandex Geocoder response.

    Only the first entry of ``featureMember`` is used.

    Parameters
    ----------
    x : dict or None
        Parsed geocoder response. Anything that is not a dict (for example
        ``None`` for a lookup that produced no response) is treated as
        "no data".

    Returns
    -------
    dict
        Keys ``ya_geo_text``, ``ya_geo_kind``, ``ya_geo_area2``,
        ``ya_geo_precision`` and ``ya_geo_pos``. All values are ``None`` when
        the response is missing or contains no match; ``ya_geo_area2`` alone is
        ``None`` when the match has no sub-administrative area.

    Raises
    ------
    KeyError
        If a match is present but lacks one of the mandatory fields.
    """
    res = {'ya_geo_text': None, 'ya_geo_kind': None, 'ya_geo_area2': None, 'ya_geo_precision': None, 'ya_geo_pos': None}

    # Nothing to extract from a missing / non-dict response.
    if not isinstance(x, dict):
        return res

    try:
        geo_obj = x['response']['GeoObjectCollection']['featureMember'][0]['GeoObject']
    except IndexError:
        # Empty 'featureMember' list: the geocoder found no match.
        return res

    meta = geo_obj['metaDataProperty']['GeocoderMetaData']

    res['ya_geo_text'] = meta['text']
    res['ya_geo_kind'] = meta['kind']
    try:
        res['ya_geo_area2'] = meta['AddressDetails']['Country']['AdministrativeArea']['SubAdministrativeArea']['SubAdministrativeAreaName']
    except KeyError:
        # Not every address has a sub-administrative area; keep the None default.
        pass
    res['ya_geo_precision'] = meta['precision']
    res['ya_geo_pos'] = geo_obj['Point']['pos']

    return res

# Parse every response exactly once and fan the parsed record out to the output
# columns (previously each response was re-parsed once per output column).
geo_records = [get_ya_geo_data(response) for response in df_bld_geo['geocoder_response']]
for col_name in ('ya_geo_text', 'ya_geo_kind', 'ya_geo_area2', 'ya_geo_precision', 'ya_geo_pos'):
    df_bld_geo[col_name] = [record[col_name] for record in geo_records]

In [None]:
# Geocoder-derived features, keyed by the address they were looked up for
geo_feature_cols = ['ya_geo_text', 'ya_geo_kind', 'ya_geo_area2', 'ya_geo_precision', 'ya_geo_pos']
geo_by_address = df_bld_geo[['actual_address'] + geo_feature_cols].set_index('actual_address')

# Left join: every building is kept, buildings without a geocoded address get NaN
df_bld = df_bld.merge(geo_by_address, left_on='actual_address', right_index=True, how='left')

## DaData Info (OKVED etc)

In [None]:
# Load the organisation data exported in the appendix; 'inn' is read with
# object dtype (rather than letting pandas infer a numeric type) and becomes the index.
df_dadata_orgs2 = pd.read_excel('data/orgs_dadata.xlsx', dtype={'inn': 'object'}).set_index('inn')

In [None]:
# DaData columns to attach to the buildings, mapped to their 'dd_'-prefixed names
# (the dict order is the order in which the columns are added).
dadata_col_map = {
    'okved': 'dd_okved',
    'okved_type': 'dd_okved_type',
    'area_type': 'dd_area_type',
    'area': 'dd_area',
    'city_type': 'dd_city_type',
    'city': 'dd_city',
    'geo_lat': 'dd_geo_lat',
    'geo_lon': 'dd_geo_lon',
    'geo_qc': 'dd_geo_qc',
}
dadata_features = df_dadata_orgs2[list(dadata_col_map)].rename(columns=dadata_col_map)

# Left join on the organisation's INN: every building is kept
df_bld = df_bld.merge(dadata_features, left_on='inn', right_index=True, how='left').copy()

In [None]:
# Raw strings: '\.' in a plain string literal is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning from Python 3.12). The patterns themselves
# are unchanged.
# NOTE(review): both patterns require a trailing dot after the captured prefix,
# so a code without a further component (e.g. 'NN' for level 1, 'NN.NN' for
# level 2) yields '' — confirm this is intended.
OKVED_L1_PATTERN = r'([0-9]+?)\.'
OKVED_L2_PATTERN = r'([0-9]+?\.[0-9]+?)\.'

for source_col, target_prefix in (('main_okved_code', 'main_okved_code'),
                                  ('dd_okved', 'dd_okved_code')):
    codes = df_bld[source_col]
    # expand=False returns a Series (one capture group) instead of a one-column DataFrame
    df_bld[f'{target_prefix}_l1'] = codes.str.extract(OKVED_L1_PATTERN, expand=False).fillna('')
    df_bld[f'{target_prefix}_l2'] = codes.str.extract(OKVED_L2_PATTERN, expand=False).fillna('')

## Climate Zone

In [None]:
# Frequency of each DaData area type among the buildings
df_bld['dd_area_type'].value_counts()

In [None]:
# Frequency of each DaData city type among the buildings
df_bld['dd_city_type'].value_counts()

In [None]:
# Frequency of each DaData area among the buildings
df_bld['dd_area'].value_counts()

In [None]:
# Frequency of each precision level reported by the Yandex geocoder
df_bld['ya_geo_precision'].value_counts()

In [None]:
# Share of buildings that have no Yandex coordinates
df_bld['ya_geo_pos'].isna().mean()

In [None]:
def _zone_from_latitude(latitude, north_lat, south_lat):
    """Map a latitude to a zone code: 1 above north_lat, 3 below south_lat, else 2."""
    if latitude > north_lat:
        return 1
    if latitude < south_lat:
        return 3
    return 2


def get_climate_zone(x, source, north_lat=56, south_lat=55):
    """Derive the climate zone of a building from its latitude.

    Zone codes:
        1 - 'North'  (latitude strictly above ``north_lat``)
        2 - 'Middle' (everything in between, both limits included)
        3 - 'South'  (latitude strictly below ``south_lat``)

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide 'ya_geo_pos' when ``source == 'yandex'`` or 'dd_geo_lat'
        when ``source == 'dadata'``.
    source : {'yandex', 'dadata'}
        Which coordinates to use. For 'yandex' the latitude is the second
        space-separated token of 'ya_geo_pos'; for 'dadata' it is 'dd_geo_lat'.
    north_lat, south_lat : float, optional
        Latitude limits of the middle zone; the defaults keep the original
        hard-coded 56 / 55.

    Returns
    -------
    int
        The zone code. Missing coordinates fall back to zone 2.

    Raises
    ------
    ValueError
        If ``source`` is not one of the supported values.
    """
    if source == 'yandex':
        pos = x['ya_geo_pos']
        raw_latitude = None if pd.isna(pos) else pos.split(' ')[1]
    elif source == 'dadata':
        lat = x['dd_geo_lat']
        raw_latitude = None if pd.isna(lat) else lat
    else:
        raise ValueError(f"unknown source {source!r}; expected 'yandex' or 'dadata'")

    # No coordinates available: default to the middle zone
    if raw_latitude is None:
        return 2

    return _zone_from_latitude(float(raw_latitude), north_lat, south_lat)
        

In [None]:
# Climate zone of every building derived from the Yandex coordinates (row-wise apply)
df_bld['ya_climate_zone'] = df_bld.apply(lambda x: get_climate_zone(x, source='yandex'), axis=1)

In [None]:
# Climate zone of every building derived from the DaData latitude (row-wise apply)
df_bld['dd_climate_zone'] = df_bld.apply(lambda x: get_climate_zone(x, source='dadata'), axis=1)

In [None]:
# Share of buildings for which both coordinate sources give the same climate zone
(df_bld['ya_climate_zone'] == df_bld['dd_climate_zone']).mean()

# Export Data

In [None]:
# Hand a copy of the enriched buildings frame back to the DB object before saving
db.df_bld = df_bld.copy()

In [None]:
# Persist the data through the project's DB helper, using the 'local' destination
db.save(dest='local')

# Appendix A

# External Sources - Meant to Be Run Once / Skip if Uncertain

## Geo (Yandex Geocoder)

In [None]:
from libs.gov_eneff.utils import Geocoder

In [None]:
geocoder = Geocoder()

# removing duplicates
df_bld_geo = db.df_bld[['actual_address']].drop_duplicates().copy()

# Create the response column directly from the lookups. Pre-filling it with
# float NaN and then writing objects through .iloc relies on implicit dtype
# upcasting, which newer pandas versions deprecate.
df_bld_geo['geocoder_response'] = df_bld_geo['actual_address'].apply(geocoder.by_address)

In [None]:
# j = df_bld_geo['geocoder_response'].iloc[0]

# json.dumps(j, ensure_ascii=False, sort_keys=True)

# print(j['response']['GeoObjectCollection']['featureMember'][0]['GeoObject']['metaDataProperty']['GeocoderMetaData'] \
#           ['Address']['Components'])

# print(j['response']['GeoObjectCollection']['featureMember'][0]['GeoObject']['Point']['pos'])

In [None]:
# The responses are stored as text (their Python repr) and parsed back when the
# cache is loaded. `skipna` is not a parameter of DataFrame.astype in
# pandas >= 1.0 (it raised TypeError there), so it is no longer passed.
df_bld_geo.astype({'actual_address': str, 'geocoder_response': str}).to_parquet('data/df_bld_geo.parquet')

## Org Info (Inc. OKVED) from DaData

In [None]:
def get_org_info(inn, timeout=10):
    """Look up an organisation by its INN through the DaData ``findById/party`` API.

    API reference: https://dadata.ru/api/find-party/

    Background on the OKVED codes in the response: until 10 July 2016 the old
    classifier OKVED OK 029-2001 (KDES/NACE Rev. 1) was in use; from
    11 July 2016 the Federal Tax Service (FNS) switched to the new edition,
    OKVED-2 (OK 029-2014, KDES/NACE Rev. 2). Codes of legal entities and
    individuals registered before 11 July 2016 were converted to the new
    classifier automatically, and using the old codes has been prohibited since
    that date (the tax inspectorate refuses registration otherwise). A mapping
    table from old to new codes helps to pick the right new code.

    Parameters
    ----------
    inn : str
        Taxpayer identification number to query.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the whole batch indefinitely.

    Returns
    -------
    The decoded JSON body of the response.
    """
    auth_token = rt.config['dadata']['auth_token']

    headers = {'Content-Type': 'application/json',
               'Accept': 'application/json',
               'Authorization': f'Token {auth_token}'}

    # NOTE(review): the HTTP status is not checked; an error response is returned
    # as whatever JSON the server sent — confirm callers can cope with that.
    r = requests.get('https://suggestions.dadata.ru/suggestions/api/4_1/rs/findById/party',
                     params={'query': inn}, headers=headers, timeout=timeout)
    #time.sleep(.2)

    return r.json()

In [None]:
#get_org_info('5029144163')

In [None]:
# r = get_org_info('5030020966')
# print(json.dumps(r, ensure_ascii=False, indent=4))

In [None]:
# 'inn == inn' is False for NaN, so the query keeps only declarations with an INN;
# each distinct INN is then looked up once.
df_dadata_orgs = db.df_declr.query('inn == inn')['inn'].drop_duplicates().to_frame()
# Map over the column itself instead of a row-wise DataFrame.apply(axis=1)
df_dadata_orgs['org_data'] = df_dadata_orgs['inn'].map(get_org_info)
df_dadata_orgs = df_dadata_orgs.set_index('inn', drop=True)

In [None]:
# Store the raw responses as text (their Python repr). `skipna` is not a
# parameter of DataFrame.astype in pandas >= 1.0 (it raised TypeError there),
# so it is no longer passed.
df_dadata_orgs.astype({'org_data': str}).to_parquet('data/dadata_orgs_by_inn.parquet')

In [None]:
#print(json.dumps(df_dadata_orgs.loc['5007051116'].iloc[0], ensure_ascii=False, indent=4))

In [None]:
#r = df_dadata_orgs.loc['5034082850'].iloc[0]
#print(r['suggestions'][0]['data'])

In [None]:
# Flatten the first suggestion of every DaData response into one record per INN.
records = []

# Series.items() replaces iteritems(), which was removed in pandas 2.0
for key, r in df_dadata_orgs['org_data'].items():

    # Skip INNs without a response or without any suggestion
    if r is None:
        continue

    if len(r['suggestions']) == 0:
        continue

    try:
        org = r['suggestions'][0]['data']
        address = org['address']
        address_data = address['data']

        records.append([
            key,
            # org['opf']['full'],
            org['name']['full'],
            org['name']['short'],
            org['state']['status'],
            org['state']['registration_date'],
            org['state']['liquidation_date'],
            org['state']['actuality_date'],
            org['okved'],
            org['okved_type'],
            address['value'],
            address_data['region_with_type'],
            address_data['area'],
            address_data['area_type'],
            address_data['city'],
            address_data['city_type'],
            address_data['timezone'],
            address_data['geo_lat'],
            address_data['geo_lon'],
            address_data['qc_geo'],
            org['qc']])
    except Exception:
        # Show which INN has an unexpected response shape, then fail loudly
        print(key)
        raise

In [None]:
# Column names, in the order in which the record fields were collected
org_columns = [
    'inn', 'full_name', 'short_name', 'status',
    'registration_date', 'liquidation_date', 'actuality_date',
    'okved', 'okved_type',
    'address', 'region', 'area', 'area_type', 'city', 'city_type',
    'timezone', 'geo_lat', 'geo_lon', 'geo_qc', 'data_qc',
]
dft = pd.DataFrame(records, columns=org_columns).set_index('inn')

# The three date fields are interpreted as epoch timestamps in milliseconds
for date_col in ('registration_date', 'liquidation_date', 'actuality_date'):
    dft[date_col] = pd.to_datetime(dft[date_col], unit='ms')

In [None]:
# Attach the flattened fields to the raw responses; the left join keeps every
# queried INN, including those for which no record was built.
df_dadata_orgs2 = df_dadata_orgs.merge(dft, left_index=True, right_index=True, how='left')

In [None]:
# Export the organisation table; this is the file the "DaData Info" section reads back.
df_dadata_orgs2.to_excel('data/orgs_dadata.xlsx')

In [None]:
# Share of queried INNs for which no OKVED code was obtained
df_dadata_orgs2['okved'].isna().mean()

# References

- http://economy.gov.ru/minec/activity/sections/classificators/