In [1]:
# Destination file for the cleaned table; passed to u.save_df in the last cell.
# The path is relative, so it is resolved against the notebook's working directory.
actor_text_property_path = '../../data/bhp/actor_text_property.csv'

In [2]:
import os, datetime
import pandas as pd

import geovpylib.utils as u
import geovpylib.analysis as a

# Open the database connection; the connection parameter is read from the
# YELLOW_BHP environment variable so no credential is stored in the notebook.
# NOTE(review): os.environ.get returns None when the variable is unset, so
# db_connect would then receive None — confirm the helper reports that clearly.
u.db_connect(os.environ.get('YELLOW_BHP'))

>> Connecting to PGSQL Database ... Connected!


## Fetch full table

In [3]:
# Load every row and column of bhp.actor_text_property through the connection
# opened above. Later cells use the result as a pandas DataFrame.
actor_text_property = u.db_execute('select * from bhp.actor_text_property')

## Remove non-ASCII characters, line breaks and `<p>` tags

In [4]:
# Scrub the frame in place, one pattern after the other. The order is kept
# because removing one pattern can expose a match for a later one.
# NOTE(review): the first pattern drops *every* non-ASCII character, so accented
# letters disappear too (the discovery output shows 'complment') — confirm intended.
for unwanted_pattern in (r'[^\x00-\x7F]+', '\r\n', '<p>', '</p>'):
    actor_text_property.replace({unwanted_pattern: ''}, regex=True, inplace=True)

## Discovery

In [5]:
# Print a per-column summary of the frame (share of empty values, number of
# unique values, sample values); uniq_ex_nb=3 asks for three examples per column.
a.discover(actor_text_property, uniq_ex_nb=3)

Columns contain:
Total number of rows: 53887
  - "pk_actor_text_property":   0.00% empty - 53887 (100.00%) uniques (eg: 29364; 29366; 17991)
  -          "property_type":   0.00% empty -     4 (  0.01%) uniques (eg: notice; notice_web; complment)
  -                   "text":   0.00% empty - 38278 ( 71.03%) uniques (eg: Directeur ...; Conseiller...; Il a t pro...)
  -               "fk_actor":   0.00% empty - 45931 ( 85.24%) uniques (eg: 47735; 47736; 40250)
  -            "concat_actp":   0.00% empty - 53887 (100.00%) uniques (eg: AcTP29364; AcTP29366; AcTP17991)
  -          "creation_time":   0.00% empty - 30407 ( 56.43%) uniques (eg: 2013-12-19...; 2013-12-19...; 2010-11-18...)
  -                "creator":   0.01% empty -    87 (  0.16%) uniques (eg: 2.0; 50.0; 3.0)
  -          "lang_iso_code":   2.79% empty -     6 (  0.01%) uniques (eg: fra; None; ita)
  -               "modifier":  13.57% empty -    82 (  0.15%) uniques (eg: 2.0; 50.0; 3.0)
  -      "modification_time":  42.69

## Type parsing

In [6]:
# Declare the target type of each column, listed in table order.
# The type labels ('int', 'string', 'text', 'datetime') are interpreted by
# a.set_types — presumably it converts the columns in place, since the return
# value is not assigned; verify against the helper.
a.set_types(
    actor_text_property,
    dict(
        pk_actor_text_property='int',
        property_type='string',
        text='text',
        fk_actor='int',
        concat_actp='string',
        creation_time='datetime',
        creator='int',
        lang_iso_code='string',
        modifier='int',
        modification_time='datetime',
        notes='string',
    ),
)

## Handle data

In [7]:
# Two-step relabelling of property_type:
#   1. 'notice_web' is rewritten to 'notice web';
#   2. 'notice web' is then folded into 'notice', because according to the wiki
#      page the 'notice_web' and 'notice' types are to be merged.
actor_text_property['property_type'] = (
    actor_text_property['property_type']
    .replace('notice_web', 'notice web')
    .replace('notice web', 'notice')
)

## Save data

In [8]:
# Persist the cleaned frame to the path defined in the first cell.
# Presumably written as CSV, given the file extension — confirm in u.save_df.
u.save_df(actor_text_property, actor_text_property_path)