# Exploratory Analysis of table `actor_text_property`

In [1]:
# Python lib
import os
import pandas as pd
import datetime

# External lib
import plotly.express as px
import plotly.io as pio
# Default renderer for Plotly figures; the "+" joins two renderer names
# ("plotly_mimetype" and "notebook") so both are used when a figure is shown.
pio.renderers.default = "plotly_mimetype+notebook"
# Purple palette passed as `colors=` to the pie charts below: ten shades going
# from dark to light, then the same shades mirrored back to dark (19 entries).
gv_color_seq = [
    '#322659', '#44337A', '#553C9A', '#6B46C1', '#805AD5',
    '#9F7AEA', '#B794F4', '#D6BCFA', '#E9D8FD', '#FAF5FF',
]
# Append the first nine shades in reverse order (the lightest one is not repeated).
gv_color_seq += gv_color_seq[-2::-1]


# Local lib
import toolkit as tk

# Connect to db: the connection setting is read from the YELLOW_BHP
# environment variable (None is passed on if the variable is not set).
tk.db_connect(
    os.getenv('YELLOW_BHP'),
    verbose=False,
)

# Load every row and column of the `bhp.actor_text_property` table.
actor_text_property = tk.db_execute(
    'select * from bhp.actor_text_property'
)

## Table extract

In [2]:
# Display five randomly drawn rows (no seed is set, so the rows differ on each run).
actor_text_property.sample(n=5)

Unnamed: 0,pk_actor_text_property,property_type,lang_iso_code,text,notes,fk_actor,creator,modifier,creation_time,modification_time,concat_actp
3330,55124,complément,fra,"<p>Père : Louis, Auguste Rougier (Lyon 1792-Ly...",<p>361</p>,56387,51.0,51.0,2014-10-02 17:14:43.280,2014-12-17 13:22:02,AcTP55124
12405,53199,complément,deu,Lehrer; Naturwissenschaftler,fb_import_20140912_3133,54310,3.0,,2014-09-12 12:22:41.130,NaT,AcTP53199
41113,7159,notice_web,fra,"Né à Rives (Isère), diplômé des Arts et Métier...",,26293,11.0,11.0,2010-01-23 18:33:34.000,2013-12-18 15:24:16,AcTP7159
16554,25906,notice,fra,"Né à Magny-en-Vexin, en 1889, ARASSH : Rhône (...",,9715,11.0,11.0,2008-07-18 19:35:19.000,2013-12-18 15:24:16,AcTP25906
4742,33187,notice_web,fra,<p>confrère de l'Oratoire de France</p>,,50746,69.0,69.0,2014-07-21 15:18:09.870,NaT,AcTP33187


## Discovery

In [3]:
# Project-local helper; judging by its printed output it reports, per column,
# the share of empty values, the number of unique values and a few examples.
# uniq_ex_nb=3 presumably sets the number of example values shown — TODO confirm.
tk.discover(actor_text_property, uniq_ex_nb=3)

Columns contain:
Total number of rows: 53887
  - "pk_actor_text_property":   0.00% empty - 53887 (100.00%) uniques (eg: 29364; 29366; 17991)
  -          "property_type":   0.00% empty -     4 (  0.01%) uniques (eg: notice; notice_web; complément)
  -                   "text":   0.00% empty - 38518 ( 71.48%) uniques (eg: <p>Directe...; <p>Conseil...; <p>Il a ét...)
  -               "fk_actor":   0.00% empty - 45931 ( 85.24%) uniques (eg: 47735; 47736; 40250)
  -            "concat_actp":   0.00% empty - 53887 (100.00%) uniques (eg: AcTP29364; AcTP29366; AcTP17991)
  -          "creation_time":   0.00% empty - 30407 ( 56.43%) uniques (eg: 2013-12-19...; 2013-12-19...; 2010-11-18...)
  -                "creator":   0.01% empty -    87 (  0.16%) uniques (eg: 2.0; 50.0; 3.0)
  -          "lang_iso_code":   2.79% empty -     6 (  0.01%) uniques (eg: fra; None; ita)
  -               "modifier":  13.57% empty -    82 (  0.15%) uniques (eg: 2.0; 50.0; 3.0)
  -      "modification_time":  42.6

## Type parsing

Based on the discovery output above, each column is parsed into its most meaningful type.

In [4]:
# Declare the target type of every column. The type names ('int', 'string',
# 'text', 'datetime') are interpreted by the project-local `tk.set_types`.
# NOTE(review): `creator` and `modifier` contain empty values according to the
# discovery output, yet are declared 'int' — confirm the helper copes with that.
tk.set_types(
    actor_text_property,
    {
        'pk_actor_text_property': 'int',
        'property_type': 'string',
        'text': 'text',
        'fk_actor': 'int',
        'concat_actp': 'string',
        'creation_time': 'datetime',
        'creator': 'int',
        'lang_iso_code': 'string',
        'modifier': 'int',
        'modification_time': 'datetime',
        'notes': 'string',
    },
)

## Columns analysis

This section reports noteworthy findings for individual columns. The analysis is not exhaustive.

For some of the columns, the values are also updated.

### property_type

The values 'notice_web' and 'notice web' denote the same property type, so 'notice_web' is merged into 'notice web'.

In [7]:
# Fold the 'notice_web' spelling into 'notice web' so both count as one category.
actor_text_property['property_type'] = (
    actor_text_property['property_type']
    .replace(to_replace='notice_web', value='notice web')
)

# Pie chart of the property types after the merge.
tk.histogram(
    actor_text_property,
    'property_type',
    'Property type distribution',
    style='pie',
    colors=gv_color_seq,
)

### text

All HTML tags, non-ASCII characters and line breaks are removed.

In [13]:
# Clean only the free-text columns. The previous version ran the replacements
# on the whole DataFrame, which also stripped non-ASCII characters from every
# other string column (e.g. the `property_type` value 'complément' lost its
# accent). `notes` is kept in scope because it carries the same HTML markup
# as `text` and was cleaned by the previous version as well.
free_text_columns = ['text', 'notes']

# A new frame is built and assigned back (no `inplace=True`), so re-running
# this cell is idempotent.
actor_text_property[free_text_columns] = (
    actor_text_property[free_text_columns]
    # Any HTML tag, not only <p> and </p>.
    .replace(r'<[^>]+>', '', regex=True)
    # Windows (\r\n) as well as bare \n or \r line breaks.
    .replace(r'[\r\n]+', '', regex=True)
    # Runs of non-ASCII characters (this drops accented letters too).
    .replace(r'[^\x00-\x7F]+', '', regex=True)
)

### creation_time

In [12]:
# Violin plot of the values in the `creation_time` column.
px.violin(
    actor_text_property,
    x='creation_time',
    title='Violin plot of the "creation_time" column',
)

### creator

In [14]:
# Histogram of the `creator` column.
# NOTE(review): the meaning of the trailing positional 10 is defined by the
# project-local `tk.histogram`, which is not visible here — confirm and
# consider passing it by keyword.
tk.histogram(
    actor_text_property,
    'creator',
    'Creator distribution',
    10,
)

### lang_iso_code

In [17]:
# Pie chart of the language codes, drawn with the shared purple palette.
tk.histogram(
    actor_text_property,
    'lang_iso_code',
    'Language distribution',
    style='pie',
    colors=gv_color_seq,
)

---

In [None]:
# Write the table to a semicolon-separated CSV file, leaving out the index.
actor_text_property.to_csv(
    '../../data/actor_text_property.csv',
    sep=';',
    index=False,
)