# Exploratory Analysis of table `actor_text_property`

In [1]:
# Python lib
import os
import pandas as pd
import datetime

# External lib
import plotly.express as px
import plotly.io as pio
# Default plotly renderer: plotly accepts several renderer names joined by '+',
# so both 'plotly_mimetype' and 'notebook' are enabled here.
pio.renderers.default = "plotly_mimetype+notebook"
# Purple ramp from darkest to lightest; the palette walks up the ramp and then
# back down (without repeating the lightest shade), giving 19 symmetric entries.
_purple_ramp = [
    '#322659', '#44337A', '#553C9A', '#6B46C1', '#805AD5',
    '#9F7AEA', '#B794F4', '#D6BCFA', '#E9D8FD', '#FAF5FF',
]
gv_color_seq = _purple_ramp + _purple_ramp[-2::-1]

# Local lib
import toolkit as tk

# Open the database connection. The connection target is read from the
# YELLOW_BHP environment variable (None is passed through when it is unset).
bhp_db_target = os.environ.get('YELLOW_BHP')
tk.db_connect(bhp_db_target, verbose=False)

# Load every row and column of the table into `actor_text_property`
actor_text_property = tk.db_execute('select * from bhp.actor_text_property')

## Table extract

In [2]:
# Preview five randomly drawn rows (no seed is set, so the rows differ on every run)
actor_text_property.sample(n=5)

Unnamed: 0,pk_actor_text_property,property_type,lang_iso_code,text,notes,fk_actor,creator,modifier,creation_time,modification_time,concat_actp
40158,3132,notice_web,fra,"Né à Lyon en 1862, diplômé de l'École supérieu...",,22244,11.0,11.0,2009-06-18 10:50:31.000,2013-12-18 15:24:16,AcTP3132
53704,74246,notice_web,fra,<p>Enseigne la discipline de Philosophie natur...,Notice générée automatiquement à partir de l'i...,2380,50.0,50.0,2017-11-30 15:17:44.290,NaT,AcTP74246
38776,27479,notice,fra,<p>Religieuse professe du Carmel de l'Incarnat...,,45489,1.0,1.0,2012-04-27 22:02:56.970,2021-11-22 16:12:42,AcTP27479
16186,65745,complément,fra,D633,494,56812,51.0,51.0,2014-12-20 22:34:22.890,NaT,AcTP65745
37349,24109,notice,fra,"Né à Mardore, en 1883, industriel ; ARASSH : C...",,7942,11.0,11.0,2008-07-18 19:32:39.000,2013-12-18 15:24:16,AcTP24109


## Discovery

In [3]:
# Print a per-column overview of the table (share of empty values, number of
# unique values, example values). `uniq_ex_nb=3` presumably sets how many
# example values are listed per column — confirm against the toolkit module.
tk.discover(actor_text_property, uniq_ex_nb=3)

Columns contain:
Total number of rows: 53887
  - "pk_actor_text_property":   0.00% empty - 53887 (100.00%) uniques (eg: 29364; 29366; 17991)
  -          "property_type":   0.00% empty -     4 (  0.01%) uniques (eg: notice; notice_web; complément)
  -                   "text":   0.00% empty - 38518 ( 71.48%) uniques (eg: <p>Directe...; <p>Conseil...; <p>Il a ét...)
  -               "fk_actor":   0.00% empty - 45931 ( 85.24%) uniques (eg: 47735; 47736; 40250)
  -            "concat_actp":   0.00% empty - 53887 (100.00%) uniques (eg: AcTP29364; AcTP29366; AcTP17991)
  -          "creation_time":   0.00% empty - 30407 ( 56.43%) uniques (eg: 2013-12-19...; 2013-12-19...; 2010-11-18...)
  -                "creator":   0.01% empty -    87 (  0.16%) uniques (eg: 2.0; 50.0; 3.0)
  -          "lang_iso_code":   2.79% empty -     6 (  0.01%) uniques (eg: fra; None; ita)
  -               "modifier":  13.57% empty -    82 (  0.15%) uniques (eg: 2.0; 50.0; 3.0)
  -      "modification_time":  42.6

## Type parsing

Based on the discovery output above, each column is parsed into the most meaningful type.

In [4]:
# Target type for each column, in the order reported by the discovery step.
# The type names ('int', 'string', 'text', 'datetime') are interpreted by the
# toolkit helper.
# NOTE(review): 'creator' and 'modifier' contain empty values; confirm that
# tk.set_types handles missing values when casting to 'int'.
column_types = {
    'pk_actor_text_property': 'int',
    'property_type': 'string',
    'text': 'text',
    'fk_actor': 'int',
    'concat_actp': 'string',
    'creation_time': 'datetime',
    'creator': 'int',
    'lang_iso_code': 'string',
    'modifier': 'int',
    'modification_time': 'datetime',
    'notes': 'string',
}
tk.set_types(actor_text_property, column_types)

## Columns analysis

This section reports the interesting findings for individual columns. The analysis is not exhaustive.

For some of the columns, the values are also updated.

### property_type

The values 'notice web' and 'notice_web' are merged into a single value, 'notice web'.

In [5]:
# Unify the two spellings: every exact 'notice_web' value becomes 'notice web'
property_type_unified = actor_text_property['property_type'].replace(
    to_replace='notice_web',
    value='notice web',
)
actor_text_property['property_type'] = property_type_unified

# tk.histogram(actor_text_property, 'property_type', 'Property type distribution', style='pie', colors=gv_color_seq)

Moreover, according to the wiki page, 'notice web' and 'notice' should also be merged, so 'notice web' is replaced by 'notice'.

In [6]:
# Fold the remaining 'notice web' values into 'notice'
property_type_folded = actor_text_property['property_type'].replace(
    to_replace='notice web',
    value='notice',
)
actor_text_property['property_type'] = property_type_folded

### text

All HTML tags, non-ASCII characters and newlines are removed.

In [7]:
# Strip unwanted content from the string cells, one pattern after the other
# (the order is kept: non-ASCII characters first, then newlines, then tags).
#
# Fixes compared with the previous version:
#   * newlines: '\r\n' only matched Windows line endings, so bare '\n' or '\r'
#     survived; every newline form is now removed;
#   * HTML: only '<p>' and '</p>' were removed, so any other tag (or a '<p ...>'
#     with attributes) survived; any opening, closing or self-closing tag whose
#     name starts with a letter is now removed.
#
# NOTE(review): the replacement is applied to the whole frame, not only to the
# 'text' column, so accented characters are also stripped from other string
# columns (e.g. the 'complément' value of 'property_type') — confirm this is
# intended.
cleaning_patterns = (
    r'[^\x00-\x7F]+',         # runs of non-ASCII characters
    r'\r\n|\r|\n',            # Windows, old-Mac and Unix newlines
    r'</?[A-Za-z][^<>]*>',    # HTML tags such as <p>, </p>, <br/>, <a href="...">
)
for cleaning_pattern in cleaning_patterns:
    actor_text_property.replace({cleaning_pattern: ''}, regex=True, inplace=True)

### creation_time

In [8]:
# px.violin(actor_text_property, x='creation_time', title='Violin plot of the "creation_time" column')

### creator

In [9]:
# tk.histogram(actor_text_property, 'creator', 'Creator distribution', 10)

### lang_iso_code

In [10]:
# tk.histogram(actor_text_property, 'lang_iso_code', 'Language distribution', style='pie', colors=gv_color_seq)

---

In [11]:
# Export the processed table as a semicolon-separated CSV without the index.
export_path = '../../data/actor_text_property.csv'
actor_text_property.to_csv(
    export_path,
    sep=';',
    index=False,
    quoting=2,  # numeric value of csv.QUOTE_NONNUMERIC: quote every non-numeric field
)