# Exploratory Analysis of table `actor_text_property`

In [None]:
# Python lib
import os
import pandas as pd
import datetime

# External lib
import plotly.express as px
import plotly.io as pio
# Default Plotly renderer for every figure in this notebook; the '+' joins
# two renderer names so that both are used.
pio.renderers.default = "plotly_mimetype+notebook"
# Purple palette shared by the charts below: ten shades from darkest to
# lightest, followed by the same shades mirrored back to the darkest
# (19 entries in total, the lightest shade appears only once).
gv_color_seq = [
    '#322659',
    '#44337A',
    '#553C9A',
    '#6B46C1',
    '#805AD5',
    '#9F7AEA',
    '#B794F4',
    '#D6BCFA',
    '#E9D8FD',
    '#FAF5FF',
]
gv_color_seq += gv_color_seq[-2::-1]

# Local lib
import toolkit as tk

# Open the database connection. The value of the YELLOW_BHP environment
# variable is handed to the toolkit as-is (None when the variable is unset).
tk.db_connect(
    os.environ.get('YELLOW_BHP'),
    verbose=False,
)

# Load every row and column of bhp.actor_text_property
actor_text_property = tk.db_execute(
    'select * from bhp.actor_text_property'
)

## Table extract

In [None]:
# Show five randomly drawn rows (the sample changes on every run)
actor_text_property.sample(n=5)

## Discovery

In [None]:
# Toolkit overview of the table.
# NOTE(review): uniq_ex_nb=3 presumably limits the number of unique example
# values reported per column - confirm against toolkit.discover.
tk.discover(actor_text_property, uniq_ex_nb=3)

## Type parsing

Based on the discovery table above, we parse each column into its most meaningful type.

In [None]:
# Declare the target type of every column; the type names ('int', 'string',
# 'text', 'datetime') are interpreted by the toolkit's set_types helper.
tk.set_types(
    actor_text_property,
    {
        'pk_actor_text_property': 'int',
        'property_type': 'string',
        'text': 'text',
        'fk_actor': 'int',
        'concat_actp': 'string',
        'creation_time': 'datetime',
        'creator': 'int',
        'lang_iso_code': 'string',
        'modifier': 'int',
        'modification_time': 'datetime',
        'notes': 'string',
    },
)

## Columns analysis

Here we report the analysis of interesting information found in different columns. The analysis is not exhaustive.

For some of the columns, we will update their values.

### property_type

'notice web' and 'notice_web' are being merged.

In [None]:
# Unify the two spellings of the same property type before plotting:
# every exact 'notice_web' value becomes 'notice web'.
actor_text_property['property_type'] = actor_text_property['property_type'].replace(
    to_replace='notice_web',
    value='notice web',
)

# Pie chart of the property types, drawn with the shared purple palette
tk.histogram(
    actor_text_property,
    'property_type',
    'Property type distribution',
    style='pie',
    colors=gv_color_seq,
)

Moreover, according to the wiki page, 'notice web' (which now also covers the former 'notice_web') and 'notice' are then merged into 'notice'.

In [None]:
# Fold the exact value 'notice web' into 'notice'
actor_text_property['property_type'] = actor_text_property['property_type'].replace(
    to_replace='notice web',
    value='notice',
)

### text

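Non-ASCII characters, Windows line breaks (`\r\n`) and the paragraph tags `<p>` / `</p>` are removed. Note that the replacement is run on the whole table, so every text-like column is cleaned, not only `text`.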

In [None]:
# Strip unwanted character sequences, one pattern after the other and in this
# exact order: runs of non-ASCII characters, CRLF line breaks, then the
# opening and closing paragraph tags.
# NOTE(review): the replacement is run on the whole DataFrame, not only on the
# 'text' column, and only <p>/</p> and '\r\n' are handled (no other HTML tag,
# no lone '\n') - confirm that this is the intended scope.
for pattern in (r'[^\x00-\x7F]+', '\r\n', '<p>', '</p>'):
    actor_text_property.replace({pattern: ''}, regex=True, inplace=True)

### creation_time

In [None]:
# Distribution of the creation timestamps as a horizontal violin plot
px.violin(
    data_frame=actor_text_property,
    x='creation_time',
    title='Violin plot of the "creation_time" column',
)

### creator

In [None]:
# Distribution of the 'creator' column.
# NOTE(review): the meaning of the 4th positional argument (10) is not visible
# here - presumably a cap on the number of bars/values; confirm against
# toolkit.histogram.
tk.histogram(actor_text_property, 'creator', 'Creator distribution', 10)

### lang_iso_code

In [None]:
# Pie chart of the 'lang_iso_code' column, drawn with the shared purple palette
tk.histogram(actor_text_property, 'lang_iso_code', 'Language distribution', style='pie', colors=gv_color_seq)

---

In [None]:
# Export the cleaned table as a semicolon-separated CSV without the index.
# quoting=2 is the value of csv.QUOTE_NONNUMERIC: every non-numeric field is
# wrapped in quotes.
actor_text_property.to_csv(
    path_or_buf='../../data/actor_text_property.csv',
    index=False,
    sep=';',
    quoting=2,
)