In [1]:
# Relative path to the CSV export of the `actor_text_property` table;
# it is handed to `u.read_df` when the table is loaded.
actor_text_property_path = "../../data/actor_text_property.csv"

# Exploratory Analysis of table `actor_text_property`

In [2]:
import pandas as pd

import plotly.express as px
import plotly.io as pio

import geovpylib.utils as u
import geovpylib.analysis as a

# Default Plotly renderer(s) used for every figure produced in this notebook.
pio.renderers.default = "plotly_mimetype+notebook"

# Purple colour sequence for the charts: it runs from dark to light and back
# to dark, the last nine entries mirroring the first nine.
gv_color_seq = [
    '#322659', '#44337A', '#553C9A', '#6B46C1', '#805AD5',
    '#9F7AEA', '#B794F4', '#D6BCFA', '#E9D8FD',
    '#FAF5FF',
    '#E9D8FD', '#D6BCFA', '#B794F4', '#9F7AEA',
    '#805AD5', '#6B46C1', '#553C9A', '#44337A', '#322659',
]

# Load the table from the path configured in the first cell.
actor_text_property = u.read_df(actor_text_property_path)

## Table extract

In [3]:
# Show five randomly chosen rows as a quick look at the table.
# The seed is fixed so that the extract is the same on every
# "Restart Kernel -> Run All"; change the value to see other rows.
actor_text_property.sample(5, random_state=42)

Unnamed: 0,pk_actor_text_property,property_type,lang_iso_code,text,notes,fk_actor,creator,modifier,creation_time,modification_time,concat_actp
24665,10280.0,notice,fra,Docteur,,31432.0,14.0,11.0,2010-04-06 20:52:01.000,2013-12-18 15:24:16,AcTP10280
30351,16760.0,notice,fra,Employ de l'octroi de Lyon (1854-1870),,39129.0,24.0,24.0,2010-11-09 12:34:32.000,2013-12-18 15:24:16,AcTP16760
45971,67134.0,notice,fra,"N Lyon, France (1er arrondissement) de Joseph...",,57959.0,2.0,2.0,2015-12-15 15:02:46.400,,AcTP67134
20850,5761.0,notice,fra,"luthrien, allemand, tailleur, mari",,24896.0,31.0,11.0,2010-01-06 20:50:49.000,2013-12-18 15:24:16,AcTP5761
19981,4605.0,notice,fra,"Arts et Mtiers (Aix, 1882), employ Fives-Lille",,23730.0,11.0,11.0,2009-11-26 10:07:15.000,2013-12-18 15:24:16,AcTP4605


## Discovery

In [4]:
# Print the per-column overview (share of empty values, number of unique
# values and a few example values); three examples are requested per column.
a.discover(
    actor_text_property,
    uniq_ex_nb=3,
)

Columns contain:
Total number of rows: 53887
  - "pk_actor_text_property":   0.00% empty - 53887 (100.00%) uniques (eg: 29364.0; 29366.0; 17991.0)
  -          "property_type":   0.00% empty -     2 (  0.00%) uniques (eg: notice; complment)
  -               "fk_actor":   0.00% empty - 45931 ( 85.24%) uniques (eg: 47735.0; 47736.0; 40250.0)
  -            "concat_actp":   0.00% empty - 53887 (100.00%) uniques (eg: AcTP29364; AcTP29366; AcTP17991)
  -          "creation_time":   0.00% empty - 30407 ( 56.43%) uniques (eg: 2013-12-19...; 2013-12-19...; 2010-11-18...)
  -                   "text":   0.00% empty - 38278 ( 71.03%) uniques (eg: Directeur ...; Conseiller...; Il a t pro...)
  -                "creator":   0.01% empty -    87 (  0.16%) uniques (eg: 2.0; 50.0; 3.0)
  -          "lang_iso_code":   2.79% empty -     6 (  0.01%) uniques (eg: fra; nan; ita)
  -               "modifier":  13.57% empty -    82 (  0.15%) uniques (eg: 2.0; 50.0; 3.0)
  -      "modification_time":  42.69%

## Type parsing

In [5]:
# Declare the intended type of each column; the type names used here
# ('int', 'string', 'text', 'datetime') are interpreted by `a.set_types`.
# NOTE(review): the discovery output reports empty values for `creator`,
# `modifier` and `modification_time` -- confirm that `a.set_types` copes with
# missing values for the 'int' and 'datetime' targets.
a.set_types(
    actor_text_property,
    {
        'pk_actor_text_property': 'int',
        'property_type': 'string',
        'text': 'text',
        'fk_actor': 'int',
        'concat_actp': 'string',
        'creation_time': 'datetime',
        'creator': 'int',
        'lang_iso_code': 'string',
        'modifier': 'int',
        'modification_time': 'datetime',
        'notes': 'string',
    },
)

## Columns analysis

This section reports noteworthy findings for a selection of columns; it is not exhaustive.

### property_type

In [6]:
# Share of each `property_type` value, drawn as a pie chart with the
# notebook's purple colour sequence.
a.histogram(
    actor_text_property,
    'property_type',
    'Property type distribution',
    style='pie',
    colors=gv_color_seq,
)

### text

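Non-ASCII characters, Windows line breaks (`\r\n`) and `<p>`/`</p>` tags are removed. Note that the replacement below is applied to every column of the table, not only to `text`.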

In [None]:
# Remove, in this order: runs of non-ASCII characters, Windows line breaks,
# opening paragraph tags and closing paragraph tags. Each pass is a regex
# replacement that mutates the frame in place and is applied to the whole
# table (no column is selected).
for _junk_pattern in (r'[^\x00-\x7F]+', '\r\n', '<p>', '</p>'):
    actor_text_property.replace({_junk_pattern: ''}, regex=True, inplace=True)

### creation_time

In [None]:
# Violin plot of the `creation_time` values along the x axis.
px.violin(
    actor_text_property,
    x='creation_time',
    title='Violin plot of the "creation_time" column',
)

### creator

In [None]:
# Distribution of the `creator` column.
# NOTE(review): the trailing positional 10 is passed straight to
# `a.histogram`; its meaning is not visible here -- confirm against the
# helper's signature.
a.histogram(
    actor_text_property,
    'creator',
    'Creator distribution',
    10,
)

### lang_iso_code

In [None]:
# Share of each `lang_iso_code` value, drawn as a pie chart with the
# notebook's purple colour sequence.
a.histogram(
    actor_text_property,
    'lang_iso_code',
    'Language distribution',
    style='pie',
    colors=gv_color_seq,
)