In [1]:

%pprint

Pretty printing has been turned OFF


In [2]:

from importlib.util import find_spec
import sys

if find_spec('storage') is None:
    if (osp.join('..', 'py') not in sys.path): sys.path.insert(1, osp.join('..', 'py'))
from collections import Counter
from storage import Storage
from pandas import DataFrame
from pathlib import Path
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import csv
import networkx as nx
import numpy as np
import os
import pandas as pd
import re
import warnings

s = Storage()
warnings.filterwarnings('ignore')  # silence warnings

In [5]:

# Get the maximum element styles list (largest list of styles that appears in a single style tag) and add it to the css
file_path = os.path.join(s.data_folder, 'svg', 'scratchpad.svg')
print(os.path.abspath(file_path))
with open(file_path, 'r', encoding=s.encoding_type) as f:
    svg_str = f.read()
style_regex = re.compile(r'\bstyle="([^"]+)"')
styles_lists_array = [[x.strip() for x in styles_list.split(';') if x.strip()] for styles_list in style_regex.findall(svg_str)]
vocabulary_set = set()
for sublist in styles_lists_array: vocabulary_set.update(sublist)
bag_of_words = [dict(Counter(sublist)) for sublist in styles_lists_array]
for i, sublist in enumerate(bag_of_words):
    bag_of_words[i] = {word: sublist.get(word, 0) for word in vocabulary_set}
DataFrame(bag_of_words).sum().sort_values(ascending=False).head(20)

C:\Users\daveb\OneDrive\Documents\GitHub\notebooks\data\svg\scratchpad.svg


Series([], dtype: float64)

In [21]:

style_delimiter = ''';
            '''
for i, styles_list in enumerate(sorted(styles_lists_array, key=lambda x: len(x), reverse=True)):
    print(f'''
        .style{i} {{
            {style_delimiter.join(sorted([x.replace(':', ': ') for x in styles_list]))};
        }}''')


        .style0 {
            overflow: visible;
        }

        .style1 {
            color-interpolation-filters: sRGB;
        }

        .style2 {
            display: none;
        }

        .style3 {
            display: inline;
        }

        .style4 {
            display: inline;
        }


In [54]:

max_sublist = DataFrame(bag_of_words).sum().sort_values(ascending=False).head(6).index.tolist()
style_delimiter = ''';
             '''
search_str = '''
      </style>'''
replace_str = f'''
          .black-stroke {{
             {style_delimiter.join(sorted([x.replace(':', ': ') for x in max_sublist]))};
          }}{search_str}'''
svg_str = re.sub(search_str, replace_str, svg_str)
with open(file_path, 'w', encoding=s.encoding_type) as f:
    print(svg_str, file=f)

In [75]:

compile_list = '|'.join(max_sublist)
regex_str = f'''style="([^"]*)({compile_list});?([^"]*)"'''
print(regex_str)
style_regex = re.compile(regex_str)

style="([^"]*)(stroke:#000000|stroke-linecap:butt|stroke-linejoin:miter|stroke-opacity:1|stroke-miterlimit:4|stroke-dashoffset:0);?([^"]*)"



----

In [3]:

# Clean up the contact list to minimize double quotes
csv_name = 'itm_kick-off_meeting_attendee_contact_list'
if not s.csv_exists(csv_name=csv_name, folder_path=s.saves_csv_folder, verbose=False):
    contact_list_df = s.load_csv(csv_name)
    mask_series = (contact_list_df.last_name == 'Picucci')
    contact_list_df.loc[mask_series, 'first_name'] = 'PM "Pooch"'
    s.save_dataframes(**{csv_name: contact_list_df})


## Structure from Domain Knowledge
### We can manually define a structure model by specifying the relationships between different features.

In [3]:

def get_structure_plot(sm, file_path):
    os.makedirs(name=os.path.dirname(file_path), exist_ok=True)
    Path(file_path).touch()
    viz = plot_structure(
        sm,
        all_node_attributes=NODE_STYLE.WEAK,
        all_edge_attributes=EDGE_STYLE.WEAK,
    )
    
    return viz

In [4]:

# s.ensure_module_installed('networkx --force-reinstall', upgrade=True, verbose=False)
# s.ensure_module_installed('pygraphviz', upgrade=False, verbose=True)
s.ensure_module_installed('causalnex', upgrade=False, verbose=False)
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE
from causalnex.structure import StructureModel
from causalnex.structure.notears import from_pandas

# First, we must create an empty structure model
sm = StructureModel()


<p>Next, we can specify the relationships between features:</p>
<img src="../saves/png/FAITH-MD_Evaluation_infrastructure.png" />

In [7]:

# We can add these relationships into our structure model:
sm.add_edges_from([
    ('pas', 'sg'),
    ('adm', 'al'),
    ('al', 'adm'),
    ('sg', 'al'),
    ('sg', 'cp'),
    ('hdm', 'sg'),
    ('rd', 'hdm'),
    ('cp', 'rd'),
    ('rd', 'aa'),
    ('rd', 'adm'),
    ('aa', 'adm')
])


<h3>Visualising the Structure</h3>
<p>We can examine a StructureModel by looking at the output of <code class="docutils literal notranslate"><span class="pre">sm.edges</span></code></p>

In [8]:

sm.edges

OutEdgeView([('pas', 'sg'), ('sg', 'al'), ('sg', 'cp'), ('adm', 'al'), ('al', 'adm'), ('cp', 'rd'), ('hdm', 'sg'), ('rd', 'hdm'), ('rd', 'aa'), ('rd', 'adm'), ('aa', 'adm')])


<p>but it can often be more intuitive to visualise it. CausalNex provides a plotting module that allows us to do this.</p>

In [9]:

file_path = '../saves/html/01_simple_plot.html'
viz = get_structure_plot(sm, file_path)
viz.show(file_path)

../saves/html/01_simple_plot.html
