In [1]:
import os
import sys

import pandas
import floweaver
from ipysankeywidget import SankeyWidget

from variable_labels import VAR_LABELS

# Which analysis to visualise: either "emo" or "edu".
ANALYSIS_FOCUS = "edu"

# Partition the flows by their origin (True) or by their outcome (False).
ORDER_BY_ORIGIN = True

# Inputs and outputs live in a focus-specific folder underneath the
# directory the notebook is run from.
DIR = os.getcwd()
OUTDIR = os.path.join(DIR, "output_" + ANALYSIS_FOCUS)

# Colour palette: maps each variable name to the hex colour used for it.
# Entries are added group by group (by variable-name prefix); insertion
# order is the same as reading the groups top to bottom.
palette = {}

# Variables prefixed "S1_", "AirPollution_" and "S2_".
palette.update({
    "S1_Parenting_Beliefs": "#e9b96e",                 # brown
    "S1_Parent_GeneralHealth": "#ad7fa8",              # purple
    "S1_ParentMentalHealth": "#c4a000",                # yellow
    "S1_LocusControl": "#f2c600",                      # yellow
    "S1_ParentSelfEsteem": "#edd400",                  # yellow
    "S1_ParentLifeSatisfaction": "#fce94f",            # yellow
    "S1_ParentRelationshipQuality": "#8f5902",         # brown
    "S1_Fam_FatherPresent": "#c17d11",                 # brown
    "S1_Health_SmokedThroughoutPregnancy": "#5c3566",  # purple
    "S1_Health_CurrentSmoking": "#75507b",             # purple
    "S1_Parent_AlcoholPregnant": "#204a87",            # blue
    "S1_Parent_AlcoholCurrent": "#3465a4",             # blue
    "S1_Fam_NSiblings": "#ce5c00",                     # orange
    "S1_SES_Income_TotalHousehold": "#a40000",         # red
    "S1_SES_McClementsPovertyLine": "#fcaf3e",         # orange
    "S1_SES_AverageLastJob": "#cc0000",                # red
    "S1_SES_EducationAvg": "#ef2929",                  # red
    "S1_Parent_Neighbourhood": "#f57900",              # orange
    "AirPollution_Sulphur": "#2e3436",                 # grey
    "AirPollution_Particularate": "#555753",           # grey
    "AirPollution_Nitrogen": "#888a85",                # grey
    "AirPollution_CarbonMonoxide": "#babdb6",          # grey
    "S1_EarlyDevelopment": "#4e9a06",                  # green
    "S1_Health_DaysPreTerm": "#8ae234",                # green
    "S2_Sex": "#ff69b4",                               # pink
})

# Variables prefixed "S5_".
palette.update({
    "S5_Teach_Bullied": "#c4a000",          # yellow
    "S5_ChildQ_SportsGames": "#75507b",     # purple
    "S5_Teach_AA_PE": "#5c3566",            # purple
    "S5_Cog_VS_Raw": "#204a87",             # blue
    "S5_Cog_SWM_TotalErrors48": "#3465a4",  # blue
    "S5_Teach_STEM": "#4e9a06",             # green
    "S5_Teach_English": "#4e9a06",          # green
    "S5_Teach_creative": "#73d216",         # green
    "S5_Teach_school": "#8ae234",           # green
    "S5_Teach_BullyOthers": "#e9b96e",      # brown
    "S5_ChildQ_Feelings_Happy": "#d3d7cf",  # grey
    "S5_ChildQ_Feelings_Laugh": "#eeeeec",  # grey
    "S5_ChildQ_Feelings_Angry": "#babdb6",  # grey
    "S5_ChildQ_Feelings_Worry": "#888a85",  # grey
    "S5_ChildQ_Feelings_Sad": "#555753",    # grey
    "S5_ChildQ_Feelings_Afraid": "#2e3436", # grey
    "S5_Parent_SDQ_Emotion": "#a40000",     # red
    "S5_Parent_SDQ_Conduct": "#f57900",     # orange
    "S5_Parent_SDQ_Hyper": "#ce5c00",       # orange
    "S5_Parent_SDQ_Peer": "#cc0000",        # red
    "S5_Parent_SDQ_Prosocial": "#ef2929",   # red
})

# Variables prefixed "S6_".
palette.update({
    "S6_RiskyBehaviours_Cig": "#8f5902",      # brown
    "S6_RiskyBehaviours_Alcohol": "#8f5902",  # brown
    "S6_Cog_WordScore": "#204a87",            # blue
    "S6_IllegalBehav": "#c17d11",             # brown
    "S6_BullyPeople": "#e9b96e",              # brown
    "S6_ChildQ_Truancy_Ever": "#c4a000",      # yellow
    "S6_ChildQ_SelfHarm": "#babdb6",          # grey
    "S6_Wellbeing": "#888a85",                # grey
    "S6_SelfEsteem": "#555753",               # grey
    "S6_Feelings": "#2e3436",                 # grey
    "S6_NegFeelings": "#2e3436",              # grey
    "S6_Parent_SDQ_Emotion": "#a40000",       # red
    "S6_Parent_SDQ_Conduct": "#f57900",       # orange
    "S6_Parent_SDQ_Hyper": "#ce5c00",         # orange
    "S6_Parent_SDQ_Peer": "#cc0000",          # red
    "S6_Parent_SDQ_Prosocial": "#ef2929",     # red
})

# Register every colour a second time under the variable's human-readable
# label, so the palette can be indexed by either the raw name or the label.
# The key list is snapshotted first because the dict grows inside the loop.
og_names = [name for name in palette]
for raw_name in og_names:
    label = VAR_LABELS[raw_name]
    palette[label] = palette[raw_name]


In [2]:
# Read the connection table from the output folder of this analysis.
fpath = os.path.join(OUTDIR, "connections.csv")
all_connections = pandas.read_csv(fpath)
print(all_connections.head(3))
# Rank the rows by their "value" column, largest first.
all_connections = all_connections.sort_values(by="value", ascending=False)
print(all_connections.head(3))

                 source                 target     value       sweep_2  \
0  S1_Parenting_Beliefs  S5_ChildQ_SportsGames  0.000337  S2_Parenting   
1  S1_Parenting_Beliefs         S5_Teach_AA_PE  0.000086  S2_Parenting   
2  S1_Parenting_Beliefs          S5_Cog_VS_Raw  0.000085  S2_Parenting   

                   sweep_3             sweep_4  
0  S3_Parenting_Activities  S4_Par_Involvement  
1  S3_Parenting_Activities  S4_Par_Involvement  
2  S3_Parenting_Activities  S4_Par_Involvement  
                            source               target     value  \
9821                        S2_Sex  S5_Parent_SDQ_Hyper  0.014640   
2711      S1_Health_CurrentSmoking  S5_Parent_SDQ_Hyper  0.010239   
5059  S1_SES_McClementsPovertyLine  S5_Parent_SDQ_Hyper  0.008368   

           sweep_2              sweep_3       sweep_4  
9821  S2_SDQ_Hyper  S3_Parent_SDQ_Hyper  S4_SDQ_Hyper  
2711  S2_SDQ_Hyper  S3_Parent_SDQ_Hyper  S4_SDQ_Hyper  
5059  S2_SDQ_Hyper  S3_Parent_SDQ_Hyper  S4_SDQ_Hyper  


In [3]:
# Build, weave and export a Sankey diagram of the top connections.
#
# Ideally this would loop over several connection counts, e.g.
#     for n_conn in [100, 1000, 10000]:
# but the widget only works interactively, so the number of connections
# has to be changed by hand below and the cell re-run.
n_conn = 10

# Select only the first n rows of the connection table (the highest
# values, given that the table was sorted from highest to lowest).
connections = all_connections.head(n_conn)


def _sweep_sort_key(name):
    """Sort key that orders "sweep_<n>" names by the integer <n>.

    A plain string sort would put "sweep_10" before "sweep_2". Names whose
    suffix is not a plain number are placed after the numbered ones, in
    alphabetical order.
    """
    suffix = name[len("sweep_"):]
    if suffix.isdecimal():
        return (0, int(suffix), suffix)
    return (1, 0, suffix)


# Create partitions for the second until the penultimate sweep: one per
# "sweep_*" column, split on the unique values found in that column.
header = connections.columns.values.tolist()
sweeps = {}
for key in header:
    if key.startswith("sweep_"):
        sweeps[key] = floweaver.Partition.Simple(key, connections[key].unique())

# Partitions for the first (source) and last (target) layer.
origin = floweaver.Partition.Simple("source", connections["source"].unique())
outcome = floweaver.Partition.Simple("target", connections["target"].unique())

# Create nodes for all the sweeps: process groups at both ends, waypoints
# for every intermediate sweep.
nodes = {}
nodes["sweep_1"] = floweaver.ProcessGroup(list(connections["source"]))
for key in sweeps:
    nodes[key] = floweaver.Waypoint(sweeps[key])
nodes["sweep_{}".format(len(nodes) + 1)] = floweaver.ProcessGroup(
    list(connections["target"]))

# Order the layers by sweep number (numerically, not alphabetically).
ordering = sorted(nodes, key=_sweep_sort_key)

# The end nodes get the same partitions regardless of ORDER_BY_ORIGIN;
# that setting only decides which partition the flows are split on.
nodes[ordering[0]].partition = origin
nodes[ordering[-1]].partition = outcome
flow_partition = origin if ORDER_BY_ORIGIN else outcome

# Create a single bundle from the first to the last layer that passes
# through every intermediate sweep.
bundles = [floweaver.Bundle(ordering[0], ordering[-1],
    waypoints=ordering[1:-1])]

# The ordering needs every layer wrapped in its own list.
ordering = [[layer] for layer in ordering]

# Create the Sankey definition.
sankey = floweaver.SankeyDefinition(nodes, bundles, ordering,
    flow_partition=flow_partition)

# Weave the plot together.
sankey_data = floweaver.weave(sankey, connections, palette=palette)
# Save as JSON.
#fpath = os.path.join(OUTDIR, "sankey_{}-connections.json".format(n_conn))
#sankey_data.to_json(fpath, format="widget")

# Create a widget, and save as PNG upon rendering.
# NOTE(review): this cell only assigns `w`; if the PNG is written when the
# widget renders, `w` still has to be displayed (e.g. as the last
# expression of a cell) for the file to appear - confirm.
fpath = os.path.join(OUTDIR, "sankey_{}-connections.png".format(n_conn))
size = {"width":1500, "height":800}
w = sankey_data.to_widget(**size).auto_save_png(fpath)
