In [1]:
import warnings
import numpy as np
import pandas as pd
# 
from causalnex.structure import StructureModel

In [2]:
sm = StructureModel()
sm.add_edges_from([
    ('health', 'absences'),
    ('health', 'G1')
])
sm = StructureModel()
sm.add_edges_from([
    ('health', 'absences'),
    ('health', 'G1')
])
str(sm)

'StructureModel with 3 nodes and 2 edges'

In [3]:
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE
viz = plot_structure(
    sm,
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)
str(viz)

'{\n    "Nodes": [\n        "health",\n        "absences",\n        "G1"\n    ],\n    "Edges": [\n        {\n            "color": "#ffffffaa",\n            "arrows": {\n                "to": {\n                    "enabled": true,\n                    "scaleFactor": 0.5\n                }\n            },\n            "endPointOffset": {\n                "from": 1,\n                "to": -1\n            },\n            "width": 2,\n            "length": 15,\n            "from": "health",\n            "to": "absences"\n        },\n        {\n            "color": "#ffffffaa",\n            "arrows": {\n                "to": {\n                    "enabled": true,\n                    "scaleFactor": 0.5\n                }\n            },\n            "endPointOffset": {\n                "from": 1,\n                "to": -1\n            },\n            "width": 2,\n            "length": 15,\n            "from": "health",\n            "to": "G1"\n        }\n    ],\n    "Height": "600px",\n   

In [4]:
viz.show("01_simple_plot.html")

01_simple_plot.html


In [4]:
import pandas as pd
data = pd.read_csv('student-por.csv', delimiter=';')
drop_col = ['school', 'sex', 'age', 'Mjob', 'Fjob', 'reason', 'guardian']
data = data.drop(columns=drop_col)
data.head

<bound method NDFrame.head of     address famsize Pstatus  Medu  Fedu  traveltime  studytime  failures  \
0         U     GT3       A     4     4           2          2         0   
1         U     GT3       T     1     1           1          2         0   
2         U     LE3       T     1     1           1          2         0   
3         U     GT3       T     4     2           1          3         0   
4         U     GT3       T     3     3           1          2         0   
..      ...     ...     ...   ...   ...         ...        ...       ...   
644       R     GT3       T     2     3           1          3         1   
645       U     LE3       T     3     1           1          2         0   
646       U     GT3       T     1     1           2          2         0   
647       U     LE3       T     3     1           2          1         0   
648       R     LE3       T     3     2           3          1         0   

    schoolsup famsup  ... famrel freetime goout Dalc Walc

In [7]:
import numpy as np
struct_data = data.copy()
non_numeric_columns = list(struct_data.select_dtypes(exclude=[np.number]).columns)
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in non_numeric_columns:
    struct_data[col] = le.fit_transform(struct_data[col])

struct_data.head

<bound method NDFrame.head of      address  famsize  Pstatus  Medu  Fedu  traveltime  studytime  failures  \
0          1        0        0     4     4           2          2         0   
1          1        0        1     1     1           1          2         0   
2          1        1        1     1     1           1          2         0   
3          1        0        1     4     2           1          3         0   
4          1        0        1     3     3           1          2         0   
..       ...      ...      ...   ...   ...         ...        ...       ...   
644        0        0        1     2     3           1          3         1   
645        1        1        1     3     1           1          2         0   
646        1        0        1     1     1           2          2         0   
647        1        1        1     3     1           2          1         0   
648        0        1        1     3     2           3          1         0   

     schoolsup  famsu

In [9]:
from causalnex.structure.notears import from_pandas
from pyvis.network import Network

sm = from_pandas(struct_data)  # tabu_edges to avoid erroneous relationships manually
sm.remove_edges_below_threshold(0.8) # From 650 to 14 edges

from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE
viz = plot_structure(
    sm,
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)
viz.show("01_thresholded.html")

01_thresholded.html


In [10]:
sm = from_pandas(struct_data, tabu_edges=[("higher", "Medu")], w_threshold=0.8)

viz = plot_structure(
    sm,
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)
viz.show("01_edge_added.html")

01_edge_added.html


### Modifying the Structure
To correct erroneous relationships, we can incorporate domain knowledge into the model after structure learning. We can modify the structure model through adding and deleting the edges. For example, we can add and remove edges as:

In [11]:
sm.add_edge("failures", "G1")
sm.remove_edge("Pstatus", "G1")
sm.remove_edge("address", "G1")

In [13]:
viz = plot_structure(
    sm,
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)
viz.show("01_modified_structure.html")

01_modified_structure.html


In [15]:
sm = sm.get_largest_subgraph()
viz = plot_structure(
    sm,
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)
viz.show("01_largest_subgraph.html")

01_largest_subgraph.html


In [21]:
import networkx as nx

nx.drawing.nx_pydot.write_dot(sm, 'graph.dot')

### Fitting the Conditional Distribution of the Bayesian Network

In [37]:
from causalnex.network import BayesianNetwork

bn = BayesianNetwork(sm)

discretised_data = data.copy()

data_vals = {col: data[col].unique() for col in data.columns}

failures_map = {v: 'no-failure' if v == [0]
                else 'have-failure' for v in data_vals['failures']}
studytime_map = {v: 'short-studytime' if v in [1,2]
                 else 'long-studytime' for v in data_vals['studytime']}

discretised_data["failures"] = discretised_data["failures"].map(failures_map)
discretised_data["studytime"] = discretised_data["studytime"].map(studytime_map)

In [38]:
discretised_data.head

<bound method NDFrame.head of     address famsize Pstatus  Medu  Fedu  traveltime        studytime  \
0         U     GT3       A     4     4           2  short-studytime   
1         U     GT3       T     1     1           1  short-studytime   
2         U     LE3       T     1     1           1  short-studytime   
3         U     GT3       T     4     2           1   long-studytime   
4         U     GT3       T     3     3           1  short-studytime   
..      ...     ...     ...   ...   ...         ...              ...   
644       R     GT3       T     2     3           1   long-studytime   
645       U     LE3       T     3     1           1  short-studytime   
646       U     GT3       T     1     1           2  short-studytime   
647       U     LE3       T     3     1           2  short-studytime   
648       R     LE3       T     3     2           3  short-studytime   

         failures schoolsup famsup  ... famrel freetime goout Dalc Walc  \
0      no-failure       yes   

In [39]:
from causalnex.discretiser import Discretiser

discretised_data["absences"] = Discretiser(method="fixed",
                          numeric_split_points=[1, 10]).transform(discretised_data["absences"].values)
discretised_data["G1"] = Discretiser(method="fixed",
                          numeric_split_points=[10]).transform(discretised_data["G1"].values)
discretised_data["G2"] = Discretiser(method="fixed",
                          numeric_split_points=[10]).transform(discretised_data["G2"].values)
discretised_data["G3"] = Discretiser(method="fixed",
                          numeric_split_points=[10]).transform(discretised_data["G3"].values)

In [40]:
absences_map = {0: "No-absence", 1: "Low-absence", 2: "High-absence"}

G1_map = {0: "Fail", 1: "Pass"}
G2_map = {0: "Fail", 1: "Pass"}
G3_map = {0: "Fail", 1: "Pass"}

discretised_data["absences"] = discretised_data["absences"].map(absences_map)
discretised_data["G1"] = discretised_data["G1"].map(G1_map)
discretised_data["G2"] = discretised_data["G2"].map(G2_map)
discretised_data["G3"] = discretised_data["G3"].map(G3_map)

discretised_data.head

<bound method NDFrame.head of     address famsize Pstatus  Medu  Fedu  traveltime        studytime  \
0         U     GT3       A     4     4           2  short-studytime   
1         U     GT3       T     1     1           1  short-studytime   
2         U     LE3       T     1     1           1  short-studytime   
3         U     GT3       T     4     2           1   long-studytime   
4         U     GT3       T     3     3           1  short-studytime   
..      ...     ...     ...   ...   ...         ...              ...   
644       R     GT3       T     2     3           1   long-studytime   
645       U     LE3       T     3     1           1  short-studytime   
646       U     GT3       T     1     1           2  short-studytime   
647       U     LE3       T     3     1           2  short-studytime   
648       R     LE3       T     3     2           3  short-studytime   

         failures schoolsup famsup  ... famrel freetime goout Dalc Walc  \
0      no-failure       yes   

In [41]:
# Split 90% train and 10% test
from sklearn.model_selection import train_test_split
train, test = train_test_split(discretised_data, train_size=0.9, test_size=0.1, random_state=7)
bn = bn.fit_node_states(discretised_data)
bn = bn.fit_cpds(train, method="BayesianEstimator", bayes_prior="K2")
bn.cpds["G1"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(self._node_states[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(self._node_states[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].map(self._node_states[col])
A value is trying to be set on a copy of a slice from a DataFrame.


failures,have-failure,have-failure,have-failure,have-failure,have-failure,have-failure,have-failure,have-failure,no-failure,no-failure,no-failure,no-failure,no-failure,no-failure,no-failure,no-failure
higher,no,no,no,no,yes,yes,yes,yes,no,no,no,no,yes,yes,yes,yes
schoolsup,no,no,yes,yes,no,no,yes,yes,no,no,yes,yes,no,no,yes,yes
studytime,long-studytime,short-studytime,long-studytime,short-studytime,long-studytime,short-studytime,long-studytime,short-studytime,long-studytime,short-studytime,long-studytime,short-studytime,long-studytime,short-studytime,long-studytime,short-studytime
G1,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4
Fail,0.75,0.806452,0.5,0.75,0.5,0.612245,0.5,0.75,0.5,0.612903,0.5,0.5,0.032967,0.15016,0.111111,0.255814
Pass,0.25,0.193548,0.5,0.25,0.5,0.387755,0.5,0.25,0.5,0.387097,0.5,0.5,0.967033,0.84984,0.888889,0.744186


In [43]:
discretised_data.loc[18, discretised_data.columns != 'G1']

address                     U
famsize                   GT3
Pstatus                     T
Medu                        3
Fedu                        2
traveltime                  1
studytime     short-studytime
failures         have-failure
schoolsup                  no
famsup                    yes
paid                      yes
activities                yes
nursery                   yes
higher                    yes
internet                  yes
romantic                   no
famrel                      5
freetime                    5
goout                       5
Dalc                        2
Walc                        4
health                      5
absences          Low-absence
G2                       Fail
G3                       Fail
Name: 18, dtype: object

In [44]:
predictions = bn.predict(discretised_data, "G1")
print(f"The prediction is '{predictions.loc[18, 'G1_prediction']}'")

The prediction is 'Fail'


In [45]:
print(f"The ground truth is '{discretised_data.loc[18, 'G1']}'")

The ground truth is 'Fail'


### Model Quality

In [46]:

from causalnex.evaluation import classification_report

classification_report(bn, test, "G1")

{'G1_Fail': {'precision': 0.7777777777777778,
  'recall': 0.5833333333333334,
  'f1-score': 0.6666666666666666,
  'support': 12.0},
 'G1_Pass': {'precision': 0.9107142857142857,
  'recall': 0.9622641509433962,
  'f1-score': 0.9357798165137615,
  'support': 53.0},
 'accuracy': 0.8923076923076924,
 'macro avg': {'precision': 0.8442460317460317,
  'recall': 0.7727987421383649,
  'f1-score': 0.8012232415902141,
  'support': 65.0},
 'weighted avg': {'precision': 0.8861721611721611,
  'recall': 0.8923076923076924,
  'f1-score': 0.8860973888496825,
  'support': 65.0}}

In [47]:
from causalnex.evaluation import roc_auc
roc, auc = roc_auc(bn, test, "G1")
print(auc)

0.9181065088757396
