# Mining Rules

This notebook prepares the files used for mining association rules and extracts the variables/tables for the rules we found in the analyses.

In [1]:
import sys
sys.path.insert(0, '../archaeology')

import csv
import pandas as pd

from db import connect, NotebookBoolAggregate, Notebook, Query
import analysis_helpers, importlib
importlib.reload(analysis_helpers)
from analysis_helpers import load_vars
from analysis_helpers import var, dbmt, group_run, DBMT

%matplotlib inline

## Query

Select notebooks that are:
- valid
  
  ```NOT (n.kernel = 'no-kernel' AND n.nbformat = '0')
  AND n.processed & 15 = 0 AND n.skip & (1024 + 512) = 0```


- non-duplicated
  
  ```n.skip & (1024 + 512 + 128) = 0```
  
- executed

  ```n.max_execution_count > -0```
  
- unambiguous

  ```n.processing_cells = 0 AND n.unambiguous = 1```
  
- python notebooks

  ```n.language = 'python'
  AND LEFT(n.language_version, 1) IN ('2', '3')```

In [2]:
# Load the variables stored by the analysis helpers and read the star/fork
# threshold that the query below uses to compute the "popular" flag.
vs = load_vars()
threshold = float(vs['s_a0_starforks'])

In [3]:
%%time
# Load two per-notebook aggregate tables into DataFrames:
#   * boolagg  - notebooks joined with notebooks_bool_aggregates, plus the
#                derived select_* flags, the "popular" flag and n.starforks;
#   * fullexec - notebooks joined with notebooks_fullexecbool_aggregates, plus
#                the two select_full*_attempt_to_run_all_cells flags.
# Both queries share the same WHERE clause (the filters listed in the "Query"
# section above, plus an additional `n.skip & 2048 = 0` condition).
# Note: the `n.skip & (1024 + 512) = 0` condition is implied by the stricter
# `n.skip & (1024 + 512 + 128) = 0` condition that follows it.
with connect() as session:
    print("Query Bool")
    # `threshold` is passed as a bound parameter (%(threshold)s), not
    # interpolated into the SQL text.
    boolagg = pd.read_sql("""
        SELECT
            (n.processed & 16 = 0) as select_valid_syntax,
            (n.language = 'python' AND LEFT(n.language_version, 1) IN ('2', '3')) as select_python,
            (n.max_execution_count > -0) as select_executed,
            (n.processing_cells = 0 AND n.unambiguous = 1) as select_unambiguous,
            (NOT m.original_exe_skipped_dependency_installation) as select_original_exe_attempt_installation,
            (NOT m.original_exe_failed_to_install_dependencies
             OR m.original_exe_skipped_dependency_installation) as select_original_exe_attempt_to_run_all_cells,
            ((NOT m.execorder_failed_to_install_dependencies
             OR m.execorder_skipped_dependency_installation)
             AND NOT EXISTS (
              SELECT 1
              FROM repositories r
              WHERE n.repository_id = r.id
              AND setups_count + requirements_count + pipfiles_count > 0
             )) as select_execorder_attempt_to_run_all_cells,
            ((NOT m.topdown_failed_to_install_dependencies
             OR m.topdown_skipped_dependency_installation)
             AND NOT EXISTS (
              SELECT 1
              FROM repositories r
              WHERE n.repository_id = r.id
              AND setups_count + requirements_count + pipfiles_count > 0
             )) as select_topdown_attempt_to_run_all_cells,
             (n.starforks >= %(threshold)s) as popular,
            m.*, n.starforks
        FROM notebooks n, notebooks_bool_aggregates m
        WHERE NOT (n.kernel = 'no-kernel' AND n.nbformat = '0')
        AND n.processed & 15 = 0
        AND n.skip & 2048 = 0
        AND n.skip & (1024 + 512) = 0
        AND n.skip & (1024 + 512 + 128) = 0
        AND n.max_execution_count > -0
        AND n.processing_cells = 0
        AND n.unambiguous = 1
        AND n.language = 'python'
        AND LEFT(n.language_version, 1) IN ('2', '3')
        AND m.notebook_id = n.id
    """, session.connection(), params={"threshold": threshold})
    
    print("Query Full Bool")
    # Same notebook selection, joined with the "full" execution aggregates.
    fullexec = pd.read_sql("""
        SELECT 
            (NOT m.fullexecorder_failed_to_install_dependencies
             OR m.fullexecorder_skipped_dependency_installation
            ) as select_fullexecorder_attempt_to_run_all_cells,
            (NOT m.fulltopdown_failed_to_install_dependencies
             OR m.fulltopdown_skipped_dependency_installation
            ) as select_fulltopdown_attempt_to_run_all_cells,
            m.*
        FROM notebooks n, notebooks_fullexecbool_aggregates m
        WHERE NOT (n.kernel = 'no-kernel' AND n.nbformat = '0')
        AND n.processed & 15 = 0
        AND n.skip & 2048 = 0
        AND n.skip & (1024 + 512) = 0
        AND n.skip & (1024 + 512 + 128) = 0
        AND n.max_execution_count > -0
        AND n.processing_cells = 0
        AND n.unambiguous = 1
        AND n.language = 'python'
        AND LEFT(n.language_version, 1) IN ('2', '3')
        AND m.notebook_id = n.id
    """, session.connection())



Query Bool
Query Full Bool
CPU times: user 1min 14s, sys: 3.13 s, total: 1min 17s
Wall time: 1min 45s


In [4]:
# Combine both aggregate frames on their "id" column. Column names that occur
# in both frames get the suffix "s_" on the boolagg (left) side; a later cell
# drops those suffixed columns.
# NOTE(review): "id" comes from m.* in each query, i.e. from two different
# aggregate tables - confirm that those ids are meant to line up.
notebooks = boolagg.set_index("id").join(fullexec.set_index("id"), lsuffix="s_").reset_index()

In [5]:
%%time
def dependency_files(row):
    """Collapse the dependency flags of ``row`` into a single label.

    The flags are checked in priority order and the label of the first truthy
    one is returned: "multiple", then "single", then "0". Returns None when
    none of the three flags is truthy.
    """
    labels_by_flag = (
        ("multiple_dependency_files", "multiple"),
        ("single_dependency_file", "single"),
        ("no_dependencies", "0"),
    )
    for flag, label in labels_by_flag:
        if row[flag]:
            return label
    return None
# Store the combined label in a single "dependency_files" column (computed row
# by row), then remove the three flag columns it was derived from.
notebooks.loc[:, "dependency_files"] = notebooks.apply(dependency_files, axis=1)
notebooks.drop(columns=[
    "multiple_dependency_files", "single_dependency_file", "no_dependencies",
], inplace=True)


CPU times: user 40.9 s, sys: 2.6 s, total: 43.5 s
Wall time: 41.4 s


In [6]:
# The "title_invalid" flag is not used as a mining feature; remove it in place.
notebooks.drop(columns=["title_invalid"], inplace=True)

In [7]:
def _flag_as_text(value):
    """Encode a flag as the string "1" (truthy) or "0" (falsy); None stays None."""
    if value is None:
        return None
    return "1" if value else "0"


# (new column, source flag column): every source flag is re-encoded as a
# "1"/"0"/None text column under a shorter name.
_CELL_FLAG_COLUMNS = [
    ("non_executed_cells", "has_non_executed_cells"),
    ("execution_count", "has_execution_count"),
    ("empty_cells", "has_empty_cells"),
    ("empty_cells_middle", "has_empty_cells_middle"),
    ("empty_cells_end", "has_empty_cells_end"),
    ("skips", "has_skips"),
    ("skips_middle", "has_skips_middle"),
    ("unordered", "is_unordered"),
]
for _target, _source in _CELL_FLAG_COLUMNS:
    notebooks.loc[:, _target] = notebooks[_source].apply(_flag_as_text)

# Remove the original flag columns together with their negated counterparts,
# so that only the re-encoded columns remain.
notebooks.drop(columns=[
    "has_non_executed_cells",
    "no_non_executed_cells",
    "has_execution_count",
    "no_execution_count",
    "has_empty_cells",
    "no_empty_cells",
    "has_empty_cells_middle",
    "no_empty_cells_middle",
    "has_empty_cells_end",
    "no_empty_cells_end",
    "has_skips",
    "no_skips",
    "has_skips_middle",
    "no_skips_middle",
    "is_unordered",
    "is_ordered",
], inplace=True)

In [8]:
# Remove every column whose name ends with "s_", the suffix that the join of
# the two aggregate frames puts on the left-hand copy of a duplicated column.
notebooks.drop(
    columns=[name for name in notebooks.columns if name.endswith("s_")],
    inplace=True,
)

In [9]:
def _output_flag_as_text(value):
    """Encode a flag as the string "1" (truthy) or "0" (falsy); None stays None."""
    if value is None:
        return None
    if value:
        return "1"
    return "0"


# Re-encode "outputs_any" as the text column "outputs", then remove the
# original flag and its negated counterpart.
notebooks.loc[:, "outputs"] = notebooks["outputs_any"].apply(_output_flag_as_text)
notebooks.drop(columns=["outputs_any", "no_outputs"], inplace=True)

In [10]:
# Remove the per-header-level markdown flags (h1 to h6) and the
# "different than p/header" flag from the feature table.
notebooks.drop(
    columns=["has_markdown_h{}".format(level) for level in range(1, 7)]
    + ["has_markdown_different_than_p_header"],
    inplace=True,
)

In [11]:
def _present_as_text(value):
    """Return "1" for a truthy flag, "0" for a falsy one, None for None."""
    if value is None:
        return None
    return "1" if value else "0"


def _absent_as_text(value):
    """Inverse encoding: "0" for a truthy flag, "1" for a falsy one, None for None."""
    if value is None:
        return None
    return "0" if value else "1"


# "controlflow" and "imports" mirror their defines_* flags; "definitions" is
# derived from the negated flag "no_definitions", hence the inverse encoding.
notebooks.loc[:, "controlflow"] = notebooks["defines_controlflow"].apply(_present_as_text)
notebooks.loc[:, "imports"] = notebooks["defines_imports"].apply(_present_as_text)
notebooks.loc[:, "definitions"] = notebooks["no_definitions"].apply(_absent_as_text)

# Remove the grouped ast_* flags and the defines_*/no_* flags.
notebooks.drop(
    columns=[
        "ast_variable",
        "ast_definition",
        "ast_module_import",
        "ast_data_structures",
        "ast_comprehension",
        "ast_loop",
        "ast_condition",
        "ast_exception",
        "defines_controlflow",
        "defines_imports",
        "defines_classes",
        "defines_functions",
        "no_controlflow",
        "no_definitions",
        "no_imports",
    ],
    inplace=True,
)

In [12]:
# Remove the installation-diagnostic columns of the four re-execution modes
# (each mode has the same 16 column suffixes), plus the read-error flag of the
# original execution. The last line lists the columns that remain.
notebooks.drop(columns=['original_exe_notebook_read_error'] + [
    mode + '_' + detail
    for mode in ('topdown', 'execorder', 'fulltopdown', 'fullexecorder')
    for detail in (
        'installed_dependencies',
        'failed_to_install_dependencies',
        'failed_setup_py',
        'failed_requirements_txt',
        'failed_pipfile',
        'install_failed_due_to_missing',
        'install_failed_due_to_malformed',
        'install_failed_due_to_python_dependency',
        'install_failed_due_to_external_dependency',
        'install_failed_due_to_system',
        'install_failed_due_to_python_version',
        'install_failed_due_to_access_error',
        'install_failed_due_to_unknown',
        'install_failed_combined_missing',
        'did_not_found_install_failure_reason',
        'notebook_read_error',
    )
], inplace=True)
notebooks.columns.tolist()

['id',
 'select_valid_syntax',
 'select_python',
 'select_executed',
 'select_unambiguous',
 'select_original_exe_attempt_installation',
 'select_original_exe_attempt_to_run_all_cells',
 'select_execorder_attempt_to_run_all_cells',
 'select_topdown_attempt_to_run_all_cells',
 'popular',
 'has_requirements',
 'has_setup',
 'has_pipfile',
 'declares_dependencies',
 'repo_notebooks_q1',
 'repo_notebooks_q2',
 'repo_notebooks_q3',
 'repo_notebooks_q4',
 'title_untitled',
 'title_copy',
 'title_invalid_char',
 'title_valid',
 'invalid_format',
 'python_2',
 'python_3',
 'python_34',
 'python_35',
 'python_36',
 'python_37',
 'other_language',
 'note_max_exec_count_q1',
 'note_max_exec_count_q2',
 'note_max_exec_count_q3',
 'note_max_exec_count_q4',
 'note_total_cells_q1',
 'note_total_cells_q2',
 'note_total_cells_q3',
 'note_total_cells_q4',
 'note_code_cells_q1',
 'note_code_cells_q2',
 'note_code_cells_q3',
 'note_code_cells_q4',
 'note_markdown_cells_q1',
 'note_markdown_cells_q2',
 'no

In [13]:
# Remove the two "unknown installation failure" columns of the original
# execution from the feature table.
notebooks.drop(
    labels=[
        "original_exe_install_failed_due_to_unknown",
        "original_exe_did_not_found_install_failure_reason",
    ],
    axis=1,
    inplace=True,
)

In [14]:
# Show the prepared feature table (the rendering below is truncated by pandas).
notebooks

Unnamed: 0,id,select_valid_syntax,select_python,select_executed,select_unambiguous,select_original_exe_attempt_installation,select_original_exe_attempt_to_run_all_cells,select_execorder_attempt_to_run_all_cells,select_topdown_attempt_to_run_all_cells,popular,...,empty_cells,empty_cells_middle,empty_cells_end,skips,skips_middle,unordered,outputs,controlflow,imports,definitions
0,762111,True,True,True,True,False,True,True,True,False,...,0,0,0,1,0,0,1,1,1,1
1,762118,True,True,True,True,False,True,True,True,False,...,1,0,1,1,1,1,1,1,1,0
2,910065,True,True,True,True,True,False,False,False,False,...,1,0,1,1,1,1,1,1,1,0
3,525834,True,True,True,True,False,True,True,True,False,...,1,0,1,1,1,0,1,1,1,1
4,824668,True,True,True,True,False,True,True,True,False,...,0,0,0,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
753400,21040,True,True,True,True,False,True,,,False,...,1,1,1,1,1,1,1,1,1,1
753401,21041,True,True,True,True,False,True,,,False,...,1,1,1,1,1,1,1,1,1,1
753402,21042,True,True,True,True,False,True,,,False,...,1,0,1,1,1,1,1,1,1,1
753403,21048,True,True,True,True,False,True,,,False,...,1,1,1,1,1,1,1,1,1,1


In [15]:
# Column-name prefixes of the five execution modes; one feature file is
# exported per prefix.
prefixes = {
    mode + "_"
    for mode in (
        "original_exe",
        "topdown",
        "execorder",
        "fulltopdown",
        "fullexecorder",
    )
}

In [16]:
%%time

# Identifier/bookkeeping columns that are never exported as mining features.
non_feature_columns = ('id', 'repository_id', 'notebook_id', 'skip', 'starforks')

# Write one CSV of mining features per execution mode.
for prefix in prefixes:
    # Drop every column that belongs to one of the *other* modes, whether the
    # mode prefix starts the column name or appears after an underscore
    # (e.g. "select_<mode>_...").
    other = prefixes - {prefix}
    drop_columns = [
        column
        for column in notebooks.columns.tolist()
        for nprefix in other
        if "_" + nprefix in column
        or column.startswith(nprefix)
    ]
    # Keep only the notebooks whose execution was attempted in this mode.
    current = notebooks[notebooks[prefix + "attempted_execution"].fillna(False)].drop(columns=drop_columns)
    # Normalize the mode prefix to "exe_" so all exported files share a schema.
    current.columns = [
        column.replace(prefix, "exe_")
        for column in current.columns.tolist()
    ]
    # Header and rows are built from the same column list. The header used to
    # be sliced by position (columns[4:]) while the rows were filtered by
    # name, which put the header out of step with the data.
    feature_columns = [
        column
        for column in current.columns.tolist()
        if column not in non_feature_columns
    ]
    print(prefix)
    # newline="" lets the csv module control line endings itself.
    with open("data/" + prefix + "notebook_features.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(feature_columns)
        for i, row in current.iterrows():
            # Coarse progress report, keyed on the row label.
            if i % 200000 == 0:
                print(i)
            # Booleans are written as "1"/"0"; every other value is written as is.
            result = [
                '1' if row[c] is True else ('0' if row[c] is False else row[c])
                for c in feature_columns
            ]
            writer.writerow(result)

fullexecorder_
0
400000
600000
fulltopdown_
0
400000
600000
execorder_
0
200000
400000
600000
topdown_
0
200000
400000
600000
original_exe_
0
200000
400000
600000
CPU times: user 1h 37min 24s, sys: 7.53 s, total: 1h 37min 31s
Wall time: 1h 37min 26s


After the analyses, we found the following rules:

In [17]:
def sup(x, base=notebooks):
    """Return the support of ``x``: the fraction of rows of ``base`` it selects.

    Parameters:
        x: row selector applied as ``base[x]`` (the callers in this notebook
            pass boolean Series).
        base: DataFrame the support is measured against.

    Returns 0 when ``base`` has no rows, matching how ``conf`` and ``lif``
    report degenerate cases, instead of raising ZeroDivisionError.
    """
    total = len(base)
    if total == 0:
        return 0
    return len(base[x]) / total

def conf(AaB, A, base):
    """Return the confidence of the rule A -> B within ``base``.

    Computed as support(A and B) / support(A); returns 0 when the antecedent
    has no support.
    """
    antecedent_support = sup(A, base=base)
    return 0 if antecedent_support == 0 else sup(AaB, base=base) / antecedent_support
    
def lif(AaB, A, B, base):
    """Return the lift of the rule A -> B within ``base``.

    Computed as support(A and B) / (support(A) * support(B)); returns 0 when
    either side of the rule has no support.
    """
    antecedent_support = sup(A, base=base)
    consequent_support = sup(B, base=base)
    if antecedent_support == 0 or consequent_support == 0:
        return 0
    joint_support = sup(AaB, base=base)
    return joint_support / (antecedent_support * consequent_support)
    
def mining_row(mode, antecedent, consequent, base, A, B, show=True, prefix="d_"):
    """Compute support, confidence and lift of one rule and register them via ``var``.

    Parameters:
        mode: name used in the variable names and, prefixed with a backslash,
            as the first cell of the returned row.
        antecedent, consequent: display labels of both sides of the rule.
        base: DataFrame the measures are computed on.
        A, B: selectors for the antecedent and the consequent; they are
            combined with ``&``.
        show: when true, print the variable prefix and display the row.
        prefix: prepended to every variable name passed to ``var``.

    Returns the list [mode, antecedent, consequent, support, confidence, lift],
    where the last three entries are whatever ``var`` returned.
    """
    def slug(label):
        # Lower-case the label and make it usable inside a variable name.
        text = label.lower()
        for old, new in ((" ", "_"), ("\\", ""), ("{", "_"), ("}", "_")):
            text = text.replace(old, new)
        return text

    AaB = A & B
    rule_name = "{}_{}".format(slug(antecedent), slug(consequent))
    mvar = "a8_rule_{}_".format(mode)
    svar = rule_name + "_support"
    cvar = rule_name + "_confidence"
    lvar = rule_name + "_lift"

    support = var(prefix + mvar + svar, sup(AaB, base=base), "{:.2%}")
    confidence = var(prefix + mvar + cvar, conf(AaB, A, base=base), "{:.2%}")
    lift_value = lif(AaB, A, B, base=base)
    lift = var(prefix + mvar + lvar, lift_value, "{:.2f}")

    # Also record how far the lift is from 1, always as a non-negative amount.
    distance_from_one = lift_value - 1.0 if lift_value >= 1 else 1.0 - lift_value
    var(prefix + "timesone_" + mvar + lvar, distance_from_one, "{:.0%}")

    result = ["\\" + mode, antecedent, consequent, support, confidence, lift]
    if show:
        print(prefix + mvar)
        display(pd.DataFrame([result], columns=["Mode", "Antecedent", "Consequent", svar, cvar, lvar]))
    return result
    
    

In [18]:
def execorder_attempted_execution(notebooks, prefix):
    """Return the "timeout" rule rows for the execution-order mode.

    Only notebooks whose execution-order run was attempted are considered.
    One row is produced per antecedent, in the order unittests, raise, while.
    """
    base = notebooks[notebooks["execorder_attempted_execution"].fillna(False)]
    antecedents = (
        ("unittests", "has_unittests"),
        ("raise", "ast_raise"),
        ("while", "ast_while"),
    )
    return [
        mining_row(
            "tabisocounter", label, "timeout", base,
            base[column].fillna(False),
            base["execorder_timeout"].fillna(False),
            prefix=prefix
        )
        for label, column in antecedents
    ]
# Run once per group supplied by dbmt/DBMT and keep the per-group row lists.
rows = group_run(dbmt(notebooks), DBMT, execorder_attempted_execution, plot=False);

d_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,unittests_timeout_support,unittests_timeout_confidence,unittests_timeout_lift
0,\tabisocounter,unittests,timeout,0.09%,28.80%,11.56


d_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,raise_timeout_support,raise_timeout_confidence,raise_timeout_lift
0,\tabisocounter,raise,timeout,0.17%,6.79%,2.73


d_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,while_timeout_support,while_timeout_confidence,while_timeout_lift
0,\tabisocounter,while,timeout,0.32%,4.21%,1.69


sd_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,unittests_timeout_support,unittests_timeout_confidence,unittests_timeout_lift
0,\tabisocounter,unittests,timeout,0.00%,0.00%,0.0


sd_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,raise_timeout_support,raise_timeout_confidence,raise_timeout_lift
0,\tabisocounter,raise,timeout,0.00%,0.00%,0.0


sd_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,while_timeout_support,while_timeout_confidence,while_timeout_lift
0,\tabisocounter,while,timeout,0.00%,0.00%,0.0


td_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,unittests_timeout_support,unittests_timeout_confidence,unittests_timeout_lift
0,\tabisocounter,unittests,timeout,0.00%,0.00%,0.0


td_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,raise_timeout_support,raise_timeout_confidence,raise_timeout_lift
0,\tabisocounter,raise,timeout,0.07%,3.01%,1.62


td_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,while_timeout_support,while_timeout_confidence,while_timeout_lift
0,\tabisocounter,while,timeout,0.24%,3.63%,1.95


In [19]:
def topdown_attempted_execution(data, prefix):
    """Append the "timeout" rule rows for the top-down mode.

    ``data`` is a ``(notebooks, rows)`` pair; the new rows are appended to
    ``rows`` in place and nothing is returned. Only notebooks whose top-down
    run was attempted are considered.
    """
    notebooks, rows = data
    base = notebooks[notebooks["topdown_attempted_execution"].fillna(False)]
    antecedents = (
        ("unittests", "has_unittests"),
        ("raise", "ast_raise"),
        ("while", "ast_while"),
    )
    for label, column in antecedents:
        rows.append(mining_row(
            "tabisotd", label, "timeout", base,
            base[column].fillna(False),
            base["topdown_timeout"].fillna(False),
            prefix=prefix
        ))
# Pair every group with the rows collected for it by the previous cell.
group_run(zip(dbmt(notebooks), rows), DBMT, topdown_attempted_execution, plot=False);

d_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,unittests_timeout_support,unittests_timeout_confidence,unittests_timeout_lift
0,\tabisotd,unittests,timeout,0.11%,35.26%,9.81


d_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,raise_timeout_support,raise_timeout_confidence,raise_timeout_lift
0,\tabisotd,raise,timeout,0.21%,8.50%,2.37


d_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,while_timeout_support,while_timeout_confidence,while_timeout_lift
0,\tabisotd,while,timeout,0.45%,5.86%,1.63


sd_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,unittests_timeout_support,unittests_timeout_confidence,unittests_timeout_lift
0,\tabisotd,unittests,timeout,0.00%,0.00%,0.0


sd_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,raise_timeout_support,raise_timeout_confidence,raise_timeout_lift
0,\tabisotd,raise,timeout,0.00%,0.00%,0.0


sd_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,while_timeout_support,while_timeout_confidence,while_timeout_lift
0,\tabisotd,while,timeout,0.00%,0.00%,0.0


td_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,unittests_timeout_support,unittests_timeout_confidence,unittests_timeout_lift
0,\tabisotd,unittests,timeout,0.00%,0.00%,0.0


td_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,raise_timeout_support,raise_timeout_confidence,raise_timeout_lift
0,\tabisotd,raise,timeout,0.09%,3.76%,1.57


td_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,while_timeout_support,while_timeout_confidence,while_timeout_lift
0,\tabisotd,while,timeout,0.33%,5.05%,2.11


In [20]:
def timeout_rules(rows, prefix):
    """Write the timeout rules as a LaTeX table to outputs/<prefix>a8_timeout_rules.tex.

    Parameters:
        rows: rule rows with the six fields Mode, Antecedent, Consequent,
            Support, Confidence and Lift.
        prefix: prepended to the output file name.
    """
    table = pd.DataFrame(rows, columns=["Mode", "Antecedent", "Consequent", "Support", "Confidence", "Lift"])

    latex = table.to_latex(index=False, column_format="lllccc")
    # to_latex escapes the leading backslash of the mode macros; undo that.
    latex = latex.replace("\\textbackslash ", "\\")
    lines = latex.splitlines()
    # Insert a rule before the last 3 table rows (the rows appended by the
    # top-down cell), skipping the 2 lines that close the table - presumably
    # \bottomrule and \end{tabular}; confirm against the pandas version in use.
    # The backslash is escaped: '\m' is an invalid escape sequence in Python.
    lines.insert(- 2 - 3, '\\midrule')
    latex = "\n".join(lines)
    with open("outputs/{}a8_timeout_rules.tex".format(prefix), "w") as f:
        f.write(latex)

group_run(rows, DBMT, timeout_rules, plot=False);

In [21]:
def execorder_attempted_execution(notebooks, prefix):
    """Return the "NameError" rule rows for the execution-order mode.

    Only notebooks whose execution-order run was attempted are considered.
    The antecedent columns hold "1"/"0" text, so they are converted to
    booleans through int first.
    """
    base = notebooks[notebooks["execorder_attempted_execution"].fillna(False)]
    antecedents = (
        ("Skips in the middle", "skips_middle"),
        ("Skips", "skips"),
    )
    return [
        mining_row(
            "tabisocounter", label, "NameError", base,
            base[column].astype(int).astype(bool).fillna(False),
            base["execorder_exception_NameError"].fillna(False),
            prefix=prefix
        )
        for label, column in antecedents
    ]

# Run once per group supplied by dbmt/DBMT and keep the per-group row lists.
rows = group_run(dbmt(notebooks), DBMT, execorder_attempted_execution, plot=False);

d_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,skips_in_the_middle_nameerror_support,skips_in_the_middle_nameerror_confidence,skips_in_the_middle_nameerror_lift
0,\tabisocounter,Skips in the middle,NameError,13.41%,20.32%,1.39


d_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,skips_nameerror_support,skips_nameerror_confidence,skips_nameerror_lift
0,\tabisocounter,Skips,NameError,14.21%,18.50%,1.27


sd_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,skips_in_the_middle_nameerror_support,skips_in_the_middle_nameerror_confidence,skips_in_the_middle_nameerror_lift
0,\tabisocounter,Skips in the middle,NameError,16.67%,23.53%,1.41


sd_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,skips_nameerror_support,skips_nameerror_confidence,skips_nameerror_lift
0,\tabisocounter,Skips,NameError,16.67%,20.00%,1.2


td_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,skips_in_the_middle_nameerror_support,skips_in_the_middle_nameerror_confidence,skips_in_the_middle_nameerror_lift
0,\tabisocounter,Skips in the middle,NameError,6.97%,15.04%,1.87


td_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,skips_nameerror_support,skips_nameerror_confidence,skips_nameerror_lift
0,\tabisocounter,Skips,NameError,7.59%,13.78%,1.71


In [22]:
def topdown_attempted_execution(data, prefix):
    """Append the "NameError" rule rows for the top-down mode.

    ``data`` is a ``(notebooks, rows)`` pair; the new rows are appended to
    ``rows`` in place and nothing is returned. Only notebooks whose top-down
    run was attempted are considered.
    """
    notebooks, rows = data
    base = notebooks[notebooks["topdown_attempted_execution"].fillna(False)]
    antecedents = (
        ("Skips in the middle", "skips_middle"),
        ("Skips", "skips"),
    )
    for label, column in antecedents:
        rows.append(mining_row(
            "tabisotd", label, "NameError", base,
            base[column].astype(int).astype(bool).fillna(False),
            base["topdown_exception_NameError"].fillna(False),
            prefix=prefix
        ))

# Pair every group with the rows collected for it by the previous cell.
group_run(zip(dbmt(notebooks), rows), DBMT, topdown_attempted_execution, plot=False);

d_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,skips_in_the_middle_nameerror_support,skips_in_the_middle_nameerror_confidence,skips_in_the_middle_nameerror_lift
0,\tabisotd,Skips in the middle,NameError,3.05%,4.62%,1.18


d_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,skips_nameerror_support,skips_nameerror_confidence,skips_nameerror_lift
0,\tabisotd,Skips,NameError,3.56%,4.63%,1.19


sd_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,skips_in_the_middle_nameerror_support,skips_in_the_middle_nameerror_confidence,skips_in_the_middle_nameerror_lift
0,\tabisotd,Skips in the middle,NameError,0.00%,0.00%,0.0


sd_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,skips_nameerror_support,skips_nameerror_confidence,skips_nameerror_lift
0,\tabisotd,Skips,NameError,0.00%,0.00%,0.0


td_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,skips_in_the_middle_nameerror_support,skips_in_the_middle_nameerror_confidence,skips_in_the_middle_nameerror_lift
0,\tabisotd,Skips in the middle,NameError,1.77%,3.83%,1.46


td_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,skips_nameerror_support,skips_nameerror_confidence,skips_nameerror_lift
0,\tabisotd,Skips,NameError,2.18%,3.95%,1.51


In [23]:
def skip_nameerror_rules(rows, prefix):
    """Write the skip/NameError rules as a LaTeX table to outputs/<prefix>a8_skip_nameerror_rules.tex.

    Parameters:
        rows: rule rows with the six fields Mode, Antecedent, Consequent,
            Support, Confidence and Lift.
        prefix: prepended to the output file name.
    """
    table = pd.DataFrame(rows, columns=["Mode", "Antecedent", "Consequent", "Support", "Confidence", "Lift"])

    latex = table.to_latex(index=False, column_format="lllccc")
    # to_latex escapes the leading backslash of the mode macros; undo that.
    latex = latex.replace("\\textbackslash ", "\\")
    lines = latex.splitlines()
    # Insert a rule before the last 2 table rows (the rows appended by the
    # top-down cell), skipping the 2 lines that close the table - presumably
    # \bottomrule and \end{tabular}; confirm against the pandas version in use.
    # The backslash is escaped: '\m' is an invalid escape sequence in Python.
    lines.insert(- 2 - 2, '\\midrule')
    latex = "\n".join(lines)
    with open("outputs/{}a8_skip_nameerror_rules.tex".format(prefix), "w") as f:
        f.write(latex)

group_run(rows, DBMT, skip_nameerror_rules, plot=False);

In [24]:
def execorder_attempted_execution(notebooks, prefix):
    """Return the "same results" rule rows for the execution-order mode.

    Only notebooks whose execution-order run was attempted are considered.
    One row is produced per antecedent, in the order listed in ``antecedents``.
    """
    base = notebooks[notebooks["execorder_attempted_execution"].fillna(False)]

    def flag(column):
        # Boolean column with missing values counted as False.
        return base[column].fillna(False)

    def text_flag(column):
        # "1"/"0" text column converted to booleans through int.
        return base[column].astype(int).astype(bool).fillna(False)

    def antecedents():
        # Lazily yields (label, mask) so each mask is built right before use.
        yield "\\tabq{9 or less cells}{1st}", flag("note_total_cells_q1")
        yield "while", flag("ast_while")
        yield "class", flag("ast_classdef")
        yield "Skips in the middle", text_flag("skips_middle")
        yield "Imports", flag("ast_import") | flag("ast_importfrom")
        yield "Unordered", text_flag("unordered")
        yield "\\tabq{37 or more cells}{4th}", flag("note_total_cells_q4")

    return [
        mining_row(
            "tabisocounter", label, "\\tabsameec", base,
            mask,
            flag("execorder_execution_count_same_results"),
            prefix=prefix
        )
        for label, mask in antecedents()
    ]

# Run once per group supplied by dbmt/DBMT and keep the per-group row lists.
rows = group_run(dbmt(notebooks), DBMT, execorder_attempted_execution, plot=False);

d_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,tabq_9_or_less_cells__1st__tabsameec_support,tabq_9_or_less_cells__1st__tabsameec_confidence,tabq_9_or_less_cells__1st__tabsameec_lift
0,\tabisocounter,\tabq{9 or less cells}{1st},\tabsameec,3.84%,14.47%,2.22


d_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,while_tabsameec_support,while_tabsameec_confidence,while_tabsameec_lift
0,\tabisocounter,while,\tabsameec,0.91%,11.95%,1.83


d_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,class_tabsameec_support,class_tabsameec_confidence,class_tabsameec_lift
0,\tabisocounter,class,\tabsameec,0.88%,10.94%,1.67


d_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,skips_in_the_middle_tabsameec_support,skips_in_the_middle_tabsameec_confidence,skips_in_the_middle_tabsameec_lift
0,\tabisocounter,Skips in the middle,\tabsameec,2.77%,4.19%,0.64


d_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,imports_tabsameec_support,imports_tabsameec_confidence,imports_tabsameec_lift
0,\tabisocounter,Imports,\tabsameec,3.66%,4.11%,0.63


d_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,unordered_tabsameec_support,unordered_tabsameec_confidence,unordered_tabsameec_lift
0,\tabisocounter,Unordered,\tabsameec,0.80%,2.23%,0.34


d_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,tabq_37_or_more_cells__4th__tabsameec_support,tabq_37_or_more_cells__4th__tabsameec_confidence,tabq_37_or_more_cells__4th__tabsameec_lift
0,\tabisocounter,\tabq{37 or more cells}{4th},\tabsameec,0.44%,1.98%,0.3


sd_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,tabq_9_or_less_cells__1st__tabsameec_support,tabq_9_or_less_cells__1st__tabsameec_confidence,tabq_9_or_less_cells__1st__tabsameec_lift
0,\tabisocounter,\tabq{9 or less cells}{1st},\tabsameec,4.17%,10.00%,2.4


sd_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,while_tabsameec_support,while_tabsameec_confidence,while_tabsameec_lift
0,\tabisocounter,while,\tabsameec,0.00%,0.00%,0.0


sd_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,class_tabsameec_support,class_tabsameec_confidence,class_tabsameec_lift
0,\tabisocounter,class,\tabsameec,0.00%,0.00%,0.0


sd_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,skips_in_the_middle_tabsameec_support,skips_in_the_middle_tabsameec_confidence,skips_in_the_middle_tabsameec_lift
0,\tabisocounter,Skips in the middle,\tabsameec,4.17%,5.88%,1.41


sd_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,imports_tabsameec_support,imports_tabsameec_confidence,imports_tabsameec_lift
0,\tabisocounter,Imports,\tabsameec,4.17%,5.00%,1.2


sd_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,unordered_tabsameec_support,unordered_tabsameec_confidence,unordered_tabsameec_lift
0,\tabisocounter,Unordered,\tabsameec,0.00%,0.00%,0.0


sd_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,tabq_37_or_more_cells__4th__tabsameec_support,tabq_37_or_more_cells__4th__tabsameec_confidence,tabq_37_or_more_cells__4th__tabsameec_lift
0,\tabisocounter,\tabq{37 or more cells}{4th},\tabsameec,0.00%,0.00%,0.0


td_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,tabq_9_or_less_cells__1st__tabsameec_support,tabq_9_or_less_cells__1st__tabsameec_confidence,tabq_9_or_less_cells__1st__tabsameec_lift
0,\tabisocounter,\tabq{9 or less cells}{1st},\tabsameec,4.26%,19.32%,1.77


td_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,while_tabsameec_support,while_tabsameec_confidence,while_tabsameec_lift
0,\tabisocounter,while,\tabsameec,0.50%,7.71%,0.71


td_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,class_tabsameec_support,class_tabsameec_confidence,class_tabsameec_lift
0,\tabisocounter,class,\tabsameec,0.45%,5.12%,0.47


td_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,skips_in_the_middle_tabsameec_support,skips_in_the_middle_tabsameec_confidence,skips_in_the_middle_tabsameec_lift
0,\tabisocounter,Skips in the middle,\tabsameec,3.89%,8.39%,0.77


td_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,imports_tabsameec_support,imports_tabsameec_confidence,imports_tabsameec_lift
0,\tabisocounter,Imports,\tabsameec,8.02%,8.85%,0.81


td_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,unordered_tabsameec_support,unordered_tabsameec_confidence,unordered_tabsameec_lift
0,\tabisocounter,Unordered,\tabsameec,1.25%,5.87%,0.54


td_a8_rule_tabisocounter_


Unnamed: 0,Mode,Antecedent,Consequent,tabq_37_or_more_cells__4th__tabsameec_support,tabq_37_or_more_cells__4th__tabsameec_confidence,tabq_37_or_more_cells__4th__tabsameec_lift
0,\tabisocounter,\tabq{37 or more cells}{4th},\tabsameec,0.89%,3.72%,0.34


In [25]:
def topdown_attempted_execution(data, prefix):
    """Append the Isolated + Top-Down ("tabisotd") association rules to ``rows``.

    The rules are computed only over notebooks whose
    ``topdown_attempted_execution`` flag is set (missing values count as not
    attempted). Every rule shares the same consequent,
    ``topdown_execution_count_same_results``, and differs only in its
    antecedent.

    Parameters
    ----------
    data : tuple
        ``(notebooks, rows)`` pair: the notebooks DataFrame for one group and
        the list that collects the values returned by ``mining_row``.
    prefix : str
        Forwarded unchanged to ``mining_row`` as its ``prefix`` keyword.

    Returns
    -------
    None
        ``rows`` is extended in place with one entry per rule (seven in total).
    """
    notebooks, rows = data
    base = notebooks[notebooks["topdown_attempted_execution"].fillna(False)]
    # The consequent mask is identical for every rule, so build it once.
    same_results = base["topdown_execution_count_same_results"].fillna(False)

    def flag(column):
        # Plain boolean column; missing values count as False.
        return base[column].fillna(False)

    def nonzero(column):
        # Fill missing values BEFORE the integer cast: astype(int) raises on
        # NaN, so a fillna placed after the casts can never take effect.
        # NOTE(review): assumes these columns are numeric counts/flags - confirm.
        return base[column].fillna(0).astype(int).astype(bool)

    # (antecedent label, antecedent mask), in the order the rules are reported.
    antecedents = [
        ("\\tabq{9 or less cells}{1st}", flag("note_total_cells_q1")),
        ("while", flag("ast_while")),
        ("class", flag("ast_classdef")),
        ("Skips in the middle", nonzero("skips_middle")),
        ("Imports", flag("ast_import") | flag("ast_importfrom")),
        ("Unordered", nonzero("unordered")),
        ("\\tabq{37 or more cells}{4th}", flag("note_total_cells_q4")),
    ]
    for label, antecedent_mask in antecedents:
        rows.append(mining_row(
            "tabisotd", label, "\\tabsameec", base,
            antecedent_mask,
            same_results,
            prefix=prefix
        ))


# Pair each entry of dbmt(notebooks) with the matching accumulator in `rows`,
# so topdown_attempted_execution receives a (notebooks, rows) tuple as `data`.
# NOTE(review): presumably group_run invokes the function once per DBMT entry
# with the corresponding prefix - confirm in analysis_helpers.
# The trailing ";" keeps the call's return value out of the cell output.
group_run(zip(dbmt(notebooks), rows), DBMT, topdown_attempted_execution, plot=False);

d_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,tabq_9_or_less_cells__1st__tabsameec_support,tabq_9_or_less_cells__1st__tabsameec_confidence,tabq_9_or_less_cells__1st__tabsameec_lift
0,\tabisotd,\tabq{9 or less cells}{1st},\tabsameec,4.00%,15.09%,2.07


d_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,while_tabsameec_support,while_tabsameec_confidence,while_tabsameec_lift
0,\tabisotd,while,\tabsameec,1.00%,13.16%,1.81


d_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,class_tabsameec_support,class_tabsameec_confidence,class_tabsameec_lift
0,\tabisotd,class,\tabsameec,0.96%,12.02%,1.65


d_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,skips_in_the_middle_tabsameec_support,skips_in_the_middle_tabsameec_confidence,skips_in_the_middle_tabsameec_lift
0,\tabisotd,Skips in the middle,\tabsameec,3.48%,5.28%,0.73


d_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,imports_tabsameec_support,imports_tabsameec_confidence,imports_tabsameec_lift
0,\tabisotd,Imports,\tabsameec,4.29%,4.81%,0.66


d_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,unordered_tabsameec_support,unordered_tabsameec_confidence,unordered_tabsameec_lift
0,\tabisotd,Unordered,\tabsameec,1.55%,4.31%,0.59


d_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,tabq_37_or_more_cells__4th__tabsameec_support,tabq_37_or_more_cells__4th__tabsameec_confidence,tabq_37_or_more_cells__4th__tabsameec_lift
0,\tabisotd,\tabq{37 or more cells}{4th},\tabsameec,0.59%,2.62%,0.36


sd_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,tabq_9_or_less_cells__1st__tabsameec_support,tabq_9_or_less_cells__1st__tabsameec_confidence,tabq_9_or_less_cells__1st__tabsameec_lift
0,\tabisotd,\tabq{9 or less cells}{1st},\tabsameec,4.17%,10.00%,1.2


sd_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,while_tabsameec_support,while_tabsameec_confidence,while_tabsameec_lift
0,\tabisotd,while,\tabsameec,4.17%,50.00%,6.0


sd_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,class_tabsameec_support,class_tabsameec_confidence,class_tabsameec_lift
0,\tabisotd,class,\tabsameec,0.00%,0.00%,0.0


sd_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,skips_in_the_middle_tabsameec_support,skips_in_the_middle_tabsameec_confidence,skips_in_the_middle_tabsameec_lift
0,\tabisotd,Skips in the middle,\tabsameec,8.33%,11.76%,1.41


sd_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,imports_tabsameec_support,imports_tabsameec_confidence,imports_tabsameec_lift
0,\tabisotd,Imports,\tabsameec,4.17%,5.00%,0.6


sd_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,unordered_tabsameec_support,unordered_tabsameec_confidence,unordered_tabsameec_lift
0,\tabisotd,Unordered,\tabsameec,4.17%,14.29%,1.71


sd_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,tabq_37_or_more_cells__4th__tabsameec_support,tabq_37_or_more_cells__4th__tabsameec_confidence,tabq_37_or_more_cells__4th__tabsameec_lift
0,\tabisotd,\tabq{37 or more cells}{4th},\tabsameec,4.17%,12.50%,1.5


td_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,tabq_9_or_less_cells__1st__tabsameec_support,tabq_9_or_less_cells__1st__tabsameec_confidence,tabq_9_or_less_cells__1st__tabsameec_lift
0,\tabisotd,\tabq{9 or less cells}{1st},\tabsameec,4.30%,19.50%,1.73


td_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,while_tabsameec_support,while_tabsameec_confidence,while_tabsameec_lift
0,\tabisotd,while,\tabsameec,0.53%,8.24%,0.73


td_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,class_tabsameec_support,class_tabsameec_confidence,class_tabsameec_lift
0,\tabisotd,class,\tabsameec,0.46%,5.19%,0.46


td_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,skips_in_the_middle_tabsameec_support,skips_in_the_middle_tabsameec_confidence,skips_in_the_middle_tabsameec_lift
0,\tabisotd,Skips in the middle,\tabsameec,4.24%,9.16%,0.81


td_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,imports_tabsameec_support,imports_tabsameec_confidence,imports_tabsameec_lift
0,\tabisotd,Imports,\tabsameec,8.33%,9.19%,0.81


td_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,unordered_tabsameec_support,unordered_tabsameec_confidence,unordered_tabsameec_lift
0,\tabisotd,Unordered,\tabsameec,1.60%,7.52%,0.67


td_a8_rule_tabisotd_


Unnamed: 0,Mode,Antecedent,Consequent,tabq_37_or_more_cells__4th__tabsameec_support,tabq_37_or_more_cells__4th__tabsameec_confidence,tabq_37_or_more_cells__4th__tabsameec_lift
0,\tabisotd,\tabq{37 or more cells}{4th},\tabsameec,0.97%,4.05%,0.36


In [26]:
def same_results_rules(rows, prefix):
    same_results = pd.DataFrame(rows, columns=["Mode", "Antecedent", "Consequent", "Support", "Confidence", "Lift"])

    latex = same_results.to_latex(index=False, column_format="lllccc")
    latex = latex.replace("\\textbackslash ", "\\").replace("\\{", "{").replace("\\}", "}")
    lines = latex.splitlines()
    lines.insert(- 2 - 7, '\midrule')
    latex = "\n".join(lines)
    with open("outputs/{}a8_same_results_rules.tex".format(prefix), "w") as f:
        f.write(latex)

# Export one LaTeX table per entry of `rows` via same_results_rules.
# NOTE(review): presumably group_run supplies the matching DBMT prefix to each
# call - confirm in analysis_helpers.
# The trailing ";" keeps the call's return value out of the cell output.
group_run(rows, DBMT, same_results_rules, plot=False);

The extracted rules refer to the Isolated + Exec. Counter and Isolated + Top-Down modes