In [21]:
import logging
logging.basicConfig()
logger = logging.getLogger("stlarm")
logger.setLevel(logging.DEBUG)

from stlarm import analyzer
from stlarm.metarule_viz import print_metarule

In [11]:
import pandas as pd
pd.set_option("display.max_colwidth", None)

In [12]:
RULES_FILE = "../mining/mined_rules.tsv"
TRAIN_KB_FILE = "../resources/converted_nyc_foursquare_period_2_2_None_filtered_triples.nt"

In [13]:
ra = analyzer.RulesAnalyzer(RULES_FILE, 
                            TRAIN_KB_FILE, 
                            remove_reflexive_duplicates=True, 
                            metarules=True)

INFO:stlarm.analyzer:Loaded 459256 rules.
INFO:stlarm.analyzer:Improving rules...
INFO:stlarm.analyzer:Removing rules with unsorted reflexive relations...
INFO:stlarm.analyzer:Removed 148294 rules with unsorted reflexive relations. 310962 rules will be used.
DEBUG:stlarm.analyzer:Calling java -cp /home/jovyan/work/amie/amie3.jar amie.rules.eval.MetaruleBuilder /tmp/tmp8718j03x
INFO:stlarm.analyzer:[Done] amie.rules.eval.MetaruleBuilder
DEBUG:stlarm.analyzer:Building meta-rules lattice using amie.MetaruleBuilder took 58.41s.
INFO:stlarm.analyzer:Added 'Metarule' column. There are 1329 metarules.


In [14]:
def get_rules_without_relation(df, relation):
    return df[~df['Rule'].str.contains(relation, regex=False)]

ra_no_before_df = get_rules_without_relation(ra._raw_df, 'before')
ra_no_before = analyzer.RulesAnalyzer.from_analyzer(ra, ra_no_before_df, metarules=True)

print(ra.df['Metarule'].nunique(),
      ra_no_before.df['Metarule'].nunique())

print(ra.df['Rule'].nunique(),
      ra_no_before.df['Rule'].nunique())

DEBUG:stlarm.analyzer:Using AMIE_JAR_PATH /home/jovyan/work/amie/amie3.jar
INFO:stlarm.analyzer:Loaded 211908 rules.
DEBUG:stlarm.analyzer:Calling java -cp /home/jovyan/work/amie/amie3.jar amie.rules.eval.MetaruleBuilder /tmp/tmpw9wqnzvq
INFO:stlarm.analyzer:[Done] amie.rules.eval.MetaruleBuilder
DEBUG:stlarm.analyzer:Building meta-rules lattice using amie.MetaruleBuilder took 17.98s.
INFO:stlarm.analyzer:Added 'Metarule' column. There are 370 metarules.


1329 370
310962 211908


In [470]:
import ipywidgets as widgets
import sys
from IPython.display import display
from IPython.display import clear_output

# https://github.com/jupyter-widgets/ipywidgets/issues/2487#issuecomment-510721436
def create_multipleChoice_widget(df, idx, metarule, options_interesting, options_non_interesting):
    alternatives_int = widgets.SelectMultiple(
        options=options_interesting,
        disabled = False,
        description="Interesting labels",
        rows=len(options_interesting)
    )
    alternatives_non_int = widgets.SelectMultiple(
        options=options_non_interesting,
        disabled = False,
        description="Non interesting labels",
        rows=len(options_non_interesting)
    )
    rule_description = widgets.Textarea(
        value='',
        placeholder='Example: shows that some users are more likely to checkin withinTimeWindow at places withinRadius',
        description='Rule description:',
        disabled=False,
        layout={'width': '100%'}
    )
    
    description_out = widgets.Output()
    with description_out:
        print('rule index:', idx)
        print_metarule(df, metarule)
        if metarule in annotation_results:
            print("curr annotations:", annotation_results[metarule])
            alternatives_int.value = [x for x in options_interesting if x in annotation_results[metarule][0]]
            alternatives_non_int.value = [x for x in options_non_interesting if x in annotation_results[metarule][0]]
            rule_description.value = annotation_results[metarule][1]
        
    feedback_out = widgets.Output()

    def save_selections(b):
        with feedback_out:
            clear_output()
            marked = alternatives_int.value + alternatives_non_int.value
            rule_desc = rule_description.value
            print('saving: ')
            print(metarule, 'is', marked, sep='\n')
            print('and means', rule_desc, sep='\n')
            annotation_results[metarule] = (marked, rule_desc)
        return
    
    check = widgets.Button(description="save")
    check.on_click(save_selections)
    
    return widgets.VBox([description_out, widgets.HBox([alternatives_int, alternatives_non_int]), rule_description, check, feedback_out])

DEBUG:stlarm.analyzer:Using AMIE_JAR_PATH /home/jovyan/work/amie/amie3.jar


In [963]:
options_int = [
    'potentially interesting',
    'interesting', 
]
options_non_int = [
    'too general', # hasCheckin(?b, ?f), hasPOI(?l, CONST), withinTimeWindow(?f, ?l) => hasTrajectory(CONST2, ?b)
    'different variables, but same binding', # hasCheckin(?a, ?n), hasCheckin(?g, ?n)
    'describes the ontology itself', # hasCheckin(?a, ?h), withinRadius(?b, ?h) => hasCheckin(?a, ?b)
    'indirect rule', # hasCheckin(?a, ?b), withinRadius(?b, ?c), hasTime(?c, Morning) => hasTrajectoryCategory(?a, Weekday)
    'spurious correlations',
    'other - uninteresting',
    'other - should not be mined',
]

DEBUG:stlarm.analyzer:Using AMIE_JAR_PATH /home/jovyan/work/amie/amie3.jar


In [964]:
def display_rule(df, idx, metarules):
    wid = create_multipleChoice_widget(df, idx, metarules[idx], options_int, options_non_int) 
    display(wid)

In [1437]:
_curr_rule_idx = 0
std_sorted_metarules = ra_no_before.df.groupby('Metarule')['Std Confidence'].max().sort_values(ascending=False)
#annotation_results = {}
len(std_sorted_metarules), len(annotation_results)

(370, 370)

-------

In [1567]:
display_rule(ra_no_before.df, _curr_rule_idx, std_sorted_metarules.index)
_curr_rule_idx += 1
len(annotation_results), _curr_rule_idx-1

IndexError: index 370 is out of bounds for axis 0 with size 370

In [1312]:
#print_metarule(ra_no_before.df, std_sorted_metarules.index[_curr_rule_idx-2])

-------------------------------------

In [1572]:
annotations_df = pd.DataFrame.from_dict(annotation_results, 'index', columns=['label', 'desc'])
annotations_df = annotations_df.drop(columns='label').join(annotations_df['label'].str.join('|').str.get_dummies())
annotations_df = annotations_df.reset_index().rename(columns={'index': 'Metarule'})
annotations_df.head()

Unnamed: 0,Metarule,desc,describes the ontology itself,"different variables, but same binding",indirect rule,interesting,other - uninteresting,potentially interesting,spurious correlations,too general
0,?a <hasCheckin> ?n ?g <hasCheckin> ?n ?g <hasMonth> ?b => ?a <hasMonth> ?b,,0,1,0,0,0,0,0,0
1,?e <hasCheckin> ?a ?e <hasCheckin> ?l ?l <hasPOI> ?z0 => ?a <hasTime> ?z1,,0,0,0,0,0,0,0,1
2,?g <hasPOI> ?z0 ?g <hasTime> ?b ?a <withinTimeWindow> ?g => ?a <hasTime> ?b,,0,0,0,0,0,0,0,1
3,?l <hasPOI> ?z0 ?a <withinTimeWindow> ?f ?f <withinTimeWindow> ?l => ?a <hasTime> ?z1,,0,0,0,0,0,0,0,1
4,?a <hasCheckin> ?f ?f <hasPOI> ?z0 ?f <hasTime> ?z1 => ?a <hasTrajectoryCategory> ?z2,,0,0,0,1,0,0,0,0


In [None]:
annotations_df.to_csv('rules_manual_eval.csv')

In [1575]:
import numpy as np
def rule_is(df, columns):
    # https://stackoverflow.com/a/57822942
    if not isinstance(columns, list):
        columns = [columns]
    return df[np.logical_or.reduce([
            df[column] == 1
            for column in columns
        ])
    ]

def df_diff(df1, others):
    idxs = df1.index
    for other in others:
        idxs = idxs.difference(other.index)
    return df1.loc[idxs]

should_not_happen = rule_is(annotations_df, ['different variables, but same binding', 
                                'indirect rule',
                               'other - should not be mined'
                               ])
not_interesting = rule_is(df_diff(annotations_df, [should_not_happen]), ['spurious correlations', 'describes the ontology itself', 'other - uninteresting'])
too_general = rule_is(df_diff(annotations_df, [not_interesting, should_not_happen]), ['too general'])

more_or_less = rule_is(df_diff(annotations_df, [not_interesting, should_not_happen, too_general]), 'potentially interesting')
interesting = rule_is(df_diff(annotations_df, [not_interesting, should_not_happen, too_general, more_or_less]), 'interesting')


categories = [interesting, more_or_less, too_general, not_interesting, should_not_happen]
[len(x) for x in categories], sum([len(x) for x in categories]), len(annotations_df)

([34, 117, 131, 71, 17], 370, 370)

In [1577]:
label_names = ['interesting', 'more or less', 'too general', 'uninteresting', 'should not happen']
labels = pd.DataFrame(False, index=x['Metarule'], columns=['classification'])

for lbl, cat in zip(label_names, categories):
    labels.loc[cat['Metarule']] = lbl
labels.head()

Unnamed: 0_level_0,classification
Metarule,Unnamed: 1_level_1
?a <hasCheckin> ?n ?g <hasCheckin> ?n ?g <hasMonth> ?b => ?a <hasMonth> ?b,should not happen
?e <hasCheckin> ?a ?e <hasCheckin> ?l ?l <hasPOI> ?z0 => ?a <hasTime> ?z1,too general
?g <hasPOI> ?z0 ?g <hasTime> ?b ?a <withinTimeWindow> ?g => ?a <hasTime> ?b,too general
?l <hasPOI> ?z0 ?a <withinTimeWindow> ?f ?f <withinTimeWindow> ?l => ?a <hasTime> ?z1,too general
?a <hasCheckin> ?f ?f <hasPOI> ?z0 ?f <hasTime> ?z1 => ?a <hasTrajectoryCategory> ?z2,interesting


In [1580]:
labels['classification'].value_counts()

too general          131
more or less         117
uninteresting         71
interesting           34
should not happen     17
Name: classification, dtype: int64

In [1578]:
labels.to_csv('labeled_metarules.csv')