In [1]:
import pandas
import IPython.display

from raha import raha

## Error Detection with Raha

In [2]:
# Create the Raha error-detection application and set its options.
app_1 = raha.Detection()

# How many tuples would you label?
app_1.LABELING_BUDGET = 20

# Would you like to see the logs?
app_1.VERBOSE = True
# Presumably makes Raha persist its results to disk - TODO confirm against raha.Detection.
app_1.SAVE_RESULTS = True

In [4]:
# Describe the dataset for Raha: a short name, the dirty CSV to analyse and the
# CSV given as "clean_path" (presumably the ground truth - confirm against raha).
dataset_dictionary = dict(
    name="iot",
    path="../../data/iot-scenario/error-injector/iot-scenario-dirty.csv",
    clean_path="../../data/iot-scenario/error-injector/iot-scenario-data.csv",
)
d = app_1.initialize_dataset(dataset_dictionary)
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like
# a serialized row index from the CSV export; Raha appears to treat it as a
# regular data column (column 0) - confirm this is intended.
d.dataframe.head()

Unnamed: 0,System,Device,SensingDevice,Sensor,name,value,timestamp,location
0,MA TestSystem,device_out,aqara_multisensor_2,aqara_temp_2,t4,7.080000000000001,2021-01-01 02:00:00+01:00,outside
1,MA TestSystem,device_in_1,esp8266_1,ds18b20_1,t1,22.69,2021-01-01 02:00:00+01:00,room1
2,MA TestSystem,vfbkee_im_2,aqara_multisensor_1,aqara_temp_1,t3,22.32333333333333,2021-01-01 02:00:00+01:00,room2
3,MA TestSystem,device_in_1,esp8266_2,ds18b20_2,t2,23.69,2021-01-01 02:00:00+01:00,room1
4,MA TestSystem,device_main,raspberry,,,22.32333333333333,2021-01-01 02:00:05+01:00,room2


In [6]:
# Run the error-detection strategies on the dataset. In the recorded run the
# results were loaded from a previous execution instead of being recomputed.
app_1.run_strategies(d)

I just load strategies' results as they have already been run on the dataset!


107 strategy profiles are collected.


In [7]:
# Generate the features for every column of the dataset; the recorded log
# reports the number of features per column.
app_1.generate_features(d)

12 Features are generated for column 0.
18 Features are generated for column 1.
19 Features are generated for column 2.
27 Features are generated for column 3.
8 Features are generated for column 4.
12 Features are generated for column 5.
8 Features are generated for column 6.
10 Features are generated for column 7.


In [8]:
# Build one hierarchical clustering model per column (as reported in the log).
app_1.build_clusters(d)

A hierarchical clustering model is built for column 0.
A hierarchical clustering model is built for column 1.
A hierarchical clustering model is built for column 2.
A hierarchical clustering model is built for column 3.
A hierarchical clustering model is built for column 4.
A hierarchical clustering model is built for column 5.
A hierarchical clustering model is built for column 6.
A hierarchical clustering model is built for column 7.


In [10]:
# Sample and label tuples until the labeling budget is used up. When the dataset
# has a ground truth, label_with_ground_truth() labels the sampled tuple;
# otherwise the user is prompted for the correct value of each of its cells.
while len(d.labeled_tuples) < app_1.LABELING_BUDGET:
    app_1.sample_tuple(d)
    if d.has_ground_truth:
        app_1.label_with_ground_truth(d)
    else:
        print("Label the dirty cells in the following sampled tuple.")
        # Show the sampled row as a one-row table so the user sees it in context.
        sampled_tuple = pandas.DataFrame(data=[d.dataframe.iloc[d.sampled_tuple, :]], columns=d.dataframe.columns)
        IPython.display.display(sampled_tuple)
        for j in range(d.dataframe.shape[1]):
            cell = (d.sampled_tuple, j)
            value = d.dataframe.iloc[cell]
            correction = input("What is the correction for value '{}'? Type in the same value if it is not erronous.\n".format(value))
            # input() always returns a str, so compare against the textual form
            # of the cell. Comparing a non-str cell value directly would never
            # be equal and would flag every such cell as dirty.
            user_label = 1 if str(value) != correction else 0
            # Stored as [label, correction]; label 1 means the user changed the value.
            d.labeled_cells[cell] = [user_label, correction]
        # Register the tuple as labeled so it counts towards the budget above.
        d.labeled_tuples[d.sampled_tuple] = 1

Tuple 502 is sampled.
Tuple 502 is labeled.
Tuple 50 is sampled.
Tuple 50 is labeled.
Tuple 921 is sampled.
Tuple 921 is labeled.
Tuple 128 is sampled.
Tuple 128 is labeled.
Tuple 696 is sampled.
Tuple 696 is labeled.
Tuple 786 is sampled.
Tuple 786 is labeled.
Tuple 279 is sampled.
Tuple 279 is labeled.
Tuple 516 is sampled.
Tuple 516 is labeled.
Tuple 845 is sampled.
Tuple 845 is labeled.
Tuple 671 is sampled.
Tuple 671 is labeled.
Tuple 916 is sampled.
Tuple 916 is labeled.
Tuple 160 is sampled.
Tuple 160 is labeled.
Tuple 39 is sampled.
Tuple 39 is labeled.
Tuple 404 is sampled.
Tuple 404 is labeled.
Tuple 861 is sampled.
Tuple 861 is labeled.
Tuple 536 is sampled.
Tuple 536 is labeled.
Tuple 569 is sampled.
Tuple 569 is labeled.
Tuple 622 is sampled.
Tuple 622 is labeled.
Tuple 375 is sampled.
Tuple 375 is labeled.
Tuple 288 is sampled.
Tuple 288 is labeled.


In [11]:
# Propagate the user labels to further cells; in the recorded run this raised
# the number of labeled cells from 160 to 4839.
app_1.propagate_labels(d)

The number of labeled data cells increased from 160 to 4839.


In [12]:
# Train a classifier per column and apply it to predict the cell labels
# (as reported in the log).
app_1.predict_labels(d)

A classifier is trained and applied on column 0.
A classifier is trained and applied on column 1.
A classifier is trained and applied on column 2.
A classifier is trained and applied on column 3.
A classifier is trained and applied on column 4.
A classifier is trained and applied on column 5.
A classifier is trained and applied on column 6.
A classifier is trained and applied on column 7.


In [13]:
# Write the detection results to disk; the log prints the target location.
app_1.store_results(d)

The results are stored in ../../data/iot-scenario/error-injector\raha-baran-results-iot\error-detection\detection.dataset.


In [14]:
# Evaluate the detected cells; the first three entries of the result are
# reported below as precision, recall and F1.
# NOTE(review): the recorded run shows precision 1.00 but recall 0.09, i.e. most
# erroneous cells are missed - worth investigating before relying on the result.
evaluation = d.get_data_cleaning_evaluation(d.detected_cells)
p, r, f = evaluation[:3]
report_template = "Raha's performance on {}:\nPrecision = {:.2f}\nRecall = {:.2f}\nF1 = {:.2f}"
print(report_template.format(d.name, p, r, f))

Raha's performance on iot:
Precision = 1.00
Recall = 0.09
F1 = 0.16
