In [1]:
import sqlite3
from tqdm import tqdm
from datetime import datetime
import yaml
import os

import util

In [2]:
# Primary mappings: every *.yaml file directly under ../mappings, passed through
# util.update_names() and merged into a single dict.
# Entries are visited in name order so that, when two files define the same key,
# the winner is deterministic instead of depending on filesystem enumeration order.
mappings = dict()
with os.scandir('../mappings') as entries:  # context manager closes the directory handle
    for file in sorted(entries, key=lambda entry: entry.name):
        if not file.name.endswith('.yaml'):
            continue
        with open(file, 'r') as fin:
            obj = yaml.safe_load(fin)
        mappings.update(util.update_names(obj))

# Extra mappings: every regular file under ../mappings/other, merged as-is.
# Subdirectories are skipped because open() would fail on them.
mappings_extra = dict()
with os.scandir('../mappings/other') as entries:
    for file in sorted(entries, key=lambda entry: entry.name):
        if not file.is_file():
            continue
        with open(file, 'r') as fin:
            obj = yaml.safe_load(fin)
        mappings_extra.update(obj)

# Hand both dictionaries to the util module.
util.set_mappings(mappings, mappings_extra)

In [4]:
conn = sqlite3.connect('diff_v2.sqlite3', isolation_level='EXCLUSIVE')

# Only used to size the progress bar.
# NOTE(review): the recorded progress output of this cell stops at 9,855,243 rows
# (45%), so this figure appears to overestimate the number of rows the WHERE clause
# below actually matches -- confirm and update.
TOTAL = 22004103
TIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f"

# Maps (Device, ObjectType, ObjectId, Property) -> list of (timestamp, value),
# in the order the rows are returned by the query.
full_data = dict()
try:
    cur = conn.cursor()
    cur.execute("SELECT Timestamp, Value, Device, ObjectType, ObjectId, Property FROM FullValues WHERE Property NOT IN ('presentValue', 'priorityArray')")
    with tqdm(total=TOTAL) as pbar:
        # Iterating the cursor streams rows one at a time, like repeated fetchone().
        for elem in cur:
            pbar.update(1)
            timestamp, value, key = elem[0], elem[1], elem[2:]
            timestamp = datetime.strptime(timestamp, TIME_FORMAT)
            full_data.setdefault(key, []).append((timestamp, value))
finally:
    # Release the database handle even if parsing fails or the kernel is interrupted.
    conn.close()

 45%|██████████████████████████████▍                                     | 9855243/22004103 [02:51<03:31, 57513.53it/s]


In [5]:
# Put every series into chronological order, in place.
# Sort on the timestamp only: comparing whole (timestamp, value) tuples falls through
# to comparing the values when two timestamps are equal, which raises TypeError if one
# value is None and the other is a string. The sort is stable, so samples with equal
# timestamps keep the order in which they were loaded.
for key in full_data:
    full_data[key].sort(key=lambda sample: sample[0])

In [17]:
def _parse_array_or_fail(raw, size):
    """Parse `raw` with util.parse_array; on failure return `size` 'Fail' placeholders."""
    try:
        return util.parse_array(raw, size)
    except Exception:
        return ['Fail'] * size


def count_quick_reverts(key, data):
    """Count the samples of one property series that belong to quick-change bursts.

    The series is first screened: consecutive samples whose values differ and that are
    at most QUICK_REV_TIME seconds apart are counted as quick changes (for properties
    mapped as arrays, every differing array slot counts separately). Series with fewer
    than QUICK_REV_THRES quick changes yield 0.

    For series that pass the screen, a second pass counts every sample (except the
    last one) that is either followed by the next sample within QUICK_REV_TIME
    seconds, or is followed more slowly but carries the same value as the most
    recently remembered slowly-followed sample.

    Args:
        key: 4-tuple; its second element (object type) and fourth element (property
            name) are used to look the property up in the module-level `mappings`.
        data: list of (timestamp, value) tuples in chronological order. Timestamps
            must support subtraction yielding an object with `total_seconds()`.

    Returns:
        int: 0 if the property is not listed in `mappings[object type]` or the series
        fails the screen, otherwise the count described above.

    Raises:
        KeyError: if the object type is not present in `mappings`.
    """
    QUICK_REV_THRES = 11   # minimum number of quick changes for a series to qualify
    QUICK_REV_TIME = 7200  # maximum gap, in seconds, for two samples to count as "quick"

    _, obj, _, prop = key
    if prop not in mappings[obj]:
        return 0

    # Only array-typed properties need special handling; list, object and atomic
    # properties all count one quick change per differing pair.
    full_type = mappings[obj][prop]['Type']
    array_size = None
    if isinstance(full_type, dict) and full_type['Name'] == 'array':
        array_size = full_type['Length']

    # Pass 1: screen the series by its number of quick changes.
    quick_count = 0
    for (time_1, val_1), (time_2, val_2) in zip(data, data[1:]):
        if val_1 != val_2 and (time_2 - time_1).total_seconds() <= QUICK_REV_TIME:
            if array_size is None:
                quick_count += 1
                continue
            parsed_1 = _parse_array_or_fail(val_1, array_size)
            parsed_2 = _parse_array_or_fail(val_2, array_size)
            try:
                quick_count += sum(
                    parsed_1[ind] != parsed_2[ind] for ind in range(array_size)
                )
            except Exception:
                # Dump the offending inputs before propagating the error.
                print(key)
                print(parsed_1)
                print(parsed_2)
                print(array_size)
                raise
    if quick_count < QUICK_REV_THRES:
        return 0

    # Pass 2: count the samples attributed to quick-change bursts.
    count = 0
    last_val = None
    for (time_1, val_1), (time_2, _) in zip(data, data[1:]):
        if (time_2 - time_1).total_seconds() <= QUICK_REV_TIME:
            count += 1
        elif last_val == val_1:
            count += 1
        else:
            last_val = val_1
    return count

In [18]:
def count_meaningless(key, data):
    """Count samples that repeat the previous (normalised) value of the series.

    Only two properties are inspected:

    * 'reliability': the values 'noFaultDetected' and 'communicationFailure' are both
      treated as 'normal' before comparison.
    * 'statusFlags': each value is parsed with util.parse_array(value, 4) and only the
      elements from index 2 onward are compared.

    Every sample, including the last one, is examined. A sample is counted when its
    normalised value equals that of the sample before it.

    Args:
        key: 4-tuple whose fourth element is the property name.
        data: list of (timestamp, value) tuples; only the value is used.

    Returns:
        int: number of repeated samples, or 0 for any other property.
    """
    _, _, _, prop = key
    # 'presentValue' / 'priorityArray' are not handled here; the load query in this
    # notebook already excludes them.
    if prop == 'reliability':
        values = (
            'normal' if sample[1] in ('noFaultDetected', 'communicationFailure') else sample[1]
            for sample in data
        )
    elif prop == 'statusFlags':
        values = (util.parse_array(sample[1], 4)[2:] for sample in data)
    else:
        return 0

    prev = None
    repeats = 0
    for value in values:
        if value != prev:
            prev = value
        else:
            repeats += 1
    return repeats
                

In [19]:
def number_check(val):
    """Return True if `val` can be converted with float(), False otherwise.

    Only the exceptions float() raises for unsuitable input are treated as
    "not a number"; KeyboardInterrupt and other unrelated errors propagate.
    """
    try:
        float(val)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: unparsable text;
        # OverflowError: integer too large for a float.
        return False
    return True

def count_illegal(key, data):
    """Count samples whose value does not fit the type declared in `mappings`.

    Args:
        key: 4-tuple; its second element (object type) and fourth element (property
            name) select the entry in the module-level `mappings`.
        data: list of (timestamp, value) tuples; only the value is inspected.

    Returns:
        int: 0 when the property is not mapped, when its type is 'invalid', 'other'
        or 'object ref', or when the type is not recognised (a message is printed in
        that case). For 'bool', the whole series length if more than two distinct
        values occur, else 0. For 'list', 'array', 'object' and 'number', the number
        of non-None values rejected by the type's predicate.
    """
    _, obj_type, _, prop_name = key
    obj_props = mappings[obj_type]
    if prop_name not in obj_props:
        return 0

    kind = obj_props[prop_name]['Type']
    if type(kind) == dict:
        # Structured types carry their kind under 'Name'.
        kind = kind['Name']

    # Types decided without looking at individual values.
    if kind == 'bool':
        distinct_values = {sample[1] for sample in data}
        return 0 if len(distinct_values) <= 2 else len(data)
    if kind in ['invalid', 'other', 'object ref']:
        return 0

    # Types decided per value.
    if kind in ['list', 'array']:
        # NOTE(review): this only accepts real Python list objects; if the stored
        # values are serialised text, every non-None value is counted -- confirm intent.
        def is_legal(value):
            return type(value) == list
    elif kind == 'object':
        def is_legal(value):
            return len(value.split('\n')) > 1
    elif kind == 'number':
        is_legal = number_check
    else:
        print('Unknown type:', kind)
        return 0

    illegal = 0
    for sample in data:
        value = sample[1]
        if value is not None and not is_legal(value):
            illegal += 1
    return illegal

In [20]:
# Sum count_quick_reverts() over every series.
# NOTE: the loop variable is deliberately named `feat_data` and lives at module level;
# code elsewhere in this notebook may read that name as a global.
removed_cnt = 0
for key, feat_data in tqdm(full_data.items()):
    removed_cnt = removed_cnt + count_quick_reverts(key, feat_data)

100%|████████████████████████████████████████████████████████████████████| 5042270/5042270 [00:05<00:00, 973586.17it/s]


In [21]:
# Total of count_quick_reverts() over all series, accumulated in the previous cell.
print(removed_cnt)

7587
