# Creating and Viewing Diffs #1

In [1]:
import pandas as pd

from pathlib import Path

In [2]:
# Directory (relative to this notebook) that holds the inputs for this example.
PATH_DATA = Path('../data/test2')
# The two CSV files to compare, in order: v1 first, then v2.
PATHS = [PATH_DATA.joinpath(f'{version}.csv') for version in ('v1', 'v2')]

In [3]:
from dacman_csv import TableDiffer, TableValuesDiff

In [4]:
# Construct the differ from the Python config file kept in the data directory.
differ = TableDiffer.from_config_py(PATH_DATA.joinpath('config-2.py'))

In [5]:
# Ask the differ for a builder over the two CSV paths, then show the first
# entry of `builder.data` as this cell's output.
# NOTE(review): presumably `data[0]` is the table read from PATHS[0] (v1) — confirm.
builder = differ.get_builder(*PATHS)
builder.data[0]

Unnamed: 0_level_0,site_id,measurement_date,temperature,pressure,humidity,comment
measurement_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1258,Alpha,2018-04-03,103.7,1056.6,0.14,Donec posuere metus vitae ipsum.
1482,Alpha,2018-11-16,67.1,994.6,0.314,Aliquam sit amet diam in magna bibendum imperd...
1735,Echo,2018-12-05,84.8,1105.2,0.32,
2066,Alpha,2019-01-29,105.6,958.1,0.29,Nulla nisl.
2502,Echo,2018-06-06,60.3,1122.0,0.14,Cras in purus eu magna vulputate luctus.
2926,Charlie,2019-01-04,62.1,1080.4,0.808,
2948,Charlie,2018-09-11,106.2,1092.8,0.629,
3639,Alpha,2019-03-15,85.8,1026.8,0.351,
4080,Charlie,2018-09-30,90.6,889.1,0.757,"Donec odio justo, sollicitudin ut, suscipit a,..."
4311,Alpha,2019-03-04,73.7,1058.0,0.132,


In [6]:
# Call build_columns with the comparators held by the differ. The recorded
# output shows one "processing '<column>'" line per column.
# NOTE(review): presumably this computes the per-column comparisons used by
# get_diff_stack() — confirm against dacman_csv.
builder.build_columns(differ.comparators)

processing 'site_id'
processing 'measurement_date'
processing 'temperature'
processing 'pressure'
processing 'humidity'
processing 'comment'


In [7]:
# Fetch the diff stack from the builder and echo it; the recorded repr is a
# dacman_csv.TableDiffStack object.
diff_stack = builder.get_diff_stack()
diff_stack

<dacman_csv.TableDiffStack at 0x7fafc02a88d0>

In [8]:
# List the (row index, column) pairs whose status equals 'D'.
# NOTE(review): 'D' appears to mark deleted values — the result matches the
# 'deleted' list produced by to_record(); confirm against dacman_csv.
(
    diff_stack['status']
    .stack()
    .sort_values()
    .loc[lambda statuses: statuses == 'D']
    .index
    .to_list()
)

[(2948, 'site_id'),
 (2948, 'temperature'),
 (2948, 'pressure'),
 (1258, 'pressure'),
 (2948, 'humidity'),
 (2948, 'measurement_date')]

In [9]:
# Wrap the diff stack in a TableValuesDiff and show its record form. The
# recorded output is a dict with 'added', 'deleted' and 'unchanged' lists of
# (row index, column) pairs (the output shown is truncated).
values_diff = TableValuesDiff(diff_stack)
values_diff.to_record()

{'added': [(2949, 'site_id'),
  (2949, 'measurement_date'),
  (2949, 'temperature'),
  (2949, 'pressure'),
  (2949, 'humidity')],
 'deleted': [(2948, 'site_id'),
  (2948, 'temperature'),
  (2948, 'pressure'),
  (1258, 'pressure'),
  (2948, 'humidity'),
  (2948, 'measurement_date')],
 'unchanged': [(6263, 'pressure'),
  (4419, 'pressure'),
  (4419, 'humidity'),
  (6800, 'pressure'),
  (4467, 'site_id'),
  (4467, 'measurement_date'),
  (4467, 'temperature'),
  (4467, 'pressure'),
  (4467, 'humidity'),
  (6263, 'site_id'),
  (6800, 'temperature'),
  (6800, 'measurement_date'),
  (6800, 'site_id'),
  (6263, 'measurement_date'),
  (6263, 'temperature'),
  (6263, 'humidity'),
  (7436, 'temperature'),
  (7436, 'site_id'),
  (9295, 'pressure'),
  (9295, 'temperature'),
  (9295, 'measurement_date'),
  (9038, 'pressure'),
  (9038, 'temperature'),
  (9038, 'measurement_date'),
  (9038, 'site_id'),
  (8982, 'humidity'),
  (8982, 'pressure'),
  (8982, 'temperature'),
  (8982, 'measurement_date'),
 

In [10]:
# Call the differ directly on the two paths to get the diff in a single step,
# then show its record form. The recorded output prints the same
# "processing '<column>'" lines as build_columns and contains 'schema',
# 'index' and 'values' sections.
diff = differ(*PATHS)
diff.to_record()

processing 'site_id'
processing 'measurement_date'
processing 'temperature'
processing 'pressure'
processing 'humidity'
processing 'comment'


{'schema': {'added': [],
  'deleted': [],
  'unchanged': ['site_id',
   'measurement_date',
   'temperature',
   'pressure',
   'humidity',
   'comment'],
  'changed': [],
  '_context': {}},
 'index': {'added': [2949],
  'deleted': [2948],
  'unchanged': [1258,
   1482,
   1735,
   2066,
   2502,
   2926,
   3639,
   4080,
   4311,
   4419,
   4467,
   6263,
   6800,
   7436,
   8052,
   8674,
   8982,
   9038,
   9295],
  'changed': [],
  '_context': {}},
 'values': {'added': [(2949, 'site_id'),
   (2949, 'measurement_date'),
   (2949, 'temperature'),
   (2949, 'pressure'),
   (2949, 'humidity')],
  'deleted': [(2948, 'site_id'),
   (2948, 'temperature'),
   (2948, 'pressure'),
   (1258, 'pressure'),
   (2948, 'humidity'),
   (2948, 'measurement_date')],
  'unchanged': [(6263, 'pressure'),
   (4419, 'pressure'),
   (4419, 'humidity'),
   (6800, 'pressure'),
   (4467, 'site_id'),
   (4467, 'measurement_date'),
   (4467, 'temperature'),
   (4467, 'pressure'),
   (4467, 'humidity'),
   (

In [11]:
from dacman_csv import PrettyPrint, Interactive

In [12]:
# Wrap the diff in a PrettyPrint object and show which backend it reports
# ('json' in the recorded output).
pp = PrettyPrint(diff)
pp.backend

'json'

In [13]:
# Echo the PrettyPrint object; the recorded output is the diff rendered as
# indented JSON (truncated here).
pp

{
    "schema": {
        "added": [], 
        "deleted": [], 
        "unchanged": [
            "site_id", 
            "measurement_date", 
            "temperature", 
            "pressure", 
            "humidity", 
            "comment"
        ], 
        "changed": [], 
        "_context": {}
    }, 
    "index": {
        "added": [
            2949
        ], 
        "deleted": [
            2948
        ], 
        "unchanged": [
            1258, 
            1482, 
            1735, 
            2066, 
            2502, 
            2926, 
            3639, 
            4080, 
            4311, 
            4419, 
            4467, 
            6263, 
            6800, 
            7436, 
            8052, 
            8674, 
            8982, 
            9038, 
            9295
        ], 
        "changed": [], 
        "_context": {}
    }, 
    "values": {
        "added": [
            [
                2949, 
                "site_id"
            ], 
            [

In [14]:
# Create an Interactive view of the diff and display it; the recorded output
# is an ipywidgets `interactive` container with a "Sort by:" dropdown.
# NOTE(review): `i` is a terse name for a widget handle — consider a more
# descriptive name if no other cell depends on it.
i = Interactive(diff)
i.display()

interactive(children=(Dropdown(description='Sort by: ', options=('site_id', 'measurement_date', 'temperature',…