__[Open and try this file online (Colab)](https://colab.research.google.com/github/djekra/pandasklar/blob/master/jupyter/16_Compare_Series_and_DataFrames.ipynb)__

# Compare Series and DataFrames
* `compare_series`: Compares the content of two Series. Returns several indicators of equality.
* `compare_dataframes`: Compares the content of two DataFrames column by column. Returns several indicators of equality.
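* `check_equal`: Compares the content of two DataFrames column by column and returns a single bool: `True` if both have the same shape, the same column names and `compare_dataframes(format='bool')` is `True`.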
* `compare_col_dtype`: Returns the column names of two DataFrames whose dtype differs.
* `get_different_rows`: Returns the rows of two DataFrames that differ.

In [1]:
# blab init
try:
    import blab
except ImportError as e:
    !pip install blab
    import blab    
startup_notebook = blab.blab_startup()
%run $startup_notebook 

blab init
environment['in_colab']     = False
environment['dropbox_path'] = /home/me/Data_Linux/Dropbox
environment['lib_path']     = /home/me/Data_Linux/Dropbox/31_Projekte/01_Python/libs
Start Time: 21:41:32


In [2]:
import pandas     as pd 
import bpyth      as bpy

# pandasklar
try:
    import pandasklar as pak 
except ImportError as e:
    !pip install pandasklar
    import pandasklar as pak   
    
# verbose
pak.Config.set('VERBOSE', True)

# copy_on_write
pd.set_option("mode.copy_on_write", True)

VERBOSE = True
--> setting verbose=True as default for all pandasklar functions



## compare_series()

In [3]:
# Show signature and docstring of compare_series (IPython help syntax)
?pak.compare_series

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mcompare_series[0m[0;34m([0m[0ms[0m[0;34m,[0m [0mt[0m[0;34m,[0m [0mformat[0m[0;34m=[0m[0;34m'dict'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Compares the content of two Series.
Returns several indicators of equality as dict or DataFrame:
    name:    same name    
    dtype:   nearly same dtype (Float32 == Float64)
    len:     same shape        
    nnan:    same number of NaNs   
    content: same content, ignoring index and sort
    sort:    same sort order, ignoring index
    eq:      same relations index->data, ignoring sort
    
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/compare.py
[0;31mType:[0m      function


In [4]:
# Generate test data: a random Series, decorated with some NaN values
s = pak.random_series(100, 's').apply(pak.decorate, p=0.1)
s

0         NaN
1        g8bm
2     igHdÜhT
3         NaN
4       ÖqeGa
       ...   
95      ÄZJ9i
96     QpY8qÄ
97       ÄoQ7
98      1G80b
99       Dgse
Name: rnd_string, Length: 100, dtype: object

In [5]:
# Generate compare data
# Play with it: uncomment one of the lines below and re-run the comparison.
# The trailing tag of each line names the indicator it is meant to affect.

t = s.copy()
# t.name = 's' # name
# t = t[:99] # len
# t = t.apply(pak.decorate, p=0.5) # nan
#t = t.astype('object') # dtype

# Active change: swap the first two values. The content stays the same,
# but the order and the index->data relation differ.
t[0], t[1] = t[1], t[0] 
#t = t.sort_values()



In [6]:
# Compare s and t; format='df' returns the indicators as a DataFrame instead of a dict
r = pak.compare_series(s,t, format='df')
r

Unnamed: 0,rnd_string
name,True
dtype,True
len,True
nnan,True
content,True
sort,False
eq,False


## compare_dataframes()

In [7]:
# Show signature and docstring of compare_dataframes (IPython help syntax)
?pak.compare_dataframes

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mcompare_dataframes[0m[0;34m([0m[0mdf1[0m[0;34m,[0m [0mdf2[0m[0;34m,[0m [0mreset_index[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m [0mformat[0m[0;34m=[0m[0;34m'df'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Compares the content of two DataFrames column by column.
Returns several indicators of equality:
    name:    True, left_only or right_only. True means the column exists in both DataFrames.     
    dtype:   columns have same dtype     
    nnan:    columns have same number of NaNs   
    content: columns have same content, ignoring index and sort
    sort:    columns have same sort order, ignoring index
    eq:      columns have same relations index->data, ignoring sort    

* reset_index: Set True to ignore index and sort order or the rows.    
* format:      'DataFrame', 'Series', 'dict' or 'bool' (or abbreviations of this):
               Output format. format='DataFrame' will return detailed i

In [8]:
# Generate test data: a DataFrame of random people.
# Note: the name `s` is reused; from here on it is a DataFrame, not the Series from the compare_series() section.
s = pak.people()
s

Unnamed: 0,first_name,age,age_class,postal_code,birthplace,secret,features,history
0,Dagmar,29,20,49972,Bremen,yzÄZÄqSggä,{o},"[b, b, a, b]"
1,Horst,33,30,,Berlin,it591RäNu,"{y, U, b}","[A, B, C]"
2,Gerhard,29,20,65535,Berlin,iUJHupi,"{r, c, q, 3}","[A, x]"
3,Leonie,31,30,58137,Berlin,1xPaJ,"{R, I, j}","[A, B, C, C]"
4,Chiara,34,30,89168,Bremen,1B4vPe,"{a, M, p, D, w}","[c, b, a]"
...,...,...,...,...,...,...,...,...
95,Sven,22,20,27932,Berlin,BBqOÖC1Rm,"{R, I, j}",[]
96,Karsten,24,20,28944,Berlin,aidxmjZ,"{x, D, G}","[b, b, a, b]"
97,Tanja,22,20,41251,,Oj8vZaoöC9,{o},"[A, x]"
98,Dominik,31,30,23657,,dEVAqÄQ,"{1, 5, c, 4}","[b, b, a, b]"


In [9]:
# Generate compare data
# Play with it: uncomment lines below to provoke other differences.

t = s.copy()
# NOTE(review): the next three toggles were carried over from the Series example
# and may not be meaningful for a DataFrame; verify before enabling.
# t.name = 's' # name
# t = t[:99] # len
# t = t.apply(pak.decorate, p=0.5) # nan
t['age'] = t.age.astype('float') # dtype: same values, different dtype

#t = t.sort_values()  # a DataFrame needs a `by` argument, see the first_name variant below
#t = pak.drop_cols(t, 'age')
t['AAGE'] = 0  # additional column that exists only in t
#t = t.sort_values(['first_name'])
#t.loc[0,'age'] = None
#t = t.head(50)
t= pak.move_cols(t,'age',-1)  # move 'age' to the last position
t



Unnamed: 0,first_name,age_class,postal_code,birthplace,secret,features,history,AAGE,age
0,Dagmar,20,49972,Bremen,yzÄZÄqSggä,{o},"[b, b, a, b]",0,29.0
1,Horst,30,,Berlin,it591RäNu,"{y, U, b}","[A, B, C]",0,33.0
2,Gerhard,20,65535,Berlin,iUJHupi,"{r, c, q, 3}","[A, x]",0,29.0
3,Leonie,30,58137,Berlin,1xPaJ,"{R, I, j}","[A, B, C, C]",0,31.0
4,Chiara,30,89168,Bremen,1B4vPe,"{a, M, p, D, w}","[c, b, a]",0,34.0
...,...,...,...,...,...,...,...,...,...
95,Sven,20,27932,Berlin,BBqOÖC1Rm,"{R, I, j}",[],0,22.0
96,Karsten,20,28944,Berlin,aidxmjZ,"{x, D, G}","[b, b, a, b]",0,24.0
97,Tanja,20,41251,,Oj8vZaoöC9,{o},"[A, x]",0,22.0
98,Dominik,30,23657,,dEVAqÄQ,"{1, 5, c, 4}","[b, b, a, b]",0,31.0


In [10]:
# Output as DataFrame (the default format): one row per column plus a (Total) row
pak.compare_dataframes(s,t)

Unnamed: 0,name,dtype,nnan,content,sort,eq
first_name,True,True,True,True,True,True
age_class,True,True,True,True,True,True
postal_code,True,True,True,True,True,True
birthplace,True,True,True,True,True,True
secret,True,True,True,True,True,True
features,True,True,True,True,True,True
history,True,True,True,True,True,True
age,True,False,True,True,True,True
AAGE,right_only,,,False,,False
(Total),False,False,False,False,False,False


In [11]:
# Output as dict: one overall flag per indicator
pak.compare_dataframes(s,t, format='dict')

{'name': False,
 'dtype': False,
 'nnan': False,
 'content': False,
 'sort': False,
 'eq': False}

In [12]:
# Output as bool: a single overall verdict
pak.compare_dataframes(s,t, format='bool')

False

In [13]:
# This is the same as check_equal
pak.check_equal(s,t)

False

## check_equal()

In [14]:
# Show signature and docstring of check_equal (IPython help syntax)
?pak.check_equal

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mcheck_equal[0m[0;34m([0m[0mobj1[0m[0;34m,[0m [0mobj2[0m[0;34m,[0m [0mreset_index[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Compares the content of two DataFrames column by column.
Two DataFrames are equal, if 
* they have the same shape
* they have the same column names
* and compare_dataframes(format='bool') is True
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/compare.py
[0;31mType:[0m      function


In [15]:
# Eight 8-letter words: every word becomes one row, every letter one cell.
words = [
    'Babykorb',
    'abfällig',
    'Abgründe',
    'Kätzchen',
    'Landwirt',
    'lebendig',
    'Saugrohr',
    'Trugbild',
]

# Build two independent DataFrames with identical content
# (each call gets its own freshly created list of lists).
df1 = pak.dataframe([list(word) for word in words])
df2 = pak.dataframe([list(word) for word in words])

df1

Input rtype=('list', 'list', 'str') shape=(8, 8)
rotated=False Output rtype=('DataFrame', 'Series') shape=(8, 8)
Input rtype=('list', 'list', 'str') shape=(8, 8)
rotated=False Output rtype=('DataFrame', 'Series') shape=(8, 8)


Unnamed: 0,A,B,C,D,E,F,G,H
0,B,a,b,y,k,o,r,b
1,a,b,f,ä,l,l,i,g
2,A,b,g,r,ü,n,d,e
3,K,ä,t,z,c,h,e,n
4,L,a,n,d,w,i,r,t
5,l,e,b,e,n,d,i,g
6,S,a,u,g,r,o,h,r
7,T,r,u,g,b,i,l,d


In [16]:
# Initially the DataFrames are equal (both were built from the same words)
assert pak.check_equal(df1, df2)

In [17]:
# A single changed cell ('L' -> 'R' in column A) makes the DataFrames unequal
mask = df2['A'].eq('L')
df2.loc[mask, 'A'] = 'R'
assert not pak.check_equal(df1, df2)

In [18]:
# Reverting the change ('R' -> 'L' in column A) makes them equal again
mask = df2['A'].eq('R')
df2.loc[mask, 'A'] = 'L'
assert pak.check_equal(df1, df2)

In [19]:
# Change column order and row order:
# move column D to the front, then sort the rows by D (the original index labels are kept)
df2 = pak.move_cols(df2,'D').sort_values('D')
df2

Unnamed: 0,D,A,B,C,E,F,G,H
4,d,L,a,n,w,i,r,t
5,e,l,e,b,n,d,i,g
6,g,S,a,u,r,o,h,r
7,g,T,r,u,b,i,l,d
2,r,A,b,g,ü,n,d,e
0,y,B,a,b,k,o,r,b
3,z,K,ä,t,c,h,e,n
1,ä,a,b,f,l,l,i,g


In [20]:
# Still equal: the different column order and row order do not matter to check_equal here
assert pak.check_equal(df1, df2)

## compare_col_dtype()

In [21]:
# Show signature and docstring of compare_col_dtype (IPython help syntax)
?pak.compare_col_dtype

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mcompare_col_dtype[0m[0;34m([0m[0mdf1[0m[0;34m,[0m [0mdf2[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Returns the column names of two DataFrames whose dtype differs.
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/compare.py
[0;31mType:[0m      function


In [22]:
# s and t are still the people DataFrames from the compare_dataframes() section,
# where t['age'] was cast to float
pak.compare_col_dtype(s, t)

['age']

## get_different_rows()

In [23]:
# Show signature and docstring of get_different_rows (IPython help syntax)
?pak.get_different_rows

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mget_different_rows[0m[0;34m([0m[0mdf1[0m[0;34m,[0m [0mdf2[0m[0;34m,[0m [0mindicator[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Returns the rows of two DataFrames that differ. 
Additional or missing columns are ignored.
Float columns may cause mistakes.
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/compare.py
[0;31mType:[0m      function


In [24]:
# Generate compare data
# Play with it!

t = s.copy()

# change one value
t.loc[0, 'age'] = -1

# change the row order, then move 'age' behind the other columns
t = pak.move_cols(t.sort_values(['first_name']), 'age', -1)

# additional or missing columns are ignored
t['AAGE'] = 0
t = pak.drop_cols(t, 'secret')

t

Unnamed: 0,first_name,age_class,postal_code,birthplace,features,history,age,AAGE
54,Alina,30,79506,Berlin,"{R, B}","[A, B, C, C]",33,0
64,Anja,20,63729,Berlin,"{C, h}","[A, B, C]",25,0
7,Anna,30,92918,,"{m, X, n, 5, P}","[A, C, C, B]",36,0
73,Anna,20,58137,Bremen,"{m, n, F, N}","[A, B, C, C]",26,0
79,Barbara,20,34759,Bremen,"{A, B}","[A, B, C, C]",26,0
...,...,...,...,...,...,...,...,...
67,Ursula,30,87850,Bremen,"{o, D}","[c, b, a]",30,0
26,Vivien,30,33177,Bremen,"{y, T, X, b}",[],31,0
58,Waltraud,30,10090,Bremen,"{1, x}","[b, b, a, b]",31,0
71,Yasmin,20,10069,,"{s, q, F, 0, N}","[A, B, C]",28,0


In [25]:
# Only one value was changed in t (age of row 0), so only that row would be expected here.
# NOTE(review): the stored output flags every row as left_only/right_only, and the
# 'features'/'history' cells show the repr of a whole Series. Presumably the set/list
# columns are not compared row by row. It also still lists 'secret' and 'AAGE',
# although the docstring says additional or missing columns are ignored.
# TODO confirm, and fix or drop these columns before comparing.
pak.get_different_rows(s, t)

Unnamed: 0,first_name,age,age_class,postal_code,birthplace,secret,features,history,AAGE,_merge
0,Alina,33,30,79506,Berlin,440tzä,"0 {o}\n1 {y, U, b}\n...","0 [b, b, a, b]\n1 [A, B, C]\n2 ...",,left_only
1,Anja,25,20,63729,Berlin,Mp7ghapÜ,"0 {o}\n1 {y, U, b}\n...","0 [b, b, a, b]\n1 [A, B, C]\n2 ...",,left_only
2,Anna,26,20,58137,Bremen,BKXnzÜEm,"0 {o}\n1 {y, U, b}\n...","0 [b, b, a, b]\n1 [A, B, C]\n2 ...",,left_only
3,Anna,36,30,92918,,SXuUox8D5,"0 {o}\n1 {y, U, b}\n...","0 [b, b, a, b]\n1 [A, B, C]\n2 ...",,left_only
4,Barbara,26,20,34759,Bremen,ÜbXi1sa5Oe,"0 {o}\n1 {y, U, b}\n...","0 [b, b, a, b]\n1 [A, B, C]\n2 ...",,left_only
...,...,...,...,...,...,...,...,...,...,...
195,Ursula,30,30,87850,Bremen,,"54 {R, B}\n64 {C, h}\n...","54 [A, B, C, C]\n64 [A, B, C]\n7 ...",0.0,right_only
196,Vivien,31,30,33177,Bremen,,"54 {R, B}\n64 {C, h}\n...","54 [A, B, C, C]\n64 [A, B, C]\n7 ...",0.0,right_only
197,Waltraud,31,30,10090,Bremen,,"54 {R, B}\n64 {C, h}\n...","54 [A, B, C, C]\n64 [A, B, C]\n7 ...",0.0,right_only
198,Yasmin,28,20,10069,,,"54 {R, B}\n64 {C, h}\n...","54 [A, B, C, C]\n64 [A, B, C]\n7 ...",0.0,right_only


In [26]:
# pandas' own DataFrame.compare does not work here: it only accepts identically-labeled
# DataFrames (same index and same columns), but t has a different row order and
# different columns than s.
#s.compare(t, align_axis=1,keep_shape=False)