__[Open and try this file online (Colab)](https://colab.research.google.com/github/djekra/pandasklar/blob/master/jupyter/15_Analyse_Redundancy.ipynb)__

# Analyse uniqueness, discrepancies and redundancy
* `analyse_groups`: Analyses a DataFrame for uniqueness and redundancy.
* `same_but_different`: Returns the rows of a DataFrame that are the same on the one hand and different on the other: They are the same in the fields named in same. And they differ in the field named in different. This is useful for analysing whether fields correlate 100% with each other or are independent.

In [63]:
# blab init
try:
    import blab
except ImportError as e:
    !pip install blab
    import blab    
startup_notebook = blab.blab_startup()
%run $startup_notebook 

blab init
environment['in_colab']     = False
environment['dropbox_path'] = /media/me/LinuxDropbox/Dropbox
environment['lib_path']     = /media/me/LinuxDropbox/Dropbox/31_Projekte/01_Python/libs
Start Time: 21:55:46


In [64]:
import pandas     as pd 
import bpyth      as bpy

# pandasklar
try:
    import pandasklar as pak 
except ImportError as e:
    !pip install pandasklar
    import pandasklar as pak   
    
# verbose
#pak.Config.set('VERBOSE', True)

# copy_on_write
pd.set_option("mode.copy_on_write", True)

In [65]:
# Generate random data
# Builds one random Series per column and combines them into the demo DataFrame `df`
# that the following cells analyse.
anz = 10   # passed as first argument to every random_series call; the resulting df has 10 rows
v = pak.random_series( anz, 'name',                  p_nan=0)
w = v.str[:1]   # first character of each name -> fully derived from v (becomes 'firstletter')
g = pak.random_series( anz, 'int',   min=2, max=7 ) * 10   # scaled by 10 to form 'age_class'

s = pak.random_series( anz, 'string',                p_nan=0)
o = pak.random_series( anz, 'choice', choice=['Bremen','Berlin','Hamburg'], p_nan=0.2   )
p = pak.random_series( anz, 'choice', choice=['cats','dogs']   )
a = pak.random_series( anz, 'int',   min=0, max=anz*10, p_dup=0 ) # there will be no dups
# NOTE(review): with anz == 10, max=anz-10 evaluates to 0, i.e. min == max == 0.
# The output below shows int_rough == 0 in every row, so all values of b are identical.
b = pak.random_series( anz, 'int',   min=0, max=anz-10          ) # there will be 10 dups

# Column order here must match the order of the Series in the list above.
df = pak.dataframe( [ v, w, g, s, o, p, a, b] )
df.columns = ['first_name','firstletter','age_class','secret','city','loves','int_fine','int_rough',]
df

Unnamed: 0,first_name,firstletter,age_class,secret,city,loves,int_fine,int_rough
0,Tanja,T,70,PAWzUtf,Bremen,cats,80,0
1,Yasmin,Y,40,Äh8e,Hamburg,dogs,95,0
2,Mika,M,60,opIbu5l,Berlin,cats,94,0
3,Uwe,U,50,PÄlw8,Berlin,dogs,72,0
4,Jacqueline,J,20,EP6m0d,Berlin,cats,20,0
5,Irmgard,I,30,7ZÄGC,Bremen,cats,37,0
6,Nicole,N,60,äWm2ES,Hamburg,cats,22,0
7,Rudolph,R,70,RQiAao,Hamburg,dogs,76,0
8,Beate,B,70,Nd3D3,Bremen,dogs,1,0
9,Josef,J,30,üzlUj,Berlin,cats,14,0


## analyse_groups

In [66]:
?pak.analyse_groups

[31mSignature:[39m pak.analyse_groups(df, exclude=[], tiefe_max=[32m3[39m)
[31mDocstring:[39m
Analyses a DataFrame for uniqueness and redundancy.
Groups by many combinations of columns and counts the duplicates that are created in the process.
Interpretation:
0 dups => This combination of columns is unique
Same number of dups than other combination of columns => Indication of redundancy
[31mFile:[39m      /media/me/LinuxDropbox/Dropbox/31_Projekte/01_Python/88_PyCharm/pandasklar/src/pandasklar/analyse.py
[31mType:[39m      function

In [67]:
# Analyse for uniqueness and redundancy
# NOTE: `a` is rebound here; it previously named the Series that became the
# 'int_fine' column. That Series is already stored in df, so nothing is lost.
a = pak.analyse_groups(df)
a

Unnamed: 0,columns,level,dups_abs,dups_rel
0,[first_name],1,0,0.0
1,[secret],1,0,0.0
2,[int_fine],1,0,0.0
3,"[firstletter, age_class]",2,0,0.0
4,[firstletter],1,1,0.1
5,"[firstletter, city]",2,1,0.1
6,"[firstletter, loves]",2,1,0.1
7,"[firstletter, int_rough]",2,1,0.1
8,"[age_class, city]",2,1,0.1
9,"[age_class, loves]",2,3,0.3


_Interpretation:_ 
 * column `int_fine` uniquely identifies all records
 * column `secret` uniquely identifies all records
 * columns `firstletter` and `age_class` uniquely identify all records together (this depends on the random data)
 * column `firstletter` is redundant to `first_name`

## same_but_different

In [68]:
?pak.same_but_different

[31mSignature:[39m pak.same_but_different(df, same, different, sort=[38;5;28;01mTrue[39;00m, return_mask=[38;5;28;01mFalse[39;00m)
[31mDocstring:[39m
Returns the rows of a DataFrame that are the same on the one hand and different on the other:
They are the same in the fields named in same.
And they differ in the field named in different.
This is useful for analysing whether fields correlate 100% with each other or are independent.
* same:       Array of column names.
* different:  Single column name.  This column is used to search for differences.
[31mFile:[39m      /media/me/LinuxDropbox/Dropbox/31_Projekte/01_Python/88_PyCharm/pandasklar/src/pandasklar/analyse.py
[31mType:[39m      function

In [69]:
# There is one discrepancy in the dataframe
# Eight rows built from the letters of 'Packesel' (one letter per column).
# Only the fourth row deviates: its last letter is '#' instead of 'l'.

df2 = pak.dataframe( [ list('Packesel'), 
                      list('Packesel'), 
                      list('Packesel'), 
                      list('Packese#'), 
                      list('Packesel'), 
                      list('Packesel'), 
                      list('Packesel'),       
                      list('Packesel'),                     
                ] )
df2

Unnamed: 0,A,B,C,D,E,F,G,H
0,P,a,c,k,e,s,e,l
1,P,a,c,k,e,s,e,l
2,P,a,c,k,e,s,e,l
3,P,a,c,k,e,s,e,#
4,P,a,c,k,e,s,e,l
5,P,a,c,k,e,s,e,l
6,P,a,c,k,e,s,e,l
7,P,a,c,k,e,s,e,l


In [70]:
# no discrepancy in column E
pak.same_but_different(df2, ['A','B','C','D'], 'E')

Unnamed: 0,A,B,C,D,E,F,G,H


In [71]:
# but in column H
pak.same_but_different(df2, ['A','B','C','D'], 'H')

Unnamed: 0,A,B,C,D,E,F,G,H
0,P,a,c,k,e,s,e,l
1,P,a,c,k,e,s,e,l
2,P,a,c,k,e,s,e,l
3,P,a,c,k,e,s,e,#
4,P,a,c,k,e,s,e,l
5,P,a,c,k,e,s,e,l
6,P,a,c,k,e,s,e,l
7,P,a,c,k,e,s,e,l


In [72]:
# Another example with the DataFrame from above
# (if you don't see any result, run the notebook again to generate different random data)
pak.same_but_different( df, same=['age_class','city'], different='loves' )

Unnamed: 0,first_name,firstletter,age_class,secret,city,loves,int_fine,int_rough
0,Tanja,T,70,PAWzUtf,Bremen,cats,80,0
8,Beate,B,70,Nd3D3,Bremen,dogs,1,0


In [73]:
df

Unnamed: 0,first_name,firstletter,age_class,secret,city,loves,int_fine,int_rough
0,Tanja,T,70,PAWzUtf,Bremen,cats,80,0
1,Yasmin,Y,40,Äh8e,Hamburg,dogs,95,0
2,Mika,M,60,opIbu5l,Berlin,cats,94,0
3,Uwe,U,50,PÄlw8,Berlin,dogs,72,0
4,Jacqueline,J,20,EP6m0d,Berlin,cats,20,0
5,Irmgard,I,30,7ZÄGC,Bremen,cats,37,0
6,Nicole,N,60,äWm2ES,Hamburg,cats,22,0
7,Rudolph,R,70,RQiAao,Hamburg,dogs,76,0
8,Beate,B,70,Nd3D3,Bremen,dogs,1,0
9,Josef,J,30,üzlUj,Berlin,cats,14,0


In [85]:
# Rebinds df: from here on df is the pak.people sample data,
# no longer the random DataFrame generated at the top of the notebook.
# NOTE(review): seed=42 presumably makes the generated data reproducible - confirm in pak.people.
df = pak.people(20, seed=42)
df

Unnamed: 0,first_name,age,age_class,postal_code,birthplace,secret,features,history
0,Tina,36,30,54482.0,,iBuOBÖY,"{A, 3, j, P}","[A, C, C, B]"
1,Tanja,32,30,36790.0,,6sHrFoaH2Z,"{c, V, u}","[A, B, C, C]"
2,Renate,42,40,96188.0,,CräÖlgo,"{2, R}","[b, b, a, b]"
3,Charlotte,30,30,82124.0,,u2iXW7,"{Z, T, y}","[A, B, C]"
4,Kathrin,22,20,96769.0,Bremen,GiboaIRoL,"{o, K, y, 2, p}","[a, b, c]"
5,Edith,33,30,13709.0,Bremen,u6aHÜwÜn,{},"[c, b, a]"
6,Felix,28,20,44578.0,,MÜztVueüaP,"{o, V}","[A, x]"
7,Dieter,34,30,14748.0,Berlin,coUNEhEu,"{g, 3, v}",[]
8,Regina,29,20,35289.0,Bremen,kÄiaq,"{O, T, y, u}","[A, A, A]"
9,Carla,31,30,19435.0,Bremen,8ivHü2,"{z, f, S, P}","[A, C, C, B]"


In [87]:
# Rows that agree in first_name and age_class but differ in birthplace.
result = pak.same_but_different( df, same=['first_name','age_class'], different='birthplace' )

In [88]:
dict(result)

{'first_name': 4     Kathrin
 15    Kathrin
 Name: first_name, dtype: string,
 'age': 4     22
 15    24
 Name: age, dtype: Int8,
 'age_class': 4     20
 15    20
 Name: age_class, dtype: Int8,
 'postal_code': 4     96769
 15    54482
 Name: postal_code, dtype: Int32,
 'birthplace': 4     Bremen
 15    Berlin
 Name: birthplace, dtype: string,
 'secret': 4      GiboaIRoL
 15    gmhZRnFyya
 Name: secret, dtype: string,
 'features': 4     {o, K, y, 2, p}
 15       {p, O, e, R}
 Name: features, dtype: object,
 'history': 4        [a, b, c]
 15    [A, B, C, C]
 Name: history, dtype: object}

# Playground

In [74]:
import pytest
import pandas as pd
import numpy as np
import bpyth as bpy
from pandasklar.analyse import col_names, nnan, any_nan, nan_rows
from pandasklar.analyse import val_most, nunique, ntypes, analyse_freqs, sort_cols_by_nunique
from pandasklar.analyse import same_but_different
from pandasklar.content import people, random_numbers
from pandasklar.pandas import dataframe

In [94]:
        # Minimal example for same_but_different:
        #   rows with a == 1, b == 1.0 all have c == 3.0      -> no discrepancy
        #   rows with a == 2, b == 1.0 have c == 4.0 and 3.0  -> discrepancy
        # The cells below show the result: shape (2, 3) and c == [4.0, 3.0].
        df = pd.DataFrame({'a': [1, 1, 2, 2],
                           'b': [1.0, 1.0, 1.0, 1.0],
                           'c': [3.0, 3.0, 4.0, 3.0]})
        result = same_but_different(df, same=['a', 'b'], different='c')

In [99]:
result.shape

(2, 3)

In [98]:
result['c'].tolist()

[4.0, 3.0]