# Analyse Frequencies
* `analyse_freqs`: Frequency analysis that includes a subordinate frequency analysis. Provides e.g. the most important examples per case.

In [1]:
# blab init
try:
    import blab
except ImportError as e:
    !pip install blab
    import blab    
startup_notebook = blab.blab_startup()
%run $startup_notebook 

In [3]:
import pandas     as pd 
import bpyth      as bpy

# pandasklar
try:
    import pandasklar as pak 
except ImportError as e:
    !pip install pandasklar
    import pandasklar as pak   
    
# verbose
#pak.Config.set('VERBOSE', True)

grid = pak.grid

VERBOSE = True
--> setting verbose=True as default for all pandasklar functions

time: 10.8 s


In [7]:
# Show the signature and docstring of analyse_freqs (IPython `?` help syntax).
?pak.analyse_freqs

time: 63.7 ms


Signature: pak.analyse_freqs(data, cols=None, limits=[], splits=[], sort_count=True)
Docstring:
Frequency analysis that includes a subordinate frequency analysis.
Provides the most important examples of a disjunctive distribution.
* cols:        Columns to which the analysis is to be applied. Name or list of names.
               If one of the addressed columns is a list, it is exploded.
* limits:      List of limits, corresponding to cols.
               Example: [None,5] does not limit the frequencies of the first col 
               but limits the data for the second col to the 5 most common values.
* splits:      List of split characters, corresponding to cols.
               Used to explode sent

In [8]:
# Quick one-dimensional frequency analysis without a DataFrame:
# the input is a plain Python list holding the single characters of a sentence.
# This also works for series, tuples, etc.
testlist=list('Lorem ipsum dolor sit amet, consectetuer adipiscing elit.')
# Uncomment the next line to run the same analysis on a tuple instead of a list.
#testlist = tuple(testlist)
pak.analyse_freqs(testlist)

Unnamed: 0,item,item_count,item_percent,graph
0,,7,12.3,######
1,i,6,10.5,#####
2,e,6,10.5,#####
3,t,5,8.8,####
4,s,4,7.0,###
5,o,4,7.0,###
6,r,3,5.3,##
7,m,3,5.3,##
8,c,3,5.3,##
9,d,2,3.5,#


time: 85 ms


In [9]:
# Create test dataframe
# anz: number of elements requested for every random series.
anz = 1000
# v: random series of kind 'name', converted to upper case.
# w: the first character of each entry of v.
v = pak.random_series( anz, 'name',                  p_nan=0   ).str.upper() 
w = v.str[:1]
# s, t: random series of kind 'string'; t is created with p_nan=0.1
# (presumably the share of missing values -- confirm in the pandasklar docs).
# NOTE(review): s is not part of the DataFrame built in this cell.
s = pak.random_series( anz, 'string',                p_nan=0   )
t = pak.random_series( anz, 'string',                p_nan=0.1 )
# p: random series of kind 'list' (len_max=5), kept as lists.
# q: random series of kind 'list' whose elements are joined with ' ' into one string per row.
p = pak.random_series( anz, 'list',     len_max=5,   p_nan=0.1 )
q = pak.random_series( anz, 'list',     len_max=5              ).str.join(' ')
# Combine the series into one DataFrame and give the columns descriptive names
# (same order as the series: w, v, t, p, q).
df = pak.dataframe( [w, v, t, p, q] )
df.columns = ['letter','first_name','string_nan','real_list','word_list']
df

Unnamed: 0,letter,first_name,string_nan,real_list,word_list
0,M,MARIE,q8fCql6,"[Arne, Bastian, Anja]",Bastian Annica
1,T,TANJA,WüIT,"[Ursula, Niels, None]",Stella Doris Martin Jannick
2,K,KLARA,,"[Tom, Marion, Anja]",Lennart Steffen Michaela Anja
3,K,KONSTANTIN,oYuUfen,"[Paul, Martina, Erik, Frida]",Nick Stephan Daniela Karin Alina
4,R,RAINER,hPU2s,"[Hendrik, Anja, Cornelia, Gustav, Andrea]",Bruno Evelyn Renate Mathias
...,...,...,...,...,...
995,A,ANNA,X1eK5,"[Anja, Kerstin, Arthur]",Mia Kirsten
996,A,ANNA,rÄGK,"[Anna, Frida]",Kurt Victoria Maike Anja
997,J,JUTTA,17tLicY,"[Dominic, Christopher, Catharina, Tanja]",Erwin Oliver Tanja Kim Dominik
998,L,LUCA,GekCYe,"[Edith, Simon]",Lasse Tanja


time: 1.06 s


## One column => 1-dimensional analysis

In [10]:
# Analyse the frequencies of the values in the column 'letter'.
# With a single column the result shows, per value, its count, its percentage
# and a text bar in the 'graph' column.
pak.analyse_freqs(df, 'letter')

Unnamed: 0,letter,letter_count,letter_percent,graph
0,A,122,12.2,######
1,M,103,10.3,#####
2,T,98,9.8,####
3,L,69,6.9,###
4,J,63,6.3,###
5,S,61,6.1,###
6,K,48,4.8,##
7,C,48,4.8,##
8,H,48,4.8,##
9,E,44,4.4,##


time: 41 ms


## Two or more columns => 2-dimensional analysis

In [11]:
# Analyse the frequencies of the values in the column 'letter', then 'first_name'
# as the subordinate column.
a = pak.analyse_freqs(df, ['letter','first_name',])
# Display the result with pak.grid (aliased as grid).
grid(a)

24 rows




time: 280 ms


In [12]:
# Analyse the frequencies of the values in the column 'letter', then 'first_name' and 'string_nan',
# and limit the result. The entries of limits correspond to cols:
# None does not limit 'letter', 5 keeps the 5 most common values of 'first_name',
# 3 keeps the 3 most common values of 'string_nan'.
a = pak.analyse_freqs(df, ['letter','first_name','string_nan'], limits=[None,5,3])
a

Unnamed: 0,letter,letter_count,letter_percent,first_name,first_name_count,string_nan,string_nan_count
0,A,122,12.2,"[ANJA, ANNA, ADRIAN, ANNIKA, ALBERT]","[41, 19, 4, 4, 3]","[<NA>, Ass6, uo5vyr]","[7, 1, 1]"
1,M,103,10.3,"[MICHELLE, MARIE, MARC, MARCO, MARCEL]","[5, 4, 4, 4, 3]","[<NA>, q8fCql6, dfjO]","[9, 1, 1]"
2,T,98,9.8,"[TOM, TANJA, THERESA, TONI, TOBIAS]","[39, 33, 3, 3, 3]","[<NA>, WüIT, uf2z]","[5, 1, 1]"
3,L,69,6.9,"[LOTHAR, LUIS, LUISE, LENNART, LARA]","[4, 4, 3, 3, 3]","[<NA>, pqVf6S, aNiIyS]","[4, 1, 1]"
4,J,63,6.3,"[JOHANNES, JENNIFER, JÖRG, JENS, JACQUELINE]","[4, 4, 3, 3, 3]","[<NA>, eäq4I, ÜJäpU]","[7, 1, 1]"
5,S,61,6.1,"[SARAH, SIEGFRIED, SONJA, STEPHAN, STEFFEN]","[4, 4, 4, 4, 4]","[<NA>, ÄÖxpö, ePaGOyM]","[11, 1, 1]"
6,K,48,4.8,"[KARIN, KARL, KAY, KERSTIN, KATHRIN]","[4, 4, 3, 3, 3]","[<NA>, oYuUfen, ZNkvDs]","[2, 1, 1]"
7,C,48,4.8,"[CHRISTEL, CATHARINA, CHRISTIAN, CHARLOTTE, CARL]","[4, 4, 3, 3, 3]","[<NA>, Pwör, 1tsölR]","[4, 1, 1]"
8,H,48,4.8,"[HERTHA, HANNA, HEIDI, HERMANN, HENDRIK]","[4, 4, 4, 3, 3]","[<NA>, eQ0cDÜ, KJmxc]","[5, 1, 1]"
9,E,44,4.4,"[EILEEN, ERIK, ERWIN, EVELYN, EMIL]","[4, 3, 3, 3, 3]","[<NA>, 8zöänb1, Bh8sj3]","[2, 1, 1]"


time: 218 ms


In [13]:
# Position 2 and 3 swapped: no big difference => it's only a 2-dimensional analysis!
# The limits are reordered to match the swapped columns, and the first entry (3)
# now also limits 'letter'.
a = pak.analyse_freqs(df, ['letter','string_nan','first_name'], limits=[3,3,5])
a

Unnamed: 0,letter,letter_count,letter_percent,string_nan,string_nan_count,first_name,first_name_count
0,A,122,12.2,"[<NA>, Ass6, uo5vyr]","[7, 1, 1]","[ANJA, ANNA, ADRIAN, ANNIKA, ALBERT]","[41, 19, 4, 4, 3]"
1,M,103,10.3,"[<NA>, q8fCql6, dfjO]","[9, 1, 1]","[MICHELLE, MARIE, MARC, MARCO, MARCEL]","[5, 4, 4, 4, 3]"
2,T,98,9.8,"[<NA>, WüIT, uf2z]","[5, 1, 1]","[TOM, TANJA, THERESA, TONI, TOBIAS]","[39, 33, 3, 3, 3]"


time: 104 ms


## Exploding lists and strings

In [14]:
# Lists get exploded: 'real_list' holds lists, and every list element is counted as its own value.
# Here the list column is the first dimension; 'letter' is limited to its 3 most common values.
a = pak.analyse_freqs(df, ['real_list','letter'], limits=[None,3] )
a

Unnamed: 0,real_list,real_list_count,real_list_percent,letter,letter_count
0,Tom,277,8.2,"[A, M, T]","[36, 27, 22]"
1,Anja,235,6.9,"[M, T, A]","[29, 29, 23]"
2,Tanja,214,6.3,"[A, M, T]","[30, 23, 20]"
3,Anna,141,4.2,"[A, L, M]","[22, 16, 13]"
4,Annika,14,0.4,"[I, A, M]","[2, 2, 2]"
...,...,...,...,...,...
430,Artur,1,0.0,J,1
431,Manuela,1,0.0,W,1
432,Benjamin,1,0.0,C,1
433,Jörg,1,0.0,T,1


time: 217 ms


In [15]:
# Lists also get exploded when the list column is the second (subordinate) dimension.
# No limits are passed in this call.
a = pak.analyse_freqs(df, ['letter','real_list',])
a

Unnamed: 0,letter,letter_count,letter_percent,real_list,real_list_count
0,A,122,12.2,"[Tom, Tanja, Anja, Anna, None, nan, Nils, Astr...","[36, 30, 23, 22, 8, 7, 4, 4, 4, 4, 4, 4, 3, 3,..."
1,M,103,10.3,"[Anja, Tom, Tanja, Anna, None, nan, Kathrin, L...","[29, 27, 23, 13, 10, 5, 5, 4, 3, 3, 3, 3, 3, 3..."
2,T,98,9.8,"[Anja, Tom, Tanja, Anna, None, Torsten, Paula,...","[29, 22, 20, 10, 6, 3, 3, 3, 3, 3, 3, 3, 2, 2,..."
3,L,69,6.9,"[Anja, Anna, Tom, Tanja, None, Mathias, Elke, ...","[19, 16, 15, 12, 6, 3, 3, 3, 3, 2, 2, 2, 2, 2,..."
4,J,63,6.3,"[Tom, Anja, Tanja, Anna, Christina, Elly, Manf...","[21, 17, 17, 7, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, ..."
5,S,61,6.1,"[Tom, Tanja, Anja, Anna, nan, None, Oskar, Ral...","[19, 18, 10, 8, 4, 4, 3, 2, 2, 2, 2, 2, 2, 2, ..."
6,K,48,4.8,"[Tom, Anja, Tanja, Anna, Herbert, Angela, Mari...","[19, 11, 9, 4, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2..."
7,C,48,4.8,"[Anja, Tom, Tanja, Anna, Fynn, Ali, Jana, Rita...","[15, 9, 6, 6, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,..."
8,H,48,4.8,"[Anja, Tanja, Tom, Anna, Henry, Marko, Emilie,...","[11, 10, 9, 8, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2..."
9,E,44,4.4,"[Tom, Anja, Anna, Bianca, Georg, Lea, Robin, T...","[15, 11, 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2..."


time: 115 ms


In [16]:
# word_list (a string) can also be exploded; use the splits argument for this.
# The entries of splits correspond to cols: None for 'letter' (no split character),
# ' ' as the split character for 'word_list'.
a = pak.analyse_freqs(df, ['letter','word_list',], splits=[None,' '])
a

Unnamed: 0,letter,letter_count,letter_percent,word_list,word_list_count
0,A,122,12.2,"[Anja, Anna, Tom, Tanja, Niclas, Stephanie, Be...","[32, 28, 27, 20, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3,..."
1,M,103,10.3,"[Tom, Anja, Tanja, Anna, Lara, Elli, Peter, Ma...","[27, 23, 22, 10, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2,..."
2,T,98,9.8,"[Tom, Tanja, Anja, Anna, Rosemarie, Willi, Mar...","[30, 24, 20, 15, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2,..."
3,L,69,6.9,"[Tanja, Tom, Anja, Anna, Emil, Frida, Larissa,...","[21, 18, 9, 5, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2..."
4,J,63,6.3,"[Tom, Anja, Tanja, Anna, Sylvia, Werner, Nele,...","[16, 15, 12, 6, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, ..."
5,S,61,6.1,"[Tom, Tanja, Anja, Hanna, Anna, Stella, Sandra...","[22, 21, 10, 5, 5, 3, 3, 3, 2, 2, 2, 2, 2, 2, ..."
6,K,48,4.8,"[Tom, Anja, Tanja, Anna, Kim, Steffen, Jasmin,...","[21, 14, 8, 7, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2..."
7,C,48,4.8,"[Anja, Tanja, Tom, Anna, Jonathan, Hannes, Max...","[13, 12, 10, 5, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, ..."
8,H,48,4.8,"[Anja, Tom, Anna, Tanja, Louise, Sigrid, Frank...","[14, 11, 11, 6, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, ..."
9,E,44,4.4,"[Anja, Tom, Anna, Tanja, Barbara, Erik, Hannel...","[12, 11, 9, 8, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2..."


time: 128 ms


In [17]:
# word_list is exploded as the first dimension.
# splits has a single entry (' ') for 'word_list'; limits=[5,3] keeps the 5 most
# common words and, for each of them, the 3 most common letters.
a = pak.analyse_freqs(df, cols=['word_list','letter'], splits=[' '], limits=[5,3])
a

Unnamed: 0,word_list,word_list_count,word_list_percent,letter,letter_count
0,Tom,291,8.1,"[T, A, M]","[30, 27, 27]"
1,Anja,238,6.6,"[A, M, T]","[32, 23, 20]"
2,Tanja,232,6.4,"[T, M, S]","[24, 22, 21]"
3,Anna,147,4.1,"[A, T, H]","[28, 15, 11]"
4,Sofie,13,0.4,"[S, L, T]","[2, 2, 2]"


time: 98.6 ms


In [18]:
# string_nan is exploded into single characters:
# an empty string is passed as the split character.
a = pak.analyse_freqs(df, 'string_nan', splits = [''] )
a

Unnamed: 0,string_nan,string_nan_count,string_nan_percent,graph
0,,1852,26.6,#############
1,a,141,2.0,#
2,i,140,2.0,#
3,u,140,2.0,#
4,o,135,1.9,
...,...,...,...,...
64,f,60,0.9,
65,O,59,0.8,
66,g,58,0.8,
67,l,53,0.8,


time: 71 ms
