__[Open and try this file online (Colab)](https://colab.research.google.com/github/djekra/pandasklar/blob/master/jupyter/14_Analyse_Frequencies.ipynb)__

# Analyse Frequencies
* `analyse_freqs`: Frequency analysis that includes a subordinate frequency analysis. Provides e.g. the most important examples per case.

In [1]:
# blab init
try:
    import blab
except ImportError as e:
    !pip install blab
    import blab    
startup_notebook = blab.blab_startup()
%run $startup_notebook 

blab init
environment['in_colab']     = False
environment['dropbox_path'] = /home/me/Data_Linux/Dropbox
environment['lib_path']     = /home/me/Data_Linux/Dropbox/31_Projekte/01_Python/libs
Start Time: 21:40:39


In [2]:
import pandas     as pd 
import bpyth      as bpy

# pandasklar
try:
    import pandasklar as pak 
except ImportError as e:
    !pip install pandasklar
    import pandasklar as pak   
    
# verbose
#pak.Config.set('VERBOSE', True)

# copy_on_write
pd.set_option("mode.copy_on_write", True)

In [3]:
?pak.analyse_freqs

Signature: pak.analyse_freqs(data, cols=None, limits=[], splits=[], sort_count=True)
Docstring:
Frequency analysis that includes a subordinate frequency analysis. 
Provides e.g. the most important examples per case. Splits strings and lists.
* cols:        Columns to which the analysis is to be applied. Name or list of names.
               If one of the addressed columns is a list, it is exploded.
* limits:      List of limits, corresponding to cols.
               Example: [None,5] does not limit the frequencies of the first col 
               but limits the data for the second col to the 5 most common values.
* splits:      List of split characters, corresponding to cols.
               Used to e

In [4]:
# Quick one-dimensional frequency analysis without a DataFrame:
# a plain list of single characters is passed directly.
# This also works for series, tuples, etc.
testlist=list('Lorem ipsum dolor sit amet, consectetuer adipiscing elit.')
#testlist = tuple(testlist)
pak.analyse_freqs(testlist)

Unnamed: 0,item,item_count,item_percent,graph
0,,7,12.3,######
1,i,6,10.5,#####
2,e,6,10.5,#####
3,t,5,8.8,####
4,s,4,7.0,###
5,o,4,7.0,###
6,r,3,5.3,##
7,m,3,5.3,##
8,c,3,5.3,##
9,d,2,3.5,#


In [5]:
# Create the test dataframe used by all following examples.
# anz is passed as the first argument to every random_series call
# (presumably the number of values to generate -- confirm against pandasklar).
anz = 1000
# v: random names, converted to upper case
v = pak.random_series( anz, 'name',                  p_nan=0   ).str.upper() 
# w: first character of each name in v
w = v.str[:1]
# s: random strings; NOTE(review): s is not put into df in this cell
s = pak.random_series( anz, 'string',                p_nan=0   )
# t: random strings requested with p_nan=0.1 (the resulting column contains <NA> values)
t = pak.random_series( anz, 'string',                p_nan=0.1 )
# p: random lists, requested with len_max=5 and p_nan=0.1
p = pak.random_series( anz, 'list',     len_max=5,   p_nan=0.1 )
# q: random lists whose elements are joined into one space-separated string
q = pak.random_series( anz, 'list',     len_max=5              ).str.join(' ')
# Combine the series into one dataframe and give the columns descriptive names
df = pak.dataframe( [w, v, t, p, q] )
df.columns = ['letter','first_name','string_nan','real_list','word_list']
df

Unnamed: 0,letter,first_name,string_nan,real_list,word_list
0,A,ANJA,oooKrHä,"[Anja, Herbert, Rudolph, Robert, Jakob]",Axel Rudolph Tanja
1,H,HERBERT,BnLYdDk,"[Barbara, Alexandra, Luis]",Tanja Felix Otto
2,V,VINCENT,lxWpax,"[Tanja, Helga]",Anke Tom Niko Anna Elena
3,A,ALFRED,3o4p,"[Volker, Heinz, Christina, Niklas, Heinrich]",Steffen Anja Angela Josephine Maik
4,W,WOLFGANG,Öhga,"[Anja, Volker]",Hildegard Tanja Anja
...,...,...,...,...,...
995,V,VIKTORIA,YNfoöIü,"[Walter, Tom]",Anna David Tom Michaela
996,M,MATTHIAS,8HJIqu,"[Regina, Dennis, Angela, Ben]",Tom Nele Jasmin Tanja
997,H,HENRI,AZYDuu,"[Raphael, Marko, Lothar, Christel, Jannik]",Marc Jonathan Tanja
998,R,RENATE,ip6U,"[Tanja, Anja, Noah, Ulrike]",Annika Tom Pauline Anja


## One column => 1-dimensional analysis

In [6]:
# Analyse the frequencies of the values in the column 'letter'.
# The result lists each value with its count, its percentage and a text bar graph.
pak.analyse_freqs(df, 'letter')

Unnamed: 0,letter,letter_count,letter_percent,graph
0,A,133,13.3,######
1,T,113,11.3,#####
2,M,105,10.5,#####
3,L,62,6.2,###
4,J,59,5.9,##
5,S,56,5.6,##
6,H,52,5.2,##
7,C,48,4.8,##
8,E,47,4.7,##
9,K,40,4.0,##


## Two or more columns => 2-dimensional analysis

In [7]:
# Analyse the frequencies of the values in the column 'letter',
# then, for each letter, the frequencies of 'first_name'.
# The subordinate results are returned as lists (values and their counts).
a = pak.analyse_freqs(df, ['letter','first_name',])
a

Unnamed: 0,letter,letter_count,letter_percent,first_name,first_name_count
0,A,133,13.3,"[ANJA, ANNA, ANGELA, ANTON, ANNELIESE, ALINA, ...","[42, 17, 5, 5, 4, 4, 4, 3, 3, 3, 3, 3, 3, 2, 2..."
1,T,113,11.3,"[TANJA, TOM, THOMAS, TIMM, THORSTEN, THEO, TON...","[43, 39, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1]"
2,M,105,10.5,"[MARCUS, MEIKE, MERLE, MELANIE, MALTE, MANFRED...","[4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, ..."
3,L,62,6.2,"[LUKAS, LILLY, LINUS, LAURA, LASSE, LOUIS, LEO...","[4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, ..."
4,J,59,5.9,"[JACQUELINE, JANNICK, JULE, JANNIK, JOSEF, JAN...","[4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, ..."
5,S,56,5.6,"[SOPHIA, SASCHA, SILVIA, STEPHAN, STELLA, SINA...","[5, 4, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
6,H,52,5.2,"[HOLGER, HEINZ, HERBERT, HENDRIK, HANNAH, HERM...","[5, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, ..."
7,C,48,4.8,"[CHRISTIANE, CLARA, CHRISTINE, CARINA, CELINE,...","[5, 4, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
8,E,47,4.7,"[ERIK, ERICH, ELSE, ELIAS, EMMA, ERNA, EVA, ER...","[4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, ..."
9,K,40,4.0,"[KONSTANTIN, KARLA, KERSTIN, KLARA, KARSTEN, K...","[4, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, ..."


In [8]:
# Analyse the frequencies of the values in the column 'letter',
# then, for each letter, of 'first_name' and of 'string_nan'.
# limits=[None,5,3] limits the result: no limit for 'letter',
# the 5 most common first names and the 3 most common string_nan values.
a = pak.analyse_freqs(df, ['letter','first_name','string_nan'], limits=[None,5,3])
a

Unnamed: 0,letter,letter_count,letter_percent,first_name,first_name_count,string_nan,string_nan_count
0,A,133,13.3,"[ANJA, ANNA, ANGELA, ANTON, ANNELIESE]","[42, 17, 5, 5, 4]","[<NA>, oooKrHä, 3o4p]","[12, 1, 1]"
1,T,113,11.3,"[TANJA, TOM, THOMAS, TIMM, THORSTEN]","[43, 39, 4, 4, 3]","[<NA>, O0nP, okRCA]","[14, 1, 1]"
2,M,105,10.5,"[MARCUS, MEIKE, MERLE, MELANIE, MALTE]","[4, 4, 4, 3, 3]","[<NA>, öqjDEII, NmTnj]","[10, 1, 1]"
3,L,62,6.2,"[LUKAS, LILLY, LINUS, LAURA, LASSE]","[4, 3, 3, 3, 3]","[<NA>, xSieC, Yzoi]","[3, 1, 1]"
4,J,59,5.9,"[JACQUELINE, JANNICK, JULE, JANNIK, JOSEF]","[4, 4, 4, 3, 3]","[<NA>, JIku, t3KP4u]","[6, 1, 1]"
5,S,56,5.6,"[SOPHIA, SASCHA, SILVIA, STEPHAN, STELLA]","[5, 4, 3, 3, 3]","[<NA>, Aqüs9, ev3QSb]","[5, 1, 1]"
6,H,52,5.2,"[HOLGER, HEINZ, HERBERT, HENDRIK, HANNAH]","[5, 4, 3, 3, 3]","[<NA>, BnLYdDk, vB693u]","[4, 1, 1]"
7,C,48,4.8,"[CHRISTIANE, CLARA, CHRISTINE, CARINA, CELINE]","[5, 4, 3, 3, 2]","[<NA>, pY4v7YK, VHlhTi5]","[4, 1, 1]"
8,E,47,4.7,"[ERIK, ERICH, ELSE, ELIAS, EMMA]","[4, 3, 3, 3, 3]","[<NA>, ISDX8, ouiBkaf]","[4, 1, 1]"
9,K,40,4.0,"[KONSTANTIN, KARLA, KERSTIN, KLARA, KARSTEN]","[4, 3, 3, 3, 3]","[<NA>, Vji4, 5vq9Cn]","[7, 1, 1]"


In [9]:
# Positions 2 and 3 swapped (and limits reordered to match; 'letter' is now limited to 3):
# no big difference => it's only a 2-dimensional analysis!
# The lists per letter are the same as in the previous example, only the column order changes.
a = pak.analyse_freqs(df, ['letter','string_nan','first_name'], limits=[3,3,5])
a

Unnamed: 0,letter,letter_count,letter_percent,string_nan,string_nan_count,first_name,first_name_count
0,A,133,13.3,"[<NA>, oooKrHä, 3o4p]","[12, 1, 1]","[ANJA, ANNA, ANGELA, ANTON, ANNELIESE]","[42, 17, 5, 5, 4]"
1,T,113,11.3,"[<NA>, O0nP, okRCA]","[14, 1, 1]","[TANJA, TOM, THOMAS, TIMM, THORSTEN]","[43, 39, 4, 4, 3]"
2,M,105,10.5,"[<NA>, öqjDEII, NmTnj]","[10, 1, 1]","[MARCUS, MEIKE, MERLE, MELANIE, MALTE]","[4, 4, 4, 3, 3]"


## Exploding lists and strings

In [10]:
# Columns that contain lists get exploded: 'real_list' is the first dimension here,
# so every list element is counted as a value of its own.
# limits=[None,3]: no limit for 'real_list', the 3 most common letters per element.
a = pak.analyse_freqs(df, ['real_list','letter'], limits=[None,3] )
a

Unnamed: 0,real_list,real_list_count,real_list_percent,letter,letter_count
0,Tom,264,7.9,"[A, T, M]","[36, 32, 27]"
1,Anja,203,6.1,"[A, T, M]","[33, 23, 18]"
2,Tanja,195,5.8,"[A, M, J]","[24, 22, 17]"
3,Anna,117,3.5,"[T, A, S]","[19, 13, 11]"
4,Heinz,17,0.5,"[A, K, T]","[3, 3, 2]"
...,...,...,...,...,...
429,Clara,1,0.0,F,1
430,Kirsten,1,0.0,I,1
431,Carolin,1,0.0,S,1
432,Stella,1,0.0,K,1


In [11]:
# Lists also get exploded when the list column is the second (subordinate) dimension:
# for each letter, the elements of 'real_list' are counted individually.
a = pak.analyse_freqs(df, ['letter','real_list',])
a

Unnamed: 0,letter,letter_count,letter_percent,real_list,real_list_count
0,A,133,13.3,"[Tom, Anja, Tanja, Anna, None, nan, Kay, Jenni...","[36, 33, 24, 13, 8, 8, 5, 4, 4, 4, 4, 3, 3, 3,..."
1,T,113,11.3,"[Tom, Anja, Anna, Tanja, None, Marcel, Angela,...","[32, 23, 19, 14, 5, 4, 3, 3, 3, 3, 3, 3, 3, 3,..."
2,M,105,10.5,"[Tom, Tanja, Anja, Anna, nan, None, Henry, Uwe...","[27, 22, 18, 11, 7, 5, 4, 4, 3, 3, 3, 3, 3, 3,..."
3,L,62,6.2,"[Anja, Tanja, Tom, Anna, None, Rudolf, Yvonne,...","[14, 13, 9, 7, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2..."
4,J,59,5.9,"[Tanja, Tom, Anja, None, Anna, Barbara, Phil, ...","[17, 14, 7, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2..."
5,S,56,5.6,"[Tanja, Anna, Tom, Anja, nan, Marco, Sophie, J...","[11, 11, 10, 10, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2,..."
6,H,52,5.2,"[Tom, Anja, Tanja, Anna, Luis, Astrid, Sylvia,...","[15, 13, 13, 9, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
7,C,48,4.8,"[Tom, Anja, Tanja, Anna, None, Renate, Nico, n...","[11, 10, 7, 5, 4, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2..."
8,E,47,4.7,"[Anja, Tanja, Tom, nan, Sabine, Alexandra, Ben...","[11, 9, 8, 5, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2,..."
9,K,40,4.0,"[Tom, Anja, Tanja, Anna, Heinz, Vincent, nan, ...","[13, 10, 9, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2..."


In [12]:
# word_list (a string) can also be exploded; use the splits argument for this.
# splits=[None,' ']: 'letter' is not split, 'word_list' is split at spaces.
a = pak.analyse_freqs(df, ['letter','word_list',], splits=[None,' '])
a

Unnamed: 0,letter,letter_count,letter_percent,word_list,word_list_count
0,A,133,13.3,"[Tom, Anja, Tanja, Anna, Karsten, Swen, Margar...","[41, 32, 25, 20, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,..."
1,T,113,11.3,"[Tanja, Tom, Anja, Anna, Erich, Erwin, Dagmar,...","[31, 28, 27, 18, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
2,M,105,10.5,"[Tom, Tanja, Anja, Anna, Lars, Kevin, Luise, I...","[39, 20, 15, 13, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,..."
3,L,62,6.2,"[Tom, Anja, Tanja, Anna, Thorsten, Alexandra, ...","[25, 16, 12, 7, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, ..."
4,J,59,5.9,"[Tom, Tanja, Anna, Anja, Gerhard, Leon, Christ...","[17, 16, 9, 8, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2..."
5,S,56,5.6,"[Tanja, Tom, Anna, Anja, Fabian, Jannis, Maria...","[19, 17, 11, 10, 4, 3, 3, 3, 2, 2, 2, 2, 2, 2,..."
6,H,52,5.2,"[Tom, Tanja, Anja, Celine, Catharina, Helene, ...","[19, 16, 9, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2..."
7,C,48,4.8,"[Tom, Anna, Anja, Tanja, Chiara, Linus, Josefi...","[15, 13, 12, 5, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
8,E,47,4.7,"[Anja, Tanja, Tom, Anna, Martha, Louisa, Anton...","[15, 11, 8, 8, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2..."
9,K,40,4.0,"[Tom, Anja, Tanja, Anna, Antonia, Louisa, Mark...","[15, 11, 7, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1..."


In [13]:
# word_list is exploded as the first dimension.
# splits=[' '] has a single entry, which lines up with the first column 'word_list'.
# limits=[5,3]: the 5 most common words, and the 3 most common letters for each word.
a = pak.analyse_freqs(df, cols=['word_list','letter'], splits=[' '], limits=[5,3])
a

Unnamed: 0,word_list,word_list_count,word_list_percent,letter,letter_count
0,Tom,322,9.2,"[A, M, T]","[41, 39, 28]"
1,Tanja,230,6.6,"[T, A, M]","[31, 25, 20]"
2,Anja,221,6.3,"[A, T, L]","[32, 27, 16]"
3,Anna,146,4.2,"[A, T, M]","[20, 18, 13]"
4,Max,14,0.4,"[A, M, T]","[3, 3, 1]"


In [14]:
# string_nan is exploded into single characters by passing an empty split string.
# NOTE(review): the most frequent item in the result is blank -- presumably empty
# strings produced by the split and/or missing values; confirm.
a = pak.analyse_freqs(df, 'string_nan', splits = [''] )
a

Unnamed: 0,string_nan,string_nan_count,string_nan_percent,graph
0,,1802,26.8,#############
1,o,162,2.4,#
2,a,161,2.4,#
3,i,148,2.2,#
4,e,141,2.1,#
...,...,...,...,...
64,N,52,0.8,
65,K,51,0.8,
66,L,50,0.7,
67,3,50,0.7,
