__[Open and try this file online (Colab)](https://colab.research.google.com/github/djekra/pandasklar/blob/master/jupyter/14_Analyse_Frequencies.ipynb)__

# Analyse Frequencies
* `analyse_freqs`: Frequency analysis that includes a subordinate frequency analysis. Provides e.g. the most important examples per case.

In [1]:
# blab init
try:
    import blab
except ImportError as e:
    !pip install blab
    import blab    
startup_notebook = blab.blab_startup()
%run $startup_notebook 

blab init
environment['in_colab']     = False
environment['dropbox_path'] = /media/me/LinuxDropbox/Dropbox
environment['lib_path']     = /media/me/LinuxDropbox/Dropbox/31_Projekte/01_Python/libs
Start Time: 17:45:02


In [2]:
import pandas     as pd 
import bpyth      as bpy

# pandasklar
try:
    import pandasklar as pak 
except ImportError as e:
    !pip install pandasklar
    import pandasklar as pak   
    
# verbose
#pak.Config.set('VERBOSE', True)

# copy_on_write
pd.set_option("mode.copy_on_write", True)

In [3]:
?pak.analyse_freqs

Signature: pak.analyse_freqs(data, cols=None, limits=[], splits=[], sort_count=True)
Docstring:
Frequency analysis that includes a subordinate frequency analysis. 
Provides e.g. the most important examples per case. Splits strings and lists.
* cols:        Columns to which the analysis is to be applied. Name or list of names.
               If one of the addressed columns is a list, it is exploded.
* limits:      List of limits, corresponding to cols.
               Example: [None,5] does not limit the frequencies of the first col 
               but limits the data for the second col to the 5 most common values.
* splits:      List of split characters, corresponding to cols.
               Used to explode sentences into words
* sort_count:  True =>  result shows the most common contents first
               False => result is sorted by group
See the jupyter notebook for examples.
File:      /media/me/LinuxDropbox/

In [4]:
# Quick one-dimensional frequency analysis without a DataFrame.
# This also works for series, tuples, etc.
# list() splits the sentence into its single characters (blanks included),
# so every character is one item of the analysis.
testlist=list('Lorem ipsum dolor sit amet, consectetuer adipiscing elit.')
# Uncomment to run the same analysis on a tuple instead of a list:
#testlist = tuple(testlist)
pak.analyse_freqs(testlist)

Unnamed: 0,item,item_count,item_percent,graph
0,,7,12.3,######
1,i,6,10.5,#####
2,e,6,10.5,#####
3,t,5,8.8,####
4,s,4,7.0,###
5,o,4,7.0,###
6,r,3,5.3,##
7,m,3,5.3,##
8,c,3,5.3,##
9,d,2,3.5,#


In [5]:
# Create a test DataFrame with five columns of random data.
# NOTE(review): the meaning of the random_series arguments is assumed from
# their names (p_nan presumably the share of missing values, len_max presumably
# the maximum list length) -- confirm against the pandasklar documentation.

# Size argument passed to every random_series call below.
anz = 1000
# Random names, converted to upper case -> column 'first_name'.
v = pak.random_series( anz, 'name',                  p_nan=0   ).str.upper() 
# First character of every name -> column 'letter'.
w = v.str[:1]
# NOTE(review): s is not used in any visible cell; it is kept as is because
# removing the call could change the random data generated afterwards -- confirm.
s = pak.random_series( anz, 'string',                p_nan=0   )
# Random strings with p_nan=0.1 -> column 'string_nan'.
t = pak.random_series( anz, 'string',                p_nan=0.1 )
# Random lists with p_nan=0.1 -> column 'real_list'.
p = pak.random_series( anz, 'list',     len_max=5,   p_nan=0.1 )
# Random lists joined with blanks into one string per row -> column 'word_list'.
q = pak.random_series( anz, 'list',     len_max=5              ).str.join(' ')
# Combine the series into one DataFrame and give the columns descriptive names.
df = pak.dataframe( [w, v, t, p, q] )
df.columns = ['letter','first_name','string_nan','real_list','word_list']
df

Unnamed: 0,letter,first_name,string_nan,real_list,word_list
0,L,LOUISE,nca9j,"[Volker, Kira, Anja]",Christa Hannes
1,C,CHRISTIAN,Eqe8Äew,"[Tom, Tom, Bernd, Marco]",Anja Anja Anna Karin
2,T,TOM,et7A,"[Tanja, Martina, Lucas]",Elisabeth Tom
3,P,PHIL,eÖDtt,"[Maik, Louis]",Tanja Alina
4,M,MICHAEL,Z7KIs9R,,Anja Ulrich Günter
...,...,...,...,...,...
995,L,LOUIS,VDWjnpF,"[Mia, Tom, Tanja, Tim]",Tanja Denis Jürgen
996,J,JAKOB,Phee4qe,"[Anna, Tanja, Ali]",Theo Tom Tom
997,L,LENNARD,,"[Natalie, Sabine, Steffen, Joachim, Gertrud]",Pauline Claudia Bernd Emilie
998,C,CHARLOTTE,fLYwÖ,"[Anja, Michael]",Lilly Christiane Anja


## One column => 1-dimensional analysis

In [6]:
# analyse frequencies of the values in the column 'letter'
pak.analyse_freqs(df, 'letter')

Unnamed: 0,letter,letter_count,letter_percent,graph
0,A,123,12.3,######
1,T,118,11.8,#####
2,M,111,11.1,#####
3,L,69,6.9,###
4,S,68,6.8,###
5,J,57,5.7,##
6,E,51,5.1,##
7,K,48,4.8,##
8,C,43,4.3,##
9,H,36,3.6,#


## Two or more columns => 2-dimensional analysis

In [7]:
# analyse frequencies of the values in the column 'letter', then 'first_name'
a = pak.analyse_freqs(df, ['letter','first_name',])
a

Unnamed: 0,letter,letter_count,letter_percent,first_name,first_name_count
0,A,123,12.3,"[ANJA, ANNA, ANDREA, ALBERT, AMELIE, AXEL, ANT...","[38, 29, 4, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2..."
1,T,118,11.8,"[TOM, TANJA, TORSTEN, TONI, THOMAS, TOBIAS, TI...","[53, 35, 5, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1]"
2,M,111,11.1,"[MAJA, MALTE, MARKO, MARK, MARGARETHE, MARC, M...","[5, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
3,L,69,6.9,"[LARA, LEAH, LOUISE, LUKAS, LUCY, LOUIS, LENA,...","[4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, ..."
4,S,68,6.8,"[SONJA, SABRINA, SUSANNE, SOFIE, SOPHIA, SWEN,...","[6, 4, 4, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, ..."
5,J,57,5.7,"[JOSEFINE, JONATHAN, JÖRG, JOACHIM, JOHN, JOHA...","[4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
6,E,51,5.1,"[EVA, ERICH, ERNA, ERWIN, ELLA, ELENA, ELISABE...","[4, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, ..."
7,K,48,4.8,"[KATJA, KARSTEN, KARIN, KEVIN, KATRIN, KIRA, K...","[4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, ..."
8,C,43,4.3,"[CHRISTOPHER, CHRISTEL, CHRISTA, CURT, CHARLOT...","[4, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, ..."
9,H,36,3.6,"[HENDRIK, HELMUT, HEINRICH, HENRY, HELENA, HOL...","[4, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [8]:
# analyse frequencies of the values in the column 'letter', then 'first_name' and 'string_nan'
# and limit the result
a = pak.analyse_freqs(df, ['letter','first_name','string_nan'], limits=[None,5,3])
a

Unnamed: 0,letter,letter_count,letter_percent,first_name,first_name_count,string_nan,string_nan_count
0,A,123,12.3,"[ANJA, ANNA, ANDREA, ALBERT, AMELIE]","[38, 29, 4, 3, 3]","[<NA>, L5dä, c0ü58L]","[14, 1, 1]"
1,T,118,11.8,"[TOM, TANJA, TORSTEN, TONI, THOMAS]","[53, 35, 5, 4, 4]","[<NA>, et7A, 0SpxaQ]","[9, 1, 1]"
2,M,111,11.1,"[MAJA, MALTE, MARKO, MARK, MARGARETHE]","[5, 4, 4, 4, 3]","[<NA>, Z7KIs9R, iE7ce]","[13, 1, 1]"
3,L,69,6.9,"[LARA, LEAH, LOUISE, LUKAS, LUCY]","[4, 4, 3, 3, 3]","[<NA>, nca9j, tzJOH0]","[11, 1, 1]"
4,S,68,6.8,"[SONJA, SABRINA, SUSANNE, SOFIE, SOPHIA]","[6, 4, 4, 4, 3]","[<NA>, UuzIuXÜ, 5z1i]","[2, 1, 1]"
5,J,57,5.7,"[JOSEFINE, JONATHAN, JÖRG, JOACHIM, JOHN]","[4, 4, 3, 3, 3]","[<NA>, hjoF48, LmeDL]","[5, 1, 1]"
6,E,51,5.1,"[EVA, ERICH, ERNA, ERWIN, ELLA]","[4, 4, 4, 3, 3]","[<NA>, pBoA, Opf1bRb]","[8, 1, 1]"
7,K,48,4.8,"[KATJA, KARSTEN, KARIN, KEVIN, KATRIN]","[4, 3, 3, 3, 3]","[k7ErsVX, RKGOJ, 5ÄjeS]","[1, 1, 1]"
8,C,43,4.3,"[CHRISTOPHER, CHRISTEL, CHRISTA, CURT, CHARLOTTE]","[4, 3, 3, 3, 3]","[<NA>, Eqe8Äew, KÄfE9]","[9, 1, 1]"
9,H,36,3.6,"[HENDRIK, HELMUT, HEINRICH, HENRY, HELENA]","[4, 3, 2, 2, 2]","[<NA>, JOas, 6wF0ew]","[3, 1, 1]"


In [9]:
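# Positions 2 and 3 swapped: apart from the column order nothing changes, because it is
# only a 2-dimensional analysis (every further column is analysed relative to the first column).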
a = pak.analyse_freqs(df, ['letter','string_nan','first_name'], limits=[3,3,5])
a

Unnamed: 0,letter,letter_count,letter_percent,string_nan,string_nan_count,first_name,first_name_count
0,A,123,12.3,"[<NA>, L5dä, c0ü58L]","[14, 1, 1]","[ANJA, ANNA, ANDREA, ALBERT, AMELIE]","[38, 29, 4, 3, 3]"
1,T,118,11.8,"[<NA>, et7A, 0SpxaQ]","[9, 1, 1]","[TOM, TANJA, TORSTEN, TONI, THOMAS]","[53, 35, 5, 4, 4]"
2,M,111,11.1,"[<NA>, Z7KIs9R, iE7ce]","[13, 1, 1]","[MAJA, MALTE, MARKO, MARK, MARGARETHE]","[5, 4, 4, 4, 3]"


## Exploding lists and strings

In [10]:
# lists get exploded
a = pak.analyse_freqs(df, ['real_list','letter'], limits=[None,3] )
a

Unnamed: 0,real_list,real_list_count,real_list_percent,letter,letter_count
0,Tom,305,9.0,"[A, M, T]","[37, 32, 32]"
1,Anja,230,6.8,"[A, M, S]","[31, 28, 19]"
2,Tanja,212,6.2,"[T, A, M]","[28, 28, 27]"
3,Anna,137,4.0,"[A, M, N]","[21, 12, 10]"
4,Mike,13,0.4,"[B, A, T]","[2, 2, 2]"
...,...,...,...,...,...
428,Gustav,1,0.0,L,1
429,Luise,1,0.0,P,1
430,Emil,1,0.0,S,1
431,Manfred,1,0.0,N,1


In [11]:
# lists also get exploded when the list column is the second dimension
a = pak.analyse_freqs(df, ['letter','real_list',])
a

Unnamed: 0,letter,letter_count,letter_percent,real_list,real_list_count
0,A,123,12.3,"[Tom, Anja, Tanja, Anna, None, Heinz, Erwin, M...","[37, 31, 28, 21, 6, 4, 4, 4, 4, 3, 3, 3, 3, 3,..."
1,T,118,11.8,"[Tom, Tanja, Anja, Anna, nan, None, Monika, Ko...","[32, 28, 18, 9, 8, 7, 4, 4, 3, 3, 3, 3, 3, 3, ..."
2,M,111,11.1,"[Tom, Anja, Tanja, Anna, None, nan, Neele, Jul...","[32, 28, 27, 12, 8, 5, 4, 4, 3, 3, 3, 3, 3, 3,..."
3,L,69,6.9,"[Tom, Anja, Tanja, Anna, None, Mathias, Eric, ...","[16, 11, 9, 7, 5, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2..."
4,S,68,6.8,"[Tom, Anja, Tanja, Anna, None, Lukas, Diana, M...","[21, 19, 13, 9, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, ..."
5,J,57,5.7,"[Tom, Tanja, Anja, Anna, Michelle, Paula, Bene...","[19, 16, 15, 9, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, ..."
6,E,51,5.1,"[Tanja, Tom, Anna, Anja, nan, Jennifer, Doris,...","[16, 15, 6, 6, 5, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2..."
7,K,48,4.8,"[Tanja, Anja, Tom, Anna, Bastian, Martin, Edit...","[9, 9, 6, 5, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
8,C,43,4.3,"[Tom, Anja, Tanja, Marco, Dominic, Anna, Sebas...","[20, 12, 7, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2..."
9,H,36,3.6,"[Anja, Tom, Tanja, Dagmar, nan, Käthe, Waltrau...","[7, 6, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, ..."


In [12]:
# word_list (a string) can also be exploded, use the splits argument for this
a = pak.analyse_freqs(df, ['letter','word_list',], splits=[None,' '])
a

Unnamed: 0,letter,letter_count,letter_percent,word_list,word_list_count
0,A,123,12.3,"[Tanja, Tom, Anna, Anja, Hannes, Melissa, Mari...","[34, 34, 24, 21, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
1,T,118,11.8,"[Tom, Tanja, Anja, Anna, Susanne, Herbert, Ant...","[35, 31, 30, 14, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
2,M,111,11.1,"[Tom, Anja, Tanja, Anna, Birgit, Elli, Irmgard...","[31, 25, 14, 5, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, ..."
3,L,69,6.9,"[Tom, Tanja, Anja, Anna, Sigrid, Sabine, Willy...","[23, 17, 13, 7, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, ..."
4,S,68,6.8,"[Tom, Anja, Tanja, Anna, Zoe, Marcel, Jacqueli...","[25, 19, 17, 7, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, ..."
5,J,57,5.7,"[Anja, Tom, Tanja, Anna, Sven, Chiara, Mariann...","[18, 18, 10, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
6,E,51,5.1,"[Tom, Anja, Tanja, Anna, Sophie, Franziska, Ca...","[20, 13, 10, 4, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, ..."
7,K,48,4.8,"[Tom, Anja, Tanja, Anna, Malte, Theo, Carolin,...","[19, 9, 5, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,..."
8,C,43,4.3,"[Tom, Anja, Anna, Tanja, Celine, Arne, Heiko, ...","[23, 10, 7, 5, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1..."
9,H,36,3.6,"[Tanja, Anja, Tom, Carl, Ralf, Anna, Katja, Ma...","[12, 8, 6, 3, 3, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1,..."


In [13]:
# word_list is exploded as first dimension
a = pak.analyse_freqs(df, cols=['word_list','letter'], splits=[' '], limits=[5,3])
a

Unnamed: 0,word_list,word_list_count,word_list_percent,letter,letter_count
0,Tom,315,9.2,"[T, A, M]","[35, 34, 31]"
1,Anja,229,6.7,"[T, M, A]","[30, 25, 21]"
2,Tanja,207,6.0,"[A, T, L]","[34, 31, 17]"
3,Anna,123,3.6,"[A, T, C]","[24, 14, 7]"
4,Albert,13,0.4,"[E, P, K]","[2, 2, 2]"


In [14]:
# string_nan is exploded into single characters
a = pak.analyse_freqs(df, 'string_nan', splits = [''] )
a

Unnamed: 0,string_nan,string_nan_count,string_nan_percent,graph
0,,1788,26.6,#############
1,i,150,2.2,#
2,u,131,1.9,
3,e,128,1.9,
4,o,124,1.8,
...,...,...,...,...
64,M,58,0.9,
65,4,57,0.8,
66,Q,57,0.8,
67,Z,50,0.7,


# Playground