#### Sample Code for the Pachinko Allocation Model (PAM)

In [1]:
import os
import sys

import pandas as pd

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import make_multilabel_classification

# Directory containing the local PAM_Cython checkout. Set the PAM_CYTHON_DIR
# environment variable to point somewhere else; the default is the path this
# notebook originally hard-coded, so existing setups keep working unchanged.
PAM_CYTHON_DIR = os.environ.get('PAM_CYTHON_DIR', 'C:/github/PAM_Cython')

# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if PAM_CYTHON_DIR not in sys.path:
    sys.path.append(PAM_CYTHON_DIR)

from pam_cython import pam

c:\github\PAM_Cython\pam_cython
!! Pachinko Allocation Model!!


***
Toy data preparation

In [2]:
# Synthetic sample-by-word table: one row per sample, one column per word.
n_samples = 100
n_features = 50

X, _ = make_multilabel_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_classes=10,
    random_state=0,
)

# Attach readable labels (sample_0.., word_0..) to the raw array.
X_df = pd.DataFrame(
    X,
    index=[f'sample_{i}' for i in range(n_samples)],
    columns=[f'word_{i}' for i in range(n_features)],
)
X_df

Unnamed: 0,word_0,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8,word_9,...,word_40,word_41,word_42,word_43,word_44,word_45,word_46,word_47,word_48,word_49
sample_0,1.0,2.0,2.0,0.0,0.0,2.0,3.0,0.0,1.0,1.0,...,1.0,2.0,0.0,2.0,1.0,1.0,0.0,1.0,1.0,0.0
sample_1,0.0,4.0,2.0,1.0,0.0,0.0,1.0,0.0,2.0,1.0,...,0.0,0.0,1.0,0.0,1.0,3.0,5.0,4.0,0.0,1.0
sample_2,0.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,0.0,1.0
sample_3,1.0,2.0,3.0,1.0,1.0,4.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,4.0,0.0,0.0,0.0,1.0,1.0,0.0
sample_4,1.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,...,1.0,0.0,2.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sample_95,2.0,0.0,2.0,1.0,0.0,3.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.0,1.0,3.0,3.0,1.0,2.0,0.0,1.0
sample_96,1.0,1.0,0.0,3.0,0.0,0.0,2.0,2.0,1.0,0.0,...,0.0,1.0,1.0,1.0,0.0,1.0,1.0,3.0,0.0,0.0
sample_97,0.0,1.0,2.0,0.0,1.0,3.0,1.0,0.0,1.0,1.0,...,1.0,1.0,3.0,0.0,1.0,0.0,1.0,0.0,4.0,0.0
sample_98,1.0,5.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,5.0,2.0,0.0,0.0


***
Run PAM inference

In [3]:
# Build the model object. S=2 and K=5 line up with the 2-column and 5-column
# topic tables shown in the summary cell; presumably they are the numbers of
# upper- and lower-level topics -- TODO confirm against pam_cython.
dat = pam.PAM(
    S=2,
    K=5,
    alpha0=0.01,
    alpha1=0.01,
    beta=0.1,
    random_state=123,
)

# Hand the sample-by-word frequency table to the model.
dat.freq_df2bw(freq_df=X_df)

# An empty dict is passed for seed_topics, i.e. none are supplied here.
dat.set_params(seed_topics={}, initial_conf=1.0)

dat.inference()

***
Summarize the results

In [4]:
# Wrap the two theta matrices in DataFrames that reuse the sample labels of
# X_df as their row index.
upper_topic = pd.DataFrame(
    dat.get_theta0(),
    index=X_df.index,
)
lower_topic = pd.DataFrame(
    dat.get_theta1(),
    index=X_df.index,
)

# Render both tables, upper-level first.
display(upper_topic, lower_topic)

Unnamed: 0,0,1
sample_0,0.574408,0.425592
sample_1,0.400076,0.599924
sample_2,0.666491,0.333509
sample_3,0.538434,0.461566
sample_4,0.422288,0.577712
...,...,...
sample_95,0.565990,0.434010
sample_96,0.384700,0.615300
sample_97,0.696295,0.303705
sample_98,0.633249,0.366751


Unnamed: 0,0,1,2,3,4
sample_0,0.254938,0.233699,0.128233,0.107104,0.276025
sample_1,0.200042,0.279345,0.299293,0.120580,0.100740
sample_2,0.084560,0.084395,0.057025,0.606995,0.167024
sample_3,0.173304,0.096802,0.077795,0.173265,0.478833
sample_4,0.133915,0.266046,0.023668,0.222045,0.354326
...,...,...,...,...,...
sample_95,0.263753,0.244902,0.076337,0.263714,0.151294
sample_96,0.096916,0.211384,0.211474,0.115974,0.364251
sample_97,0.161006,0.373801,0.090016,0.196453,0.178725
sample_98,0.183469,0.117105,0.200014,0.299398,0.200014
