#### Sample Code for Pachinko Allocation Model (PAM)

In [1]:
import pandas as pd

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import make_multilabel_classification

import sys
# Make the `pam_cython` package importable by adding its parent directory
# to the module search path.
# NOTE(review): this is a machine-specific absolute Windows path (presumably a
# local checkout of the PAM_Cython repository) - adjust it before running elsewhere.
sys.path.append('C:/github/PAM_Cython')
from pam_cython import pam

c:\github\PAM_Cython\pam_cython
!! Pachinko Allocation Model!!


***
##### Toy data preparation
- More information is available in the [LDA repository](https://github.com/lda-project/lda).
- Samples (titles) are in rows and vocabulary terms are in columns.

In [2]:
import lda
import lda.datasets

# Load the Reuters toy corpus shipped with the `lda` package:
# the count matrix, the vocabulary (column labels) and the titles (row labels).
X = lda.datasets.load_reuters()
vocab = lda.datasets.load_reuters_vocab()
titles = lda.datasets.load_reuters_titles()

# Wrap the matrix in a labelled DataFrame: one row per title, one column per term.
X_df = pd.DataFrame(X, index=titles, columns=vocab)

# Show the (documents, terms) shape as the cell output. The original evaluated
# `X.shape` in the middle of the cell, where the value was silently discarded.
X_df.shape

***
##### Conduct PAM

In [3]:
# Sizes of the two topic levels: S is used for the upper-level topic labels and
# K for the lower-level topic labels when the results are summarized.
S, K = 2, 5

# Build the model.
# NOTE(review): alpha0 / alpha1 / beta look like Dirichlet prior strengths -
# confirm against the pam_cython implementation.
dat = pam.PAM(
    S=S,
    K=K,
    alpha0=0.01,
    alpha1=0.01,
    beta=0.1,
    random_state=123,
)

# Hand over the frequency table, configure the run without any seed topics,
# then run inference.
dat.freq_df2bw(freq_df=X_df)
dat.set_params(seed_topics={}, initial_conf=1.0)
dat.inference()

***
##### Summarize the results

1. The document-topic distributions are available in model.theta0 and model.theta1 (retrieved below with `get_theta0()` and `get_theta1()`)

In [4]:
# Document-topic tables: rows follow the documents of X_df, columns are
# labelled 'Topic 0', 'Topic 1', ... for each level.
upper_doc_topic = pd.DataFrame(
    dat.get_theta0(),
    index=X_df.index,
    columns=list(map('Topic {}'.format, range(S))),
)
lower_doc_topic = pd.DataFrame(
    dat.get_theta1(),
    index=X_df.index,
    columns=list(map('Topic {}'.format, range(K))),
)

# Render both tables, upper level first.
display(upper_doc_topic, lower_doc_topic)

Unnamed: 0,Topic 0,Topic 1
0 UK: Prince Charles spearheads British royal revolution. LONDON 1996-08-20,0.385984,0.614016
"1 GERMANY: Historic Dresden church rising from WW2 ashes. DRESDEN, Germany 1996-08-21",0.639667,0.360333
2 INDIA: Mother Teresa's condition said still unstable. CALCUTTA 1996-08-23,0.624453,0.375547
3 UK: Palace warns British weekly over Charles pictures. LONDON 1996-08-25,0.570036,0.429964
"4 INDIA: Mother Teresa, slightly stronger, blesses nuns. CALCUTTA 1996-08-25",0.427519,0.572481
...,...,...
390 CANADA: FEATURE - French-speaking Quebec celebrates Irish heritage. QUEBEC CITY 1997-08-14,0.601715,0.398285
391 BULGARIA: FEATURE - Bulgarian opera stars are enduring export. SOFIA 1997-08-15,0.500000,0.500000
"392 USA: Fans end Elvis Presley fete with concert. MEMPHIS, Tenn 1997-08-16",0.374224,0.625776
393 UK: Volcano buries studio where rock legends recorded. LONDON 1997-08-18,0.488690,0.511310


Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4
0 UK: Prince Charles spearheads British royal revolution. LONDON 1996-08-20,0.197374,0.298069,0.162348,0.149214,0.192995
"1 GERMANY: Historic Dresden church rising from WW2 ashes. DRESDEN, Germany 1996-08-21",0.161880,0.139886,0.264511,0.205865,0.227858
2 INDIA: Mother Teresa's condition said still unstable. CALCUTTA 1996-08-23,0.198316,0.202527,0.282558,0.168830,0.147770
3 UK: Palace warns British weekly over Charles pictures. LONDON 1996-08-25,0.251108,0.145035,0.226999,0.169143,0.207715
"4 INDIA: Mother Teresa, slightly stronger, blesses nuns. CALCUTTA 1996-08-25",0.152489,0.156200,0.156200,0.223013,0.312098
...,...,...,...,...,...
390 CANADA: FEATURE - French-speaking Quebec celebrates Irish heritage. QUEBEC CITY 1997-08-14,0.199136,0.311491,0.263956,0.151601,0.073817
391 BULGARIA: FEATURE - Bulgarian opera stars are enduring export. SOFIA 1997-08-15,0.204220,0.176090,0.225315,0.207735,0.186639
"392 USA: Fans end Elvis Presley fete with concert. MEMPHIS, Tenn 1997-08-16",0.264348,0.245042,0.199999,0.071307,0.219305
393 UK: Volcano buries studio where rock legends recorded. LONDON 1997-08-18,0.086184,0.185547,0.185547,0.348142,0.194581


2. The topic-word distributions are available in model.phi

In [5]:
# Topic-word table: one row per lower-level topic, one column per vocabulary
# term of X_df.
lower_topic_word = pd.DataFrame(
    dat.get_phi(),
    index=list(map('Topic {}'.format, range(K))),
    columns=X_df.columns,
)
display(lower_topic_word)

Unnamed: 0,church,pope,years,people,mother,last,told,first,world,year,...,novelist,smoke,sentimental,tale,lawrence,cristina,50-year-old,gruelling,rapidly,jailed
Topic 0,0.006362,0.005612,0.004573,0.002898,0.004111,0.003476,0.003187,0.002032,0.002552,0.00336,...,1.2e-05,1.2e-05,1.2e-05,1.2e-05,0.0003,1.2e-05,1.2e-05,1.2e-05,1.2e-05,6.9e-05
Topic 1,0.007341,0.003534,0.003932,0.00325,0.003307,0.004614,0.002625,0.002682,0.003136,0.003648,...,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05,6.8e-05,1.1e-05,0.000182,0.000239
Topic 2,0.006742,0.008021,0.004739,0.003015,0.004016,0.002681,0.002904,0.003905,0.003516,0.003682,...,1.1e-05,0.000178,0.000289,0.000234,1.1e-05,1.1e-05,1.1e-05,0.000122,0.000122,1.1e-05
Topic 3,0.00573,0.007356,0.00259,0.004272,0.00416,0.003319,0.003263,0.003543,0.00388,0.002086,...,6.7e-05,0.000123,1.1e-05,6.7e-05,1.1e-05,0.000236,0.000236,0.000179,1.1e-05,1.1e-05
Topic 4,0.009583,0.005709,0.005025,0.00588,0.003031,0.003829,0.004626,0.004398,0.002803,0.002803,...,0.000239,1.1e-05,1.1e-05,1.1e-05,1.1e-05,6.8e-05,1.1e-05,1.1e-05,1.1e-05,1.1e-05
