## <font color='green'>20 Newsgroups - LDA</font>

### <font color='green'> 1. Description</font>

Topic analysis of documents using LDA.

It uses the 20 Newsgroups dataset, which can be loaded with scikit-learn's `fetch_20newsgroups` function. LDA is applied to this dataset for topic analysis.

The algorithm extracts topics from the documents. The program prints the top words belonging to each topic so that you can check whether the topic analysis is
working properly.

### <font color='green'> 2. Data Preprocessing</font>

In [1]:
# prepare data
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load the whole corpus (subset='all') with headers, footers and quoted replies
# removed, so only the message bodies reach the vectorizer.
dataset = fetch_20newsgroups(subset='all',
                             remove=('headers', 'footers', 'quotes'),
                             shuffle=True, random_state=42)

# Bag-of-words term counts: min_df=5 ignores terms that occur in fewer than
# 5 documents, stop_words='english' applies scikit-learn's built-in stop list.
vectorizer = CountVectorizer(min_df=5, stop_words='english')
# fit_transform() learns the vocabulary and builds the document-term matrix in
# one pass, instead of tokenizing every document twice with fit() + transform().
X = vectorizer.fit_transform(dataset.data)
# Later cells look up feature names through `vec`; fit() returned this same
# (now fitted) vectorizer object, so the alias is kept.
vec = vectorizer

### <font color='green'> 3. Implementation using Frovedis</font>

In [2]:
# train
import os, time
from frovedis.decomposition import LatentDirichletAllocation as frovLatentDirichletAllocation 
from frovedis.exrpc.server import FrovedisServer
# Launch the Frovedis server via mpirun. The command to run is read from the
# FROVEDIS_SERVER environment variable; a KeyError is raised if it is not set.
# NOTE(review): "-ve 2 -np 8" is hard-coded for the original machine -- confirm
# these values match the target environment.
FrovedisServer.initialize("mpirun -ve 2 -np 8 {}".format(os.environ['FROVEDIS_SERVER']))

n_components = 20
frov_lda = frovLatentDirichletAllocation(n_components=n_components, max_iter=100)
# perf_counter() is a monotonic, high-resolution clock, so the measured interval
# cannot be distorted by system clock adjustments the way time.time() can.
t1 = time.perf_counter()
frov_lda.fit(X)
t2 = time.perf_counter()
print("train time: {:.3f} sec".format(t2 - t1))

train time: 10.070 sec


In [3]:
# predict
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
num_words = 10
# Per-row term indices ordered by descending weight. Named `sorted_idx` rather
# than `sorted` so the builtin sorted() is not shadowed for the rest of the
# kernel session.
sorted_idx = np.argsort(frov_lda.components_, axis=1)[:, ::-1]
sorted_head = sorted_idx[:, :num_words]
for i in range(n_components):
    # str() keeps the printed list as plain Python strings regardless of the
    # container type returned for the feature names.
    to_print = [str(feature_names[j]) for j in sorted_head[i]]
    print("topic {0}: {1}".format(i, to_print))

# The Frovedis model is no longer used after this point, so stop the server.
FrovedisServer.shut_down()

topic 0: ['db', 'cx', 'w7', 'ah', 'bh', 'mv', 'c_', 'lk', 'uw', 'chz']
topic 1: ['00', '50', '10', 'new', '1st', '20', 'adl', 'appears', 'man', 'art']
topic 2: ['use', 'just', 'like', 'power', 'used', 'don', 'time', 'water', 'good', 'know']
topic 3: ['don', 'just', 'like', 'think', 'people', 'know', 'good', 'say', 'really', 've']
topic 4: ['file', 'image', 'files', 'jpeg', 'mail', 'ftp', 'use', 'format', 'gif', 'list']
topic 5: ['armenian', 'israel', 'jews', 'turkish', 'people', 'armenians', 'war', 'israeli', 'jewish', 'government']
topic 6: ['god', 'jesus', 'people', 'bible', 'does', 'christian', 'church', 'believe', 'christ', 'christians']
topic 7: ['key', 'encryption', 'chip', 'use', 'clipper', 'keys', 'government', 'security', 'privacy', 'public']
topic 8: ['said', 'people', 'went', 'know', 'didn', 'time', 'just', 'did', 'don', 'like']
topic 9: ['health', 'medical', 'disease', 'use', 'cancer', 'patients', 'drug', 'study', 'number', 'hiv']
topic 10: ['drive', 'like', 'new', 'thanks'

### <font color='green'> 4. Implementation using scikit-learn</font>

In [4]:
# train
import os, time
from sklearn.decomposition import LatentDirichletAllocation as skLatentDirichletAllocation

n_components = 20
# random_state pins the stochastic LDA initialisation so that re-running the
# notebook reproduces the same topics (42 matches the seed used for the dataset
# shuffle).
sk_lda = skLatentDirichletAllocation(n_components=n_components, max_iter=100,
                                     random_state=42)
# perf_counter() is a monotonic, high-resolution clock, so the measured interval
# cannot be distorted by system clock adjustments the way time.time() can.
t1 = time.perf_counter()
sk_lda.fit(X)
t2 = time.perf_counter()
print("train time: {:.3f} sec".format(t2 - t1))

train time: 533.943 sec


In [5]:
# predict
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
num_words = 10
# Per-row term indices ordered by descending weight. Named `sorted_idx` rather
# than `sorted` so the builtin sorted() is not shadowed for the rest of the
# kernel session.
sorted_idx = np.argsort(sk_lda.components_, axis=1)[:, ::-1]
sorted_head = sorted_idx[:, :num_words]
for i in range(n_components):
    # str() keeps the printed list as plain Python strings regardless of the
    # container type returned for the feature names.
    to_print = [str(feature_names[j]) for j in sorted_head[i]]
    print("topic {0}: {1}".format(i, to_print))

topic 0: ['medical', 'health', 'disease', 'cancer', 'patients', 'use', 'drug', '1993', 'hiv', 'treatment']
topic 1: ['edu', 'ftp', 'com', 'available', 'mail', 'server', 'graphics', 'pub', 'list', 'software']
topic 2: ['car', 'just', 'like', 'good', 'power', 'don', 'used', 'use', 'new', 'bike']
topic 3: ['db', 'cx', 'w7', 'ah', 'hz', 'bh', 'mv', 'c_', 'uw', 't7']
topic 4: ['edu', 'com', 'ms', 'ed', 'myers', 'cs', 'university', 've', 'david', 'banks']
topic 5: ['space', 'nasa', 'earth', 'research', 'data', 'launch', 'program', 'science', 'center', 'university']
topic 6: ['know', 'thanks', 'like', 'does', 'just', 'don', 'problem', 've', 'help', 'mail']
topic 7: ['gun', 'guns', 'states', 'file', 'state', 'control', 'crime', 'firearms', 'weapons', 'law']
topic 8: ['drive', 'dos', 'card', 'disk', 'scsi', 'windows', 'pc', 'mac', 'software', 'video']
topic 9: ['government', 'key', 'use', 'law', 'people', 'encryption', 'public', 'chip', 'security', 'clipper']
topic 10: ['armenian', 'homosexual'