Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

correct a few bugs in lda.py and interest_generator.py

  • Loading branch information...
commit 0ee105f21ce44cdbfc93dd2532505a1d3b70cfb2 1 parent aa5bb6e
@chengdujin authored
View
11 extractor/media.py
@@ -220,16 +220,17 @@ class Segment:
'class to model a segment, including chinese, japanese and english'
def __init__(self, word, entry):
self.word = word
+ self.terms = []
# use mongodb id_ to relate segmented words to an item
- if 'id_' in entry.keys():
+ if 'id_' in entry:
self.id_ = entry['id_']
- if 'published' in entry.keys():
+ if 'published' in entry:
self.published = entry['published']
- if 'retweeted' in entry.keys():
+ if 'retweeted' in entry:
self.retweeted = entry['retweeted']
- if 'favorited' in entry.keys():
+ if 'favorited' in entry:
self.favorited = entry['favorited']
- if 'users' in entry.keys():
+ if 'users' in entry:
self.no_users = len(entry['users'])
def __str__(self):
View
24 generator/interest_generator.py
@@ -8,7 +8,7 @@
# @author Yuan JIN
# @contact chengdujin@gmail.com
# @since 2012.03.10
-# @latest 2012.03.19
+# @latest 2012.03.21
#
# reload the script encoding
@@ -17,7 +17,7 @@
sys.setdefaultencoding('UTF-8')
# CONSTANTS
-SOURCE = 'twitter/chengdujin.chinese'
+SOURCE = 'twitter/perryhau'
DB = '176.34.54.120:27017'
@@ -39,22 +39,24 @@ def read(source):
cursor = collection.find()
if cursor.count() > 0:
- docs = []
+ docs = []
for entry in cursor:
# wrap the segmented word in a Segment class
words = []
- if 'chinese' in entry.keys():
+ if 'chinese' in entry:
for word in entry['chinese']:
- words.append(Segment(word, entry))
- if 'latin' in entry.keys():
+ if word:
+ words.append(Segment(word, entry))
+ if 'latin' in entry:
for word in entry['latin']:
- words.extend(Segment(word, entry))
+ if word:
+ words.append(Segment(word, entry))
if words: # array of self-aware segmented word
docs.append(words)
- return docs
+ return docs
else:
- return Exception("[error] read: nothing is found!")
+ return Exception("[error] read: nothing is found!")
def generate(source):
'''
@@ -69,7 +71,7 @@ def generate(source):
topic_extractor = lda.LDA(seg_list)
#screen()
topic_extractor.learn()
- topic_extractor.publish()
-
+ topic_extractor.publish_topics()
+
if __name__ == '__main__':
generate(SOURCE)
View
2  generator/k_means.py
@@ -8,7 +8,7 @@
# @author Yuan JIN
# @contact chengdujin@gmail.com
# @since 2012.03.15
-# @latest 2012.03.16
+# @latest 2012.03.21
#
# reload the script encoding
View
38 generator/lda.py
@@ -9,19 +9,20 @@
# @author Yuan JIN
# @contact chengdujin@gmail.com
# @since 2012.03.18
-# @latest 2012.03.19
+# @latest 2012.03.21
#
+import random
# reload the script encoding
import sys
reload(sys)
sys.setdefaultencoding('UTF-8')
-TOPIC_NUMBER = 20
+TOPIC_NUMBER = 10
ALPHA = 2
BETA = .5
-ITERATION = 1000
-burnin = 500
+ITERATION = 500
+burnin = 200
class LDA(object):
'class to model lda computation'
@@ -64,7 +65,7 @@ def initialize(self):
dc = {}
# a word is an instance of Segment
for word in doc:
- rand_topic = random.randint(1, TOPIC_NUMBER)
+ rand_topic = random.randint(0, TOPIC_NUMBER-1)
word_vocab_id = self.vocab_index[word]
# build doc-vocab
dc[word_vocab_id] = rand_topic
@@ -103,6 +104,7 @@ def initialize(self):
def learn(self):
'lda implementation'
for it in xrange(0, ITERATION):
+ print 'processing %i of %i' % (it, ITERATION)
for doc_id in self.doc_vocab:
for word_vocab_id in self.doc_vocab[doc_id]:
# sample full conditional
@@ -138,7 +140,7 @@ def learn(self):
else:
self.doc_topic[doc_id][topic] += 1
- if topic not in topics:
+ if topic not in self.topics:
self.topics[topic] = 1
else:
self.topics[topic] += 1
@@ -148,9 +150,14 @@ def learn(self):
def publish_topics(self):
'generate a list of all topics for a corpus'
- pass
-
- def publish_topics_for_doc(self):
+ topic_word = self.generate_topic_word()
+ topics_linked = []
+ for topic in self.topics:
+ #print ','.join([term.word for term in topic_word[topic]])
+ topics_linked.append(topic_word[topic])
+ return topics_linked
+
+ def generate_topic_word(self):
'generate estimated topics for a document'
from collections import OrderedDict
@@ -160,7 +167,7 @@ def publish_topics_for_doc(self):
vt = self.vocab_topic[word_vocab_id]
sorted_vt = OrderedDict(sorted(vt.items(), key=lambda x: -x[1]))
for svt, (k, v) in enumerate(sorted_vt.items()):
- if svt > 3:
+ if svt > 10:
break
else:
if k not in topic_vocab:
@@ -179,16 +186,17 @@ def publish_topics_for_doc(self):
# combine topic-vocab with index_vocab to create a topic_id --> vocabulary list map
topic_word = {}
for topic_id in topic_vocab:
- topic_word[topic_id] = ''
+ topic_word[topic_id] = []
tv = topic_vocab[topic_id]
sorted_tv = OrderedDict(sorted(tv.items(), key=lambda x: -x[1]))
for stv, (k, v) in enumerate(sorted_tv.items()):
- if stv > 3:
+ if stv > 10:
break
else:
- topic_word[topic_id] += index_vocab[k] + ','
+ topic_word[topic_id].append(self.index_vocab[k])
+ return topic_word
- # publish
+ '''# publish
for doc_id, doc in enumerate(self.docs):
if doc_id in self.doc_topic and len(doc) > 2:
topic_list = self.doc_topic[doc_id]
@@ -201,7 +209,7 @@ def publish_topics_for_doc(self):
wanted_strings.append(topic_word[k])
print str(doc_id + 1) + '.' + ','.join(doc)
print '>> ' + ' '.join(wanted_strings)
- print
+ print'''
if __name__ == '__main__':
pass
View
BIN  generator/lda.pyc
Binary file not shown

0 comments on commit 0ee105f

Please sign in to comment.
Something went wrong with that request. Please try again.