In [1]:
from __future__ import unicode_literals
from textacy import fileio, preprocess, extract, Corpus

In [2]:
summary_docs = fileio.read_file_lines('/home/SMRT-labeled/summary-docs')
summary_meta = fileio.read_json_lines('/home/SMRT-labeled/summary-meta')

In [3]:
text = [t for t in summary_docs]
meta = [m for m in summary_meta][0]

corpus_train = Corpus('en', texts = text, metadatas = meta)

In [4]:
for k in meta[0]:
    print k, meta[0][k]

Category smrt corporate
Topic corporate & financials
Tonality neutral


In [5]:
topiclist = [t['Topic'] for t in meta]

In [6]:
from collections import Counter
Counter(topiclist)

Counter({u'commuter behaviour': 16,
         u'corporate & financials': 102,
         u'corporate social responsibility': 12,
         u'crisis / delays & distributions / accidents / safety': 323,
         u'csr': 40,
         u'customer satisfaction': 5,
         u'customer service': 16,
         u'delay/disruption': 85,
         u'facilities': 24,
         u'facilities & services': 264,
         u'fares': 13,
         u'financials': 42,
         u'fines': 3,
         u'general / others': 153,
         u'labour': 31,
         u'labour & union': 101,
         u'rail & engineering': 264,
         u'regulations': 23,
         u'regulations & ops': 25,
         u'repair/maintenance': 25,
         u'safety/accident': 45,
         u'service announcements': 40,
         u'service excellence': 37,
         u'unknown': 77})

In [7]:
#
filter_a = [(i == u'crisis / delays & distributions / accidents / safety') for i in topiclist]
filter_b = [(i == u'delay/disruption') for i in topiclist]
filter_c = [(i == u'safety/accident') for i in topiclist]

filter_combine = map(lambda x:(x[0] or x[1] or x[2]), zip(filter_a, filter_b, filter_c))
#filter_combine.count(True)

text_train_filter = [x[0] for x in zip(text, filter_combine) if x[1]]
meta_train_filter = [x[0] for x in zip(topiclist, filter_combine) if x[1]]

In [8]:
from textacy import vsm, tm

corpus_train_filter = Corpus('en', texts = text_train_filter, metadatas = meta_train_filter)

In [9]:
#topic model
terms_lists = (doc.to_terms_list(ngrams={1, 2}, named_entities=True, as_strings=True) for doc in corpus_train_filter)
doc_term_matrix, id2term = vsm.doc_term_matrix(terms_lists, weighting='tfidf', normalize=True, smooth_idf=True, min_df=1, max_df=0.95, max_n_terms=15000)

In [10]:
unsp_topic_model = tm.TopicModel('lda', n_topics=20)
unsp_topic_model.fit(doc_term_matrix)
unsp_topic_model

TopicModel(n_topics=20, model=LatentDirichletAllocation)

In [11]:
doc_topic_matrix = unsp_topic_model.transform(doc_term_matrix)
for topic_idx, top_terms in unsp_topic_model.top_topic_terms(id2term, topics=range(20)):
    print('topic', topic_idx, ':', '   '.join(top_terms))

(u'topic', 0, u':', u'suspend pending   arrest driver   pending inquiry   driver suspend   bus accident   woman die   luxury   luxury taxi   mount road   affected vehicle')
(u'topic', 1, u':', u'door   train   new   smrt   besar   sensor   breakdown   maintenance work   panjang lrt   deepavali')
(u'topic', 2, u':', u"saf   wanbao   saf vehicle   vehicle   collide   involve   today   's death   vehicle collide   bus")
(u'topic', 3, u':', u'singaporeans   10:55pm   today   ho   independence   social lab   elaine ho   lab   research analyst   touchpoint')
(u'topic', 4, u':', u'14   bus   oil   oil spill   858   spill   854   lorry   door   wanbao')
(u'topic', 5, u':', u'construction site   construction   burn injury   tanjong pagar   tanjong   pagar   site   tanjong pagar centre   pagar centre   left hand')
(u'topic', 6, u':', u'750-volt   tecs   site   ngan   9.30am   3-bu   money   build site   pagar build   tg pagar')
(u'topic', 7, u':', u'road rage   rage case   rage   jail   st   mr 

In [12]:
doc2topic = unsp_topic_model.get_doc_topic_matrix(doc_term_matrix, normalize=True)
doc2topic_index = [i.argmax() for i in doc2topic]

In [13]:
Counter(zip(doc2topic_index, meta_train_filter))

Counter({(0, u'crisis / delays & distributions / accidents / safety'): 2,
         (2, u'crisis / delays & distributions / accidents / safety'): 8,
         (2, u'delay/disruption'): 1,
         (3, u'delay/disruption'): 2,
         (4, u'crisis / delays & distributions / accidents / safety'): 14,
         (4, u'safety/accident'): 2,
         (5, u'crisis / delays & distributions / accidents / safety'): 5,
         (5, u'safety/accident'): 1,
         (6, u'crisis / delays & distributions / accidents / safety'): 5,
         (6, u'safety/accident'): 1,
         (7, u'crisis / delays & distributions / accidents / safety'): 6,
         (8, u'crisis / delays & distributions / accidents / safety'): 1,
         (9, u'crisis / delays & distributions / accidents / safety'): 3,
         (9, u'safety/accident'): 1,
         (10, u'crisis / delays & distributions / accidents / safety'): 8,
         (10, u'safety/accident'): 2,
         (12, u'crisis / delays & distributions / accidents / safety')

In [14]:
#review raw text for each 'unsp-topic'
index_unsp_topic_i = [x[0] for x in zip(range(len(doc2topic_index)), doc2topic_index) if x[1] == 4]
[text_train_filter[x] for x in index_unsp_topic_i]

[u'A massive operation was mounted to clean up and repave part of Bukit Timah Expressway after an oil spill due to an accident affected many bus services from the SMRT Woodlands bus interchange yesterday morning. SMRT said the bus services affected are: 178, 187, 858, 911, 913, 925, 950, 960, 961, 963 and 966. The SCDF said initial on-site investigations did not identify any oil spillage. But three hours later at 6.07am, it received another call, this time about a resulting oil spill from the accident, and dispatched two fire engines.\n',
 u'A 29-year-old woman died after being struck by an SBS Transit bus in Toh Tuck Avenue on Thursday afternoon. Ms Winny Pratiwi was believed to be out for lunch when the accident happened at around 12.30pm. A 64-year-old bus driver was arrested for allegedly causing death by a rash act, said the police. The recent accidents involving buses follow one that took place about two months ago, when an SMRT bus hit the back of another SMRT bus at a Lentor Av