In [1]:
from __future__ import unicode_literals
from textacy import fileio, preprocess, extract, Corpus

In [2]:
# Open the labelled SMRT summaries and their JSON-lines metadata.
# NOTE(review): the next cell wraps both readers in list comprehensions, so they are
# presumably lazy iterators — confirm against the textacy fileio documentation.
summary_docs = fileio.read_file_lines('/home/SMRT-labeled/summary-docs')
summary_meta = fileio.read_json_lines('/home/SMRT-labeled/summary-meta')

In [3]:
# Materialise both readers. Only the first item yielded by the metadata reader is
# kept; it is indexed per document in the cells below.
text = list(summary_docs)
meta = list(summary_meta)[0]

corpus_train = Corpus('en', texts=text, metadatas=meta)

In [4]:
for k in meta[0]:
    print k, meta[0][k]

Category smrt corporate
Topic corporate & financials
Tonality neutral


In [5]:
# Topic label of every metadata record, in document order.
topiclist = [record['Topic'] for record in meta]

In [6]:
from collections import Counter
# Tally how many documents carry each topic label.
Counter(topiclist)

Counter({u'commuter behaviour': 16,
         u'corporate & financials': 102,
         u'corporate social responsibility': 12,
         u'crisis / delays & distributions / accidents / safety': 323,
         u'csr': 40,
         u'customer satisfaction': 5,
         u'customer service': 16,
         u'delay/disruption': 85,
         u'facilities': 24,
         u'facilities & services': 264,
         u'fares': 13,
         u'financials': 42,
         u'fines': 3,
         u'general / others': 153,
         u'labour': 31,
         u'labour & union': 101,
         u'rail & engineering': 264,
         u'regulations': 23,
         u'regulations & ops': 25,
         u'repair/maintenance': 25,
         u'safety/accident': 45,
         u'service announcements': 40,
         u'service excellence': 37,
         u'unknown': 77})

In [7]:
# Boolean masks over topiclist, one entry per document.
# filter_a marks the combined 'crisis / ...' topic; filter_b and filter_c mark the
# two narrower topics whose documents are used for training.
filter_a = [topic == u'crisis / delays & distributions / accidents / safety' for topic in topiclist]
filter_b = [topic == u'delay/disruption' for topic in topiclist]
filter_c = [topic == u'safety/accident' for topic in topiclist]

# A document is kept for training when it carries either of the two narrower topics.
filter_combine = [is_delay or is_safety for is_delay, is_safety in zip(filter_b, filter_c)]

text_train_filter = [doc_text for doc_text, keep in zip(text, filter_combine) if keep]
meta_train_filter = [topic for topic, keep in zip(topiclist, filter_combine) if keep]

In [8]:
from textacy import vsm

# Corpus restricted to the training documents selected by filter_combine.
# NOTE(review): metadatas here is a list of plain topic strings (meta_train_filter),
# not the per-document dicts passed for corpus_train — confirm Corpus accepts that.
corpus_train_filter = Corpus('en', texts = text_train_filter, metadatas = meta_train_filter)

In [9]:
# Training features: unigrams, bigrams and named entities extracted per document,
# then assembled into a document-term matrix together with its id -> term map.
terms_lists = (
    doc.to_terms_list(ngrams={1, 2}, named_entities=True, as_strings=True)
    for doc in corpus_train_filter
)
doc_term_matrix, id2term = vsm.doc_term_matrix(
    terms_lists,
    weighting='tf',
    normalize=True,
    smooth_idf=True,
    min_df=1,
    max_df=0.75,
    max_n_terms=15000)



In [10]:
# Inspect the matrix repr (shape, dtype, number of stored elements).
doc_term_matrix

<130x6158 sparse matrix of type '<type 'numpy.float64'>'
	with 14318 stored elements in Compressed Sparse Row format>

In [11]:
# Load label data.
# NOTE(review): label_key is not read anywhere in this cell; the labels used below
# are the topic strings collected in meta_train_filter.
label_key = u'Category'

label_list = meta_train_filter
# Sort the distinct labels so each one receives the same integer id on every run.
# Iterating a bare set leaves the label -> id assignment dependent on hash order.
unique_label = sorted(set(label_list))
label2id = dict(zip(unique_label, range(len(unique_label))))
# Integer id of every training document's label, in document order.
label_id = [label2id[label] for label in label_list]

In [24]:
# Show which integer id each label received.
label2id

{u'delay/disruption': 0, u'safety/accident': 1}

In [13]:
#MLPClassifier requires sklearn >=0.18
#classifier
#from sklearn.neural_network import MLPClassifier
#regression
from sklearn.neural_network import MLPRegressor
#use sknn instead
#from sknn.mlp import Classifier, Layer
import numpy as np

def id2array(id, a_len):
    """Return a float vector of length ``a_len`` that is 1 at position ``id`` and 0 elsewhere."""
    one_hot = np.zeros(a_len, dtype=np.float64)
    one_hot[id] = 1.0
    return one_hot

# One target column per distinct label.
labeltype = len(label2id)

# Dense feature matrix and one-hot target rows, both in training-document order.
msg_x = doc_term_matrix.toarray()
msg_y = np.array([id2array(label_index, labeltype) for label_index in label_id])

clf = MLPRegressor(
    activation='logistic',
    solver='adam',
    alpha=1e-5,
    batch_size=10,
    hidden_layer_sizes=(50, ),
    random_state=1)
clf.fit(msg_x, msg_y)

MLPRegressor(activation=u'logistic', alpha=1e-05, batch_size=10, beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver=u'adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [14]:
# Scored on the same arrays the model was fit on, so this is a training-set
# figure rather than a held-out estimate.
clf.score(msg_x, msg_y)

0.99765301098971648

In [15]:
# Re-labelling unlabelled data: the documents flagged by filter_a form the prediction set.
text_test_filter = [doc_text for doc_text, flagged in zip(text, filter_a) if flagged]
corpus_test = Corpus('en', texts=text_test_filter)

# Same term extraction and matrix settings as for the training corpus. The returned
# i2t_test is a separate id -> term map from id2term.
terms_lists_test = (
    doc.to_terms_list(ngrams={1, 2}, named_entities=True, as_strings=True)
    for doc in corpus_test
)
dtm_test, i2t_test = vsm.doc_term_matrix(
    terms_lists_test,
    weighting='tf',
    normalize=True,
    smooth_idf=True,
    min_df=1,
    max_df=0.75,
    max_n_terms=15000)

In [16]:
# Build new index: map every test-matrix column to the training column that holds
# the same term, so test documents can be expressed in the training feature space.

# Invert id2term once. setdefault keeps the first id seen for a term, matching what
# a first-match scan over id2term.values() would select.
term2train = {}
for train_i, term in id2term.items():
    term2train.setdefault(term, train_i)

# Test terms absent from the training vocabulary get no entry and are dropped.
test2train = dict()
for (i, j) in i2t_test.items():
    if j in term2train:
        test2train[i] = term2train[j]

# Training columns whose term never occurs in the test corpus stay at zero.
test_feature = np.zeros((dtm_test.shape[0], len(id2term)))
dtm_test_array = dtm_test.toarray()

for (i, j) in test2train.items():
    test_feature[:, j] = dtm_test_array[:, i]

In [19]:
# Run the fitted network over the remapped test features.
auto_label = clf.predict(test_feature)

In [23]:
import csv

# Dump each re-label candidate next to its raw network output, every field quoted.
with open('re-label-output', 'wb') as out_file:
    csv.writer(out_file, quoting=csv.QUOTE_ALL).writerows(zip(text_test_filter, auto_label))

In [28]:
# Amend original meta.
# Show the label -> id mapping that assignNewLabel consults when decoding predictions.
print(label2id)

def assignNewLabel(v, threshold=0.1):
    """Turn one two-score prediction row into a topic label.

    Args:
        v: Indexable pair of scores. v[0] belongs to the label whose id is 0 in
            the module-level ``label2id`` mapping, v[1] to the label whose id is 1.
        threshold: Largest absolute gap between the two scores that is still
            treated as undecided. Defaults to 0.1, the value that used to be
            hard-coded here.

    Returns:
        u'crisis' when the two scores are within ``threshold`` of each other,
        otherwise the ``label2id`` key mapped to the higher-scoring id.

    Raises:
        ValueError: If ``label2id`` has no key for the winning id.
    """
    gap = v[0] - v[1]
    if abs(gap) <= threshold:
        return u'crisis'
    winner = 0 if gap > 0.0 else 1
    # Scan the mapping directly: indexing into keys()/values() only works while
    # both are lists and walks the dict twice for a single lookup.
    for label, label_index in label2id.items():
        if label_index == winner:
            return label
    raise ValueError('no label with id %d in label2id' % winner)

# Decode every prediction row into its topic label.
new_label = [assignNewLabel(scores) for scores in auto_label]

{u'delay/disruption': 0, u'safety/accident': 1}


In [34]:
# Positions in meta of the documents selected by filter_a, in document order.
new_label_index = [position for position, selected in enumerate(filter_a) if selected]
# Overwrite each selected record's topic with its decoded label (mutates meta in place).
for label, position in zip(new_label, new_label_index):
    meta[position]['Topic'] = label

In [35]:
# fileio is already imported in the first cell; the re-import is harmless.
from textacy import fileio

# Persist the amended metadata under a new file name, so the summary-meta file
# read earlier is not overwritten.
fileio.write_json(meta, '/home/SMRT-labeled/summary-meta-relabel')