In [1]:
from __future__ import unicode_literals
from textacy import fileio, preprocess, extract, Corpus

In [2]:
#locations of the labelled corpus: the document text file and its JSON-lines metadata
docs_path = '/home/SMRT-labeled/summary-docs'
meta_path = '/home/SMRT-labeled/summary-meta'

summary_docs = fileio.read_file_lines(docs_path)
summary_meta = fileio.read_json_lines(meta_path)

In [3]:
#materialize both readers so they can be indexed and measured
text = list(summary_docs)
#only the first JSON record is kept; its length is compared with the number of documents below
meta = list(summary_meta)[0]

print(len(text), len(meta))

(1766, 1766)


In [4]:
#peek at the metadata fields of the first document
#print(...) with a single pre-formatted string emits the same text under the
#Python 2 print statement and the Python 3 print function, and matches the
#print(...) call form used by the other cells
for field, value in meta[0].items():
    print('%s %s' % (field, value))

Category smrt corporate
Topic corporate & financials
Tonality neutral


In [5]:
#load label data: choose which metadata field serves as the class label
label_key = u'Category'

#one label per metadata entry, in the order of meta
label_list = [record[label_key] for record in meta]

In [6]:
import collections

#class distribution of the raw labels (label -> number of occurrences)
collections.Counter(label_list)

Counter({u'competitors': 1,
         u'forum': 66,
         u'overseas news': 1,
         u'public transport industry': 2,
         u'smrt buses': 368,
         u'smrt corporate': 533,
         u'smrt taxis': 77,
         u'smrt trains': 673,
         u'taxi': 32,
         u'unknown': 13})

In [7]:
#re-label: step 1 -- drop documents labelled 'forum' or 'unknown'
#(documents whose text includes 'Same as' are dropped further down in this cell)
dropped_labels = (u'forum', u'unknown')
#text must be filtered first: it still needs the unfiltered label_list to pair against
text = [doc for doc, label in zip(text, label_list) if label not in dropped_labels]
label_list = [label for label in label_list if label not in dropped_labels]

def textStartWith(wds):
    """Return True when wds contains the substring "Same as", else False.

    Despite the name, the match is not anchored to the start of the text:
    the marker may appear at any position.
    """
    return "Same as" in wds

#True for every document whose text contains 'Same as'
maskKeyword = [textStartWith(doc) for doc in text]
#keep only the unflagged documents, and the labels at the same positions
text = [doc for flagged, doc in zip(maskKeyword, text) if not flagged]
label_list = [label for flagged, label in zip(maskKeyword, label_list) if not flagged]

print(len(text), len(label_list))

(1428, 1428)


In [8]:
#re-label: step 2 -- split 'smrt x' labels into a generic label and 'smrt x',
#depending on whether the document text itself mentions smrt/SMRT
def filterKeywords(wds):
    """Return True when wds contains "smrt" or "SMRT", else False.

    Only these two exact casings are matched; a mixed-case spelling such as
    "Smrt" is not.
    """
    return any(keyword in wds for keyword in ("smrt", "SMRT"))

#flag every document whose text mentions smrt/SMRT
maskKeyword = [filterKeywords(doc) for doc in text]

#documents that never mention SMRT get the generic counterpart of their label;
#labels without an entry here (e.g. 'smrt taxis') are left untouched
generic_label = {
    u'smrt trains': u'trains',
    u'smrt buses': u'buses',
    u'smrt corporate': u'public transport industry',
}
label_list = [
    label if mentions_smrt else generic_label.get(label, label)
    for mentions_smrt, label in zip(maskKeyword, label_list)
]

collections.Counter(label_list)

Counter({u'buses': 43,
         u'competitors': 1,
         u'overseas news': 1,
         u'public transport industry': 35,
         u'smrt buses': 265,
         u'smrt corporate': 427,
         u'smrt taxis': 62,
         u'smrt trains': 474,
         u'taxi': 32,
         u'trains': 88})

In [9]:
#assign every distinct label an integer id; sorting fixes the order so the
#label -> id mapping does not depend on the arbitrary iteration order of a set
#and is therefore the same on every run
unique_label = sorted(set(label_list))
label2id = {label: idx for idx, label in enumerate(unique_label)}
#integer class id of each document, aligned with label_list
label_id = [label2id[label] for label in label_list]

#build the textacy corpus ('en') from the filtered document texts
corpus_train = Corpus('en', texts=text)

In [10]:
#document-term matrix: the feature representation later converted into the model input
from textacy import vsm

#lazily yield each document's terms (2- and 3-grams plus named entities) as strings
terms_lists = (
    doc.to_terms_list(ngrams={2, 3}, named_entities=True, as_strings=True)
    for doc in corpus_train
)
#NOTE(review): smooth_idf presumably has no effect with weighting='tf' -- confirm against the textacy docs
doc_term_matrix, id2term = vsm.doc_term_matrix(
    terms_lists,
    weighting='tf',
    normalize=True,
    smooth_idf=True,
    min_df=2,
    max_df=0.95,
    max_n_terms=15000,
)



In [11]:
#show the matrix summary (shape, dtype, number of stored elements) as a sanity check
doc_term_matrix

<1428x9857 sparse matrix of type '<type 'numpy.float64'>'
	with 42673 stored elements in Compressed Sparse Row format>

In [12]:
#MLPClassifier and MLPRegressor both require sklearn >=0.18
#classifier
#from sklearn.neural_network import MLPClassifier
#regression
from sklearn.neural_network import MLPRegressor
#use sknn instead
#from sknn.mlp import Classifier, Layer
import numpy as np

def id2array(id, a_len):
    """Build a one-hot vector of length a_len.

    Returns a float numpy array of zeros with a single 1 written at
    position id.
    """
    one_hot = np.zeros(a_len)
    one_hot[id] = 1
    return one_hot

#number of distinct classes == length of each one-hot target row
labeltype = len(label2id)

#dense feature matrix and the matching one-hot target matrix (one row per document)
msg_x = doc_term_matrix.toarray()
msg_y = np.array([id2array(class_id, labeltype) for class_id in label_id])

In [13]:
#regressor trained against the one-hot targets; hidden layers of 500 and 120 units,
#random_state fixed so repeated runs start from the same initialisation
clf = MLPRegressor(
    activation='logistic',
    solver='adam',
    alpha=1e-5,
    batch_size=20,
    hidden_layer_sizes=(500, 120),
    random_state=1,
)
clf.fit(msg_x, msg_y)

MLPRegressor(activation=u'logistic', alpha=1e-05, batch_size=20, beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(500, 120), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver=u'adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [14]:
#NOTE: msg_x/msg_y are the same arrays clf was fitted on, so this is a
#training-set score (R^2 for a scikit-learn regressor), not a held-out estimate
clf.score(msg_x, msg_y)

0.97842184167775381

In [15]:
#save model: persist the fitted regressor
from sklearn.externals import joblib

savefiles = joblib.dump(clf, '/home/pretrain-model/category-ann-reg.pkl')

#save meta: the label -> id and id -> term lookups stored next to the model
#NOTE(review): JSON object keys are strings, so if id2term is keyed by integer
#ids they will presumably be read back as strings -- confirm the loader converts them
meta_outputs = (
    (label2id, '/home/pretrain-model/category-label2id-ann-reg'),
    (id2term, '/home/pretrain-model/category-id2term-ann-reg'),
)
for payload, target_path in meta_outputs:
    fileio.write_json(payload, target_path)