Demo: Reuters newswire topics (Keras dataset)

In [1]:
%matplotlib inline
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

Using TensorFlow backend.


In [2]:
import os
import sys

# Directory that contains the project-local `feature_eng` package. The default
# is the original author's checkout; set the FEATURE_ENG_LIB environment
# variable to point at your own copy instead of editing this cell.
FEATURE_ENG_LIB = os.environ.get('FEATURE_ENG_LIB',
                                 '/home/admin/github/wordroid.sblo.jp/lib')
# Guard the append so that re-running this cell does not keep adding the same
# entry to sys.path.
if FEATURE_ENG_LIB not in sys.path:
    sys.path.append(FEATURE_ENG_LIB)
from feature_eng import WordAndDoc2vec

In [3]:
import os.path
import sys
import re
import itertools
import csv
import datetime
import pickle
import random
from collections import defaultdict, Counter
import gc

import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import seaborn as sns
import pandas as pd
import numpy as np
import scipy
import gensim
from sklearn.metrics import f1_score, classification_report, confusion_matrix, log_loss
from sklearn.model_selection import train_test_split
import gensim
from keras.preprocessing.sequence import skipgrams
import tensorflow as tf

In [4]:
def hexbin(x, y, color, **kwargs):
    """Draw a hexagonal-bin plot of ``y`` against ``x``.

    The colormap is a seaborn light palette derived from ``color``; any extra
    keyword arguments are passed straight through to ``plt.hexbin``.
    """
    plt.hexbin(x, y, cmap=sns.light_palette(color, as_cmap=True), **kwargs)
def scatter(x, y, color, **kwargs):
    """Draw a scatter plot of ``y`` against ``x``.

    Parameters
    ----------
    x, y : array-like
        Point coordinates, handed to ``plt.scatter`` unchanged.
    color : matplotlib color or None
        Colour for the points. It is forwarded as ``color=`` unless the caller
        supplies per-point colours through ``c`` in ``kwargs``.
    **kwargs
        Extra keyword arguments forwarded to ``plt.scatter``. ``marker``
        defaults to ``'.'`` when the caller does not provide one.
    """
    # The small dot marker stays the default but can be overridden by callers.
    kwargs.setdefault('marker', '.')
    # matplotlib rejects `c` and `color` given together, so only pass `color`
    # when the caller has not supplied `c`.
    if 'c' not in kwargs:
        kwargs['color'] = color
    plt.scatter(x, y, **kwargs)

### load data

In [5]:
from keras.datasets import reuters

In [6]:
# Word -> rank mapping shipped with the Keras Reuters dataset.
word_index = reuters.get_word_index()
# Shift every id by 3 so that ids 0-2 stay free for the special tokens below
# (presumably matching the `index_from=3` default of `reuters.load_data`).
word_index2 = {word: idx + 3 for word, idx in word_index.items()}
word_dic = gensim.corpora.Dictionary([['<padding>', '<start_char>', '<oov_char>'],], prune_at=None)
# gensim assigns ids to new tokens in sorted token order, not in the order they
# are listed, so pin the special-token ids explicitly rather than relying on
# the constructor.
word_dic.token2id.update({'<padding>': 0, '<start_char>': 1, '<oov_char>': 2})
word_dic.token2id.update(word_index2)

In [7]:
# Spot-check: decode id 27595 back to its token.
word_dic[27595]

'mcgrath'

In [8]:
# Spot-check: decode id 4 back to its token.
word_dic[4]

'the'

In [9]:
# Load the articles (`doc`, sequences of word ids) and their topic ids (`cat`).
# test_split=0.0 presumably leaves the (doc_test, cat_test) pair empty so that
# every article lands in (doc, cat) — confirm against the Keras docs.
# start_char=None: presumably no start-of-sequence marker is prepended; the
# decoded doc[0] shown below begins directly with its first word.
(doc, cat), (doc_test, cat_test) = reuters.load_data(test_split=0.0, start_char=None)

In [10]:
# Decode the first article from word ids back to space-separated text.
' '.join(word_dic[word_id] for word_id in doc[0])

'mcgrath rentcorp said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3'

In [11]:
# Document dictionary: labels 'd1', 'd2', ... map to row indices 0, 1, ...
# (one entry per article in `doc`).
doc_dic = gensim.corpora.Dictionary(prune_at=None)
doc_dic.token2id.update({
    'd' + str(row + 1): row
    for row in range(len(doc))
})

In [12]:
# Convert every article from a sequence of word ids into a sequence of tokens.
doc_seq = [
    [word_dic[word_id] for word_id in article]
    for article in doc
]

In [13]:
# String class labels: each numeric topic id in `cat` prefixed with 'c'
# (e.g. 3 -> 'c3'), in the same order as `cat`.
cls = pd.Series(['c' + str(c) for c in cat])

### create WordAndDoc2vec instance

In [14]:
# Build the WordAndDoc2vec instance from the tokenised articles plus the word
# and document dictionaries. The trailing bare `wd2v` displays its repr.
wd2v = WordAndDoc2vec(doc_seq, word_dic, doc_dic, logging=False)
wd2v

len(doc_seq) >>> 11228
max(doc_dic.keys()) + 1 >>> 11228
num_features >>> 30983
corpus_csr.shape >>> (11228, 30983)
creating tfidf...
<feature_eng.neg_smpl.MySparseMatrixSimilarity object at 0x7fd91a3fa9b0>
<feature_eng.neg_smpl.Dic4seq object at 0x7fd900a1c4a8>


<feature_eng.neg_smpl.WordAndDoc2vec at 0x7fd901972d30>

### create model

In [15]:
# Length of the learned feature vectors; the similarity index inspected later
# in this notebook reports the same 96 features.
num_features = 96
# NOTE(review): max_num_prod, num_neg, embeddings_val and gamma are defined by
# feature_eng's make_model; their meaning is not visible from this notebook —
# confirm against that module before tuning.
wd2v.make_model(max_num_prod=5, num_neg=3, num_features=num_features,
                embeddings_val=0.5, gamma=0.0)

{'model': <keras.engine.training.Model at 0x7fd90062ce80>,
 'model_neg': <keras.engine.training.Model at 0x7fd900a1c978>,
 'model_prob0': <keras.engine.training.Model at 0x7fd900e22dd8>,
 'model_prob2': <keras.engine.training.Model at 0x7fd900eb85c0>,
 'model_prod': <keras.engine.training.Model at 0x7fd9001fa748>,
 'model_user': <keras.engine.training.Model at 0x7fd9001fa1d0>}

### start train
Note: training will take a long time...

In [16]:
# Train for 250 epochs; verbose=0 presumably silences per-epoch progress output.
# NOTE(review): no random seed is set in the visible cells, so the learned
# vectors (and the similarity scores shown below) may differ between runs.
wd2v.train(epochs=250, verbose=0)

len(seq) >>> 351


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


## similarity

In [17]:
# Similarity object exposed by the WordAndDoc2vec instance.
sim = wd2v.sim
print(sim.num_features)  # number of features per vector
print(sim.sim_row)       # similarity index over the rows (documents)
sim

96
MatrixSimilarity<11228 docs, 96 features>


<feature_eng.neg_smpl.WordAndDocSimilarity at 0x7fd901972240>

### get feature vector of document 'd1'

In [18]:
'''d1'''
# Look up the row index of document 'd1' and decode that article back to text.
' '.join(word_dic[word_id] for word_id in doc[sim.row_dic.token2id['d1']])

'mcgrath rentcorp said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3'

In [19]:
# Feature vector of 'd1': the entry of the row-similarity index at d1's row id.
# It is used as the query for the similarity lookups that follow.
query = sim.sim_row.index[sim.row_dic.token2id['d1']]
query

array([ 0.02623753,  0.06126883,  0.02948295, -0.07886726, -0.15339448,
        0.01453855,  0.14083195, -0.14484602,  0.04234595,  0.08980165,
       -0.07618539, -0.15400571,  0.08003283, -0.05088439,  0.14224713,
        0.02182725, -0.02481675,  0.01386344,  0.11355495, -0.07590767,
       -0.06137546,  0.03473181, -0.09672217,  0.23404151, -0.05790887,
        0.05824998,  0.1189626 , -0.02155845,  0.08150762,  0.02354086,
       -0.01287421, -0.06634675, -0.01771472,  0.04614136,  0.04264734,
       -0.05899847, -0.14474605, -0.20472328,  0.11174335, -0.04687106,
       -0.09690262, -0.00388353, -0.13795663, -0.04058416,  0.17039439,
        0.0058929 , -0.0484461 ,  0.05597944, -0.11512263,  0.12882479,
        0.11971793,  0.02144642,  0.17163752,  0.1011178 , -0.07126617,
       -0.14121448,  0.21938412, -0.02309684, -0.01467476, -0.06499987,
        0.08942185, -0.0448062 , -0.14776629,  0.05837059, -0.08372281,
        0.03908674, -0.122391  , -0.02419479, -0.04998899, -0.13

get words that are highly correlated with this document ('d1')

In [20]:
'''get words that are highly correlated with this document ('d1')'''
# Ten best-matching words for the 'd1' feature vector, as (word, score) pairs.
sim.get_sim_bycol(query, num_best=10)

[('rentcorp', 0.862424373626709),
 ('mcgrath', 0.8584254384040833),
 ('earnings', 0.788344144821167),
 ('operating', 0.7494820356369019),
 ('operations', 0.746200442314148),
 ('results', 0.7455936074256897),
 ('assets', 0.742405891418457),
 ('dlrs', 0.739633321762085),
 ('name', 0.7382524013519287),
 ('income', 0.7381472587585449)]

get documents that are highly correlated with this document ('d1')

In [21]:
'''get documents that are highly correlated with this document ('d1')'''
# Ten best-matching documents for the 'd1' feature vector, as (label, score)
# pairs; 'd1' itself comes first with score 1.0.
sim.get_sim_byrow(query, num_best=10)

[('d1', 1.0),
 ('d7985', 0.7893538475036621),
 ('d9302', 0.7659999132156372),
 ('d7576', 0.7637244462966919),
 ('d4589', 0.7578975558280945),
 ('d2588', 0.7577046155929565),
 ('d8368', 0.757683515548706),
 ('d3341', 0.7568655014038086),
 ('d9165', 0.754281759262085),
 ('d7600', 0.753485381603241)]

In [22]:
# Decode document 'd9302' (one of the closest matches to 'd1') back to text.
' '.join(word_dic[word_id] for word_id in doc[sim.row_dic.token2id['d9302']])

'cardinal industries inc said it restated earnings for the first nine months of 1986 to 235 000 dlrs or nine cts per share from 485 000 dlrs or 18 cts reported previously due to the failure of an automated accounting system installed in mid year and replaced in the fourth quarter the company said its reliance segment sustained a significant operating loss for the full year 1986 due to the accounting problems and increased promotional and advertising expenses for the full year it said it earned 284 000 dlrs or 10 cts a share up from 271 000 dlrs or 10 cts in 1985 cardinal industries said revenues for the year were 30 7 mln dlrs up from 23 0 mln dlrs in 1985 the company said for the first quarter of 1987 earnings were about 48 000 dlrs up from 13 000 dlrs or nil per share in the year ago period the year ago first quarter earnings however have been restated from 101 000 dlrs or four cts per share it said sales for the first quarter were about 8 363 000 dlrs up from 6 636 000 dlrs for the 