Preparation sample program using the Reuters newswire topics dataset.

In [1]:
import os.path
import sys
import re
import itertools
import csv
import datetime
import pickle
import random
from collections import defaultdict, Counter

import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import seaborn as sns
import pandas as pd
import numpy as np
import scipy
import gensim
from sklearn.metrics import f1_score, classification_report, confusion_matrix, log_loss
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


### Load the Reuters newswire topics dataset

In [2]:
from keras.datasets import reuters

### create word_dic

In [3]:
# Fetch the word -> integer index mapping that ships with the Reuters dataset.
word_index = reuters.get_word_index()

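Word ids in the Reuters corpus start at 4: the ids from `reuters.get_word_index()` are shifted by 3 so that ids 0–2 can be used for the special tokens `<padding>`, `<start_char>` and `<oov_char>`, and id 3 stays unused.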

In [4]:
# Offset applied to every id from reuters.get_word_index() so that the lowest
# ids stay free for the special tokens below.
INDEX_FROM = 3
SPECIAL_TOKENS = ['<padding>', '<start_char>', '<oov_char>']

# word -> shifted id, matching the ids used inside the loaded documents.
word_index2 = {word: idx + INDEX_FROM for word, idx in word_index.items()}

word_dic = gensim.corpora.Dictionary([SPECIAL_TOKENS], prune_at=None)
# Pin the special-token ids explicitly (0, 1, 2 in list order) rather than
# relying on the order in which Dictionary numbers the tokens of a document,
# which is an implementation detail of gensim and may differ between versions.
word_dic.token2id.update({token: idx for idx, token in enumerate(SPECIAL_TOKENS)})
# Add the corpus vocabulary under its shifted ids.
word_dic.token2id.update(word_index2)

In [5]:
# Spot-check: look up the token stored under a large (shifted) word id.
word_dic[27595]

'mcgrath'

In [6]:
# Spot-check: id 4 is expected to be the lowest id that holds a corpus word
# after the +3 shift.
word_dic[4]

'the'

In [7]:
# Spot-check: id 0 belongs to one of the special tokens added when word_dic
# was created.
word_dic[0]

'<padding>'

In [8]:
# No token is stored under id 3 (the special tokens occupy three ids and the
# corpus words were shifted by 3), so this lookup is expected to fail.
# The KeyError is caught and printed so the demonstration stays visible while
# "Restart Kernel -> Run All" can continue past this cell.
try:
    word_dic[3]
except KeyError as err:
    print('KeyError:', err)

KeyError: 3

### load corpus

In [9]:
# test_split=0.0 requests no held-out portion, so doc/cat are expected to hold
# every document (doc_test/cat_test presumably come back empty - TODO confirm).
# start_char=None asks that no start-of-sequence marker be added to the documents.
(doc, cat), (doc_test, cat_test) = reuters.load_data(test_split=0.0, start_char=None)

In [10]:
# Sanity check: turn the ids of the first document back into readable text.
' '.join(word_dic[token_id] for token_id in doc[0])

'mcgrath rentcorp said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3'

### create doc_seq
`doc_seq` is simply a sequence whose i-th element is the list of words in the i-th document.

In [12]:
# Decode every document: replace each word id with its token from word_dic,
# keeping the documents in their original order.
doc_seq = [
    [word_dic[token_id] for token_id in token_ids]
    for token_ids in doc
]

In [13]:
# Number of decoded documents.
len(doc_seq)

11228

In [14]:
# Inspect the word list of the first document.
doc_seq[0]

['mcgrath',
 'rentcorp',
 'said',
 'as',
 'a',
 'result',
 'of',
 'its',
 'december',
 'acquisition',
 'of',
 'space',
 'co',
 'it',
 'expects',
 'earnings',
 'per',
 'share',
 'in',
 '1987',
 'of',
 '1',
 '15',
 'to',
 '1',
 '30',
 'dlrs',
 'per',
 'share',
 'up',
 'from',
 '70',
 'cts',
 'in',
 '1986',
 'the',
 'company',
 'said',
 'pretax',
 'net',
 'should',
 'rise',
 'to',
 'nine',
 'to',
 '10',
 'mln',
 'dlrs',
 'from',
 'six',
 'mln',
 'dlrs',
 'in',
 '1986',
 'and',
 'rental',
 'operation',
 'revenues',
 'to',
 '19',
 'to',
 '22',
 'mln',
 'dlrs',
 'from',
 '12',
 '5',
 'mln',
 'dlrs',
 'it',
 'said',
 'cash',
 'flow',
 'per',
 'share',
 'this',
 'year',
 'should',
 'be',
 '2',
 '50',
 'to',
 'three',
 'dlrs',
 'reuter',
 '3']

### create doc_dic

In [15]:
# Start from an empty dictionary and register one name per document:
# 'd1' -> 0, 'd2' -> 1, ... so that a name maps to the document's position.
doc_dic = gensim.corpora.Dictionary(prune_at=None)
doc_dic.token2id.update(
    {'d' + str(position + 1): position for position in range(len(doc))}
)

In [16]:
# Peek at the first five document names.
list(doc_dic.token2id)[:5]

['d1', 'd2', 'd3', 'd4', 'd5']

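A document in `doc_seq` is looked up through `doc_dic` as follows: the document name is mapped to its position, which then selects the word list.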

In [17]:
# Map the document name 'd1' to its position via doc_dic, then fetch that
# document's word list from doc_seq.
doc_seq[doc_dic.token2id['d1']]

['mcgrath',
 'rentcorp',
 'said',
 'as',
 'a',
 'result',
 'of',
 'its',
 'december',
 'acquisition',
 'of',
 'space',
 'co',
 'it',
 'expects',
 'earnings',
 'per',
 'share',
 'in',
 '1987',
 'of',
 '1',
 '15',
 'to',
 '1',
 '30',
 'dlrs',
 'per',
 'share',
 'up',
 'from',
 '70',
 'cts',
 'in',
 '1986',
 'the',
 'company',
 'said',
 'pretax',
 'net',
 'should',
 'rise',
 'to',
 'nine',
 'to',
 '10',
 'mln',
 'dlrs',
 'from',
 'six',
 'mln',
 'dlrs',
 'in',
 '1986',
 'and',
 'rental',
 'operation',
 'revenues',
 'to',
 '19',
 'to',
 '22',
 'mln',
 'dlrs',
 'from',
 '12',
 '5',
 'mln',
 'dlrs',
 'it',
 'said',
 'cash',
 'flow',
 'per',
 'share',
 'this',
 'year',
 'should',
 'be',
 '2',
 '50',
 'to',
 'three',
 'dlrs',
 'reuter',
 '3']

Create an instance of WordAndDoc2vec using doc_seq, word_dic and doc_dic.

```python
# NOTE(review): WordAndDoc2vec is not imported in the cells shown here -
# TODO confirm where it is defined and import it before running this.
# Build the model wrapper from the decoded documents and the two dictionaries.
wd2v = WordAndDoc2vec(doc_seq, word_dic, doc_dic, logging=False)
# Trailing bare expression: displays the instance when run as a notebook cell.
wd2v
```