In [1]:
from nltk.corpus import reuters
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer
import nltk

import numpy as np
import os
import pickle

from collections import Counter
import copy
import re

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import ShuffleSplit

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fei960922/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
labels = reuters.categories()
n_topic = len(labels)
print(n_topic, labels)

90 ['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [4]:
documents = reuters.fileids()
test = [d for d in documents if d.startswith('test/')]
train = [d for d in documents if d.startswith('training/')]
print(len(test), len(train))

3019 7769


In [5]:
mlb = MultiLabelBinarizer()
raw_train = [reuters.raw(doc_id) for doc_id in train]
raw_test = [reuters.raw(doc_id) for doc_id in test]
raw = raw_train + raw_test
labels_train = mlb.fit_transform([reuters.categories(doc_id)
                                 for doc_id in train])
labels_test = mlb.transform([reuters.categories(doc_id)
                            for doc_id in test])
class_names = mlb.classes_

In [6]:
tokenizer = nltk.tokenize.RegexpTokenizer(r'[a-zA-Z]{3,}')
porter_stemmer = nltk.stem.porter.PorterStemmer()  
stop_words = stopwords.words("english")
min_count = 3
vocab = Counter()
# build vocab
for line in raw_train:
    ll = tokenizer.tokenize(line)
    for word in ll:
        word = word.lower()
        word = porter_stemmer.stem(word)
        if word not in stop_words:
            vocab[word] += 1
original_vocab = copy.deepcopy(vocab)
for word in original_vocab.keys():
    if vocab[word] < min_count:
        del vocab[word]
# create ixtoword and wordtoix lists
ixtoword = {}
# period at the end of the sentence. make first dimension be end token
ixtoword[0] = 'END'
ixtoword[1] = 'UNK'
wordtoix = {}
wordtoix['END'] = 0
wordtoix['UNK'] = 1
ix = 2
for w in vocab:
    wordtoix[w] = ix
    ixtoword[ix] = w
    ix += 1
vocab = wordtoix
len(vocab)

8685

In [7]:
data_dir = "data/reuters"
VOCAB_URL = data_dir+'/vocab.pkl'
train_url = data_dir+'/train'
test_url = data_dir+'/test'
task_url = data_dir+'/task'

In [8]:
with open(VOCAB_URL, 'wb') as f:
    pickle.dump(vocab, f, pickle.HIGHEST_PROTOCOL)

In [9]:
def encode_corpus_eng(vocab, corpus):
    import nltk
    encoded_corpus = []
    for line in corpus:
        encoded_sent = []
        ll = tokenizer.tokenize(line)
        for word in ll:
            word = word.lower()
            word = porter_stemmer.stem(word)
            if word in vocab:
                encoded_sent.append(vocab[word])
            else:
                encoded_sent.append(1)
        encoded_sent.append(0)
        encoded_corpus.append(np.array(encoded_sent))
    return np.array(encoded_corpus)

In [10]:
task_corpus = encode_corpus_eng(vocab, raw_test)
np.save(task_url+'.multi', np.array(task_corpus))
np.save(task_url+'.labels.multi', np.array([labels_test, class_names]))

In [11]:
data_withorder = encode_corpus_eng(vocab, raw_train)
sss = ShuffleSplit(1, test_size=0.2, random_state=42)
train_index, test_index = list(sss.split(data_withorder, labels_train))[0]

In [12]:
train_data = data_withorder[train_index]
test_data = data_withorder[test_index]
train_labels = labels_train[train_index]
test_labels = labels_train[test_index]
np.save(train_url + '.multi', train_data)
np.save(train_url + '.labels.multi', np.array([train_labels, class_names]))
np.save(test_url + '.multi', test_data)
np.save(test_url + '.labels.multi', np.array([test_labels, class_names]))    

In [17]:

np.save(train_url + '.multi', train_data)
np.load(train_url + '.multi.npy')

ValueError: Object arrays cannot be loaded when allow_pickle=False