In [78]:
import operator
import re
from nltk.corpus import reuters, stopwords
import cPickle as pickle

In [77]:
ACQ = 'acq'
CORN = 'corn'
CRUDE = 'crude'
EARN = 'earn'

TRAIN = 'train'
TEST = 'test'

categories = [ACQ, CORN, CRUDE, EARN]
splits = [TRAIN, TEST]

train_freqs = {
    ACQ: 114,
    CORN: 38,
    CRUDE: 76,
    EARN: 152
}

test_freqs = {
    ACQ: 25,
    CORN: 10,
    CRUDE: 15,
    EARN: 40
}

In [22]:
print(sum(train_freqs.values()))

380


In [23]:
print(sum(test_freqs.values()))

90


In [25]:
total = sum(train_freqs.values()) + sum(test_freqs.values())
print(total)

470


In [9]:
acq_docs = [reuters.raw(fname) for fname in reuters.fileids('acq')]
print(len(acq_docs))

2369


In [11]:
corn_docs = [reuters.raw(fname) for fname in reuters.fileids('corn')]
print(len(corn_docs))

237


In [13]:
crude_docs = [reuters.raw(fname) for fname in reuters.fileids('crude')]
print(len(crude_docs))

578


In [15]:
earn_docs = [reuters.raw(fname) for fname in reuters.fileids('earn')]
print(len(earn_docs))

3964


In [92]:
def clean(raw_doc):
    doc = raw_doc.lower()
    doc = re.sub('[^a-z]', ' ', doc)
    doc = re.sub(' +', ' ', doc)
    word_list = doc.split(' ')
    word_list = [word for word in word_list if word not in stopwords.words('english')]
    clean_doc = ' '.join(word_list)
    return clean_doc.strip()

In [53]:
def frequency_count(doc_list):
    word_freqs = {}
    for doc in doc_list:
        clean_doc = clean(doc)
        for word in clean_doc.split():
            word = word.lower()
            if word in word_freqs:
                word_freqs[word] += 1
            else:
                word_freqs[word] = 0
    return sorted(word_freqs.items(), key=operator.itemgetter(1), reverse=True)

In [54]:
acq_word_freqs = frequency_count(acq_docs)

In [63]:
acq_word_freqs[:10]

[(u'said', 7156),
 (u'lt', 3895),
 (u'dlrs', 2874),
 (u'company', 2384),
 (u'the', 2310),
 (u'mln', 2301),
 (u'shares', 2078),
 (u'inc', 2076),
 (u'pct', 1984),
 (u'corp', 1650)]

In [56]:
corn_word_freqs = frequency_count(corn_docs)

In [62]:
corn_word_freqs[:10]

[(u'said', 696),
 (u'tonnes', 595),
 (u'corn', 575),
 (u'mln', 564),
 (u's', 483),
 (u'u', 482),
 (u'the', 400),
 (u'nil', 288),
 (u'pct', 288),
 (u'wheat', 260)]

In [58]:
crude_word_freqs = frequency_count(crude_docs)

In [64]:
crude_word_freqs[:10]

[(u'said', 2425),
 (u'oil', 2245),
 (u'mln', 1017),
 (u'the', 898),
 (u'dlrs', 793),
 (u's', 667),
 (u'u', 649),
 (u'crude', 644),
 (u'pct', 584),
 (u'prices', 577)]

In [65]:
earn_word_freqs = frequency_count(earn_docs)

In [66]:
earn_word_freqs[:10]

[(u'vs', 14140),
 (u'mln', 11506),
 (u'cts', 7949),
 (u'net', 6623),
 (u'dlrs', 6092),
 (u'loss', 4960),
 (u'shr', 4156),
 (u'lt', 4028),
 (u'said', 3450),
 (u'year', 3194)]

In [67]:
train_doc_fnames = [fname for fname in reuters.fileids() if fname.startswith('train')]
len(train_doc_fnames)

7769

In [69]:
test_doc_fnames = [fname for fname in reuters.fileids() if fname.startswith('test')]
len(test_doc_fnames)

3019

In [81]:
ds = {}

In [91]:
for s in splits:
    ds[s] = {}
    for c in categories:
        raw_docs = [reuters.raw(fn) for fn in reuters.fileids(c) if s in fn]
        for d in raw_docs:
            if len(d) < 30:
                print(d)
        ds[s][c] = raw_docs
        print(s, c, len(ds[s][c]))

16-MAR-1987
  16-MAR-1987


('train', 'acq', 1650)
('train', 'corn', 181)
29-MAR-1987
  29-MAR-1987


12-MAR-1987
  12-MAR-1987


('train', 'crude', 389)
30-MAR-1987
  30-MAR-1987


12-MAR-1987
  12-MAR-1987


12-MAR-1987
  12-MAR-1987


17-MAR-1987
  17-MAR-1987


18-MAR-1987
  18-MAR-1987


20-MAR-1987
  20-MAR-1987


20-MAR-1987
  20-MAR-1987


23-MAR-1987
  23-MAR-1987


25-MAR-1987
  25-MAR-1987


25-MAR-1987
  25-MAR-1987


('train', 'earn', 2877)
19-OCT-1987
  19-OCT-1987


('test', 'acq', 719)
('test', 'corn', 56)
19-OCT-1987
  19-OCT-1987


('test', 'crude', 189)
13-APR-1987
  13-APR-1987


13-APR-1987
  13-APR-1987


18-JUN-1987
  18-JUN-1987


20-OCT-1987
  20-OCT-1987


19-OCT-1987
  19-OCT-1987


19-OCT-1987
  19-OCT-1987


('test', 'earn', 1087)


In [88]:
pickle.dump(ds, open('modeapte_full_raw.pkl', 'wb'))