In [2]:
import numpy as np
import re
import cPickle as pickle
from nltk.corpus import reuters, stopwords

In [2]:
TRAIN = 'train'
TEST = 'test'
SPLITS = [TRAIN, TEST]

In [3]:
ACQ = 'acq'
CORN = 'corn'
CRUDE = 'crude'
EARN = 'earn'
CLASSES = [ACQ, CORN, CRUDE, EARN]

In [4]:
sizes = {
    TRAIN: {
        EARN: 152,
        ACQ: 114,
        CRUDE: 76,
        CORN: 38
    },
    TEST: {
        EARN: 40,
        ACQ: 25,
        CRUDE: 15,
        CORN: 10
    }
}

In [48]:
total = 152 + 114 + 76 + 38 + 40 + 25 + 15 + 10

In [58]:
earn_num = 152 + 40
acq_num = 114 + 25

In [57]:
earn_num / float(total)

0.4085106382978723

In [59]:
acq_num / float(total)

0.2957446808510638

In [53]:
(38 + 10) / float(total)

0.10212765957446808

In [5]:
labels = {
    ACQ: 0,
    CORN: 1,
    CRUDE: 2,
    EARN: 3,
}

In [6]:
def clean(doc):
    doc = re.sub('[^a-zA-Z]', ' ', doc)
    doc = re.sub(' +', ' ', doc)
    doc = ' '.join([w for w in doc.split() if w not in stopwords.words('english')])
    return doc.lower().strip()

In [7]:
data = {}
for split in SPLITS:
    data[split] = []
    for cls in CLASSES:
        file_list = [clean(reuters.raw(d)) for d in reuters.fileids(cls) if d.startswith(split)]
        # disregard documents that are too short
        long_list = [d for d in file_list if len(d) > 15]
        # sort remaining documents by length
        #sort_list = sorted(long_list, key=len)
        # split off however many we need
        final_list = long_list[:sizes[split][cls]]
        for doc in final_list:
            data[split].append((doc, cls))
        print(split, cls, len(final_list))

('train', 'acq', 114)
('train', 'corn', 38)
('train', 'crude', 76)
('train', 'earn', 152)
('test', 'acq', 25)
('test', 'corn', 10)
('test', 'crude', 15)
('test', 'earn', 40)


In [8]:
pickle.dump(data, open('reuters.pkl', 'wb'))

In [71]:
ds = pickle.load(open('../dataset/reuters.pkl', 'rb'))

In [74]:
ds['train'][0]

(u'computer terminal systems lt cpml completes sale computer terminal systems inc said completed sale shares common stock warrants acquire additional one mln shares lt sedio n v lugano switzerland dlrs the company said warrants exercisable five years purchase price dlrs per share computer terminal said sedio also right buy additional shares increase total holdings pct computer terminal outstanding common stock certain circumstances involving change control company the company said conditions occur warrants would exercisable price equal pct common stock market price time exceed dlrs per share computer terminal also said sold technolgy rights dot matrix impact technology including future improvements lt woodco inc houston tex dlrs but said would continue exclusive worldwide licensee technology woodco the company said moves part reorganization plan would help pay current operation costs ensure product delivery computer terminal makes computer generated labels forms tags ticket printers te

In [76]:
ds['test'][0]

(u'sumitomo bank aims at quick recovery from merger sumitomo bank ltd lt sumi t certain lose status japan profitable bank result merger heiwa sogo bank financial analysts said osaka based sumitomo desposits around trillion yen merged heiwa sogo small struggling bank estimated billion dlrs unrecoverable loans october but despite link sumitomo president koh komatsu told reuters confident bank quickly regain position we back position first place within three years komatsu said interview he said merger initially reduce sumitomo profitability efficiency vastly expand sumitomo branch network tokyo metropolitan area relatively weak but financial analysts divided whether quickly gamble pay some said sumitomo may paid much heiwa sogo view smaller bank large debts others argue merger cost effective creating comparable branch network scratch the analysts agreed bank aggressive it expanded overseas entered lucrative securities business geared domestic competition questioned wisdom moves they made 

In [81]:
for i, d in enumerate(ds['train']):
    for j, f in enumerate(ds['train']):
        if i != j and d == f:
            print('prob')
            print(d)
            print(f)
            print(i, j)
    

prob
(u'mexican first qtr crude exports seen at dlrs the average price mexico crude oil exports first quarter dlrs per barrel according preliminary figures issued press release state oil company petroleos mexicanos pemex it gave direct comparison year ago figure said crude products sales expected rise billion dlrs quarter mln dlrs higher expected pct better year ago quarter prospects second quarter relatively favourable crude exports seen mln bpd expected mln month february mln january', 'crude')
(u'mexican first qtr crude exports seen at dlrs the average price mexico crude oil exports first quarter dlrs per barrel according preliminary figures issued press release state oil company petroleos mexicanos pemex it gave direct comparison year ago figure said crude products sales expected rise billion dlrs quarter mln dlrs higher expected pct better year ago quarter prospects second quarter relatively favourable crude exports seen mln bpd expected mln month february mln january', 'crude')
(

In [7]:
ds['train'][1]

(u'national amusements again ups viacom lt via bid viacom international inc said lt national amusements inc raised value offer viacom publicly held stock the company said special committee board plans meet later today consider offer one submitted march one lt mcv holdings inc a spokeswoman unable say committee met planned yesterday viacom said national amusements arsenal holdings inc subsidiary raised amount cash offering viacom share cts dlrs value fraction share exchangeable arsenal holdings preferred included raised cts dlrs national amusements already owns pct viacom stock',
 'acq')

In [26]:
categories = ['acq', 'corn', 'crude', 'earn']

In [27]:
documents = {}
for c in categories:
    documents[c] = [reuters.raw(d) for d in reuters.fileids(c)]

In [41]:
len(documents['acq']) / float(total)

0.33142137660884163

In [42]:
len(documents['corn'])  / float(total)

0.03315612758813654

In [43]:
len(documents['crude'])  / float(total)

0.0808617795187465

In [44]:
len(documents['earn'])  / float(total)

0.5545607162842753

In [34]:
total = sum([len(documents[c]) for c in categories])

In [40]:
total

7148

In [67]:
flat_docs = documents.values()[0] + documents.values()[1] + documents.values()[2] + documents.values()[3]

In [68]:
len(flat_docs)

7148

In [70]:
for i, d1 in enumerate(flat_docs):
    for j, d2 in enumerate(flat_docs):
        if i != j and d1 == d2:
            print('problem')
                
    

problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem
problem


KeyboardInterrupt: 