# DATA620: Week 10/11 Assignment

> #### Bryant Chang, Thomas Detzel, Sandipayan Nandi, and Erik Nylander

For this project, we use the Reuters Corpus ([Reuters 21578](http://disi.unitn.it/moschitti/corpora.htm)) data set. We especifically use the corpus included in the [NLTK.Corpus](http://www.nltk.org/book/ch02.html) package. It contains 10,788 news documents totalling 1.3 
million words. The documents have been classified into 90 topics, and grouped into two sets, called "training" and "test".

Categories in the Reuters Corpus overlap each other, because a news story often covers multiple topics. Our goal is to build a model with the training dataset in order to predict the class of new documents in the test dataset.

In [1]:
import nltk, re, pprint
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.collocations import BigramCollocationFinder
from nltk.corpus import stopwords
from collections import Counter
from itertools import *
import codecs
import urllib2
import sys
import os
import string
import random
import itertools as it
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import matplotlib.colors as colors
%matplotlib inline
plot.rcParams['figure.figsize'] = (21, 14)
from scipy.stats import rankdata

# Python 2 users only
from __future__ import division  
from __future__ import unicode_literals
from __future__ import print_function

# Load the corpus
from nltk.corpus import reuters

In [2]:
# Check the number of documents
len(reuters.fileids())

10788

In [3]:
# Check the number of words
len(reuters.words())

1720901

In [4]:
# Check the number of distinct words
len(set(reuters.words()))

41600

In [5]:
# Print first ten documents
reuters.fileids()[:10]

['test/14826',
 'test/14828',
 'test/14829',
 'test/14832',
 'test/14833',
 'test/14839',
 'test/14840',
 'test/14841',
 'test/14842',
 'test/14843']

In [6]:
# Print last ten documents
reuters.fileids()[10778:10788]

['training/9982',
 'training/9984',
 'training/9985',
 'training/9988',
 'training/9989',
 'training/999',
 'training/9992',
 'training/9993',
 'training/9994',
 'training/9995']

In [7]:
# Check the number of categories
len(reuters.categories())

90

In [8]:
# Print the first ten categories
reuters.categories()[:10]

[u'acq',
 u'alum',
 u'barley',
 u'bop',
 u'carcass',
 u'castor-oil',
 u'cocoa',
 u'coconut',
 u'coconut-oil',
 u'coffee']

In [9]:
# Create a document file, with each document as its category tag
documents = [(list(reuters.words(fileid)), category)
    for category in reuters.categories()
    for fileid in reuters.fileids(category)]

In [10]:
# Check first one
documents[0][0][:5], documents[0][1]

([u'SUMITOMO', u'BANK', u'AIMS', u'AT', u'QUICK'], u'acq')

----

### X.X create word_features from the text example

In [11]:
## word_features from text
all_words = nltk.FreqDist(w.lower() for w in reuters.words())
word_features = all_words.keys()[:2000]

In [12]:
# feature extractor: find whether a document contains any of the words in word_features
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [13]:
# test it - show first 10 
Counter(document_features('Reuters/training/cpu/0003746')).most_common(10)

[(u'contains(s)', True),
 (u'contains(corporate)', False),
 (u'contains(casse)', False),
 (u'contains(pressed)', False),
 (u'contains(jay)', False),
 (u'contains(pfr)', False),
 (u'contains(barred)', False),
 (u'contains(broaden)', False),
 (u'contains(workforces)', False),
 (u'contains(aberrational)', False)]

In [14]:
# create feature set
featuresets = [(document_features(d), c) for (d,c) in documents]

In [15]:
# define train and test sets
train_set, test_set = featuresets[100:], featuresets[:100]

In [16]:
# train classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [17]:
# test classifier
print("Accuracy is {}%.".format(nltk.classify.accuracy(classifier, test_set)*100))

Accuracy is 63.0%.


In [18]:
classifier.show_most_informative_features(5)

Most Informative Features
       contains(guilder) = True              dfl : acq    =   1324.2 : 1.0
       contains(heating) = True             heat : acq    =   1021.5 : 1.0
           contains(wet) = True           copra- : earn   =    991.2 : 1.0
       contains(opposed) = True           copra- : earn   =    991.2 : 1.0
      contains(shortage) = True           copra- : earn   =    991.2 : 1.0


----

### X.0 Refine the word_features to exclude stopwords

In [11]:
# remove stop words
r_words = [word.lower() for word in reuters.words() if word.isalpha()]
stopwords = stopwords.words('english')
r_words = [w for w in r_words if w not in stopwords]

In [12]:
# create list of 2,000 most common words
all_words = nltk.FreqDist(r_words)
word_features2 = all_words.most_common(2000)

# get just the words, no counts
word_features2 = [w for (w,v) in word_features2]

In [13]:
# update feature extractor with new features
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features2:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [14]:
# create feature set 2
featuresets2 = [(document_features(d), c) for (d,c) in documents]

In [15]:
# define train and test sets
train_set2, test_set2 = featuresets2[100:], featuresets2[:100]

In [16]:
# train classifier
classifier2 = nltk.NaiveBayesClassifier.train(train_set2)

In [17]:
# test classifier
print("Accuracy is {}%.".format(nltk.classify.accuracy(classifier2, test_set2)*100))

Accuracy is 78.0%.


In [18]:
classifier2.show_most_informative_features(10)

Most Informative Features
          contains(palm) = True           palm-o : earn   =   2611.1 : 1.0
         contains(ounce) = True           pallad : earn   =   2312.9 : 1.0
    contains(economists) = True             rand : earn   =   2312.9 : 1.0
        contains(coffee) = True           coffee : earn   =   2256.3 : 1.0
        contains(rubber) = True           rubber : earn   =   2246.8 : 1.0
       contains(follows) = True           lin-oi : earn   =   2202.8 : 1.0
         contains(index) = True              lei : earn   =   1899.9 : 1.0
       contains(sorghum) = True           sorghu : earn   =   1850.3 : 1.0
    contains(seasonally) = True           housin : earn   =   1825.2 : 1.0
     contains(vegetable) = True           coconu : earn   =   1817.3 : 1.0
