In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from IPython.display import display

## Dataset
https://www.cs.umb.edu/~smimarog/textmining/datasets/

In [2]:
train = pd.read_csv('r8-train-all-terms.txt', header=None, sep='\t')
test = pd.read_csv('r8-test-all-terms.txt', header=None, sep='\t')

train.columns = ['label', 'content']
test.columns = ['label', 'content']

In [3]:
display(train.info())
display(train.head())
display(train.describe())

# Labels
display(train['label'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5485 entries, 0 to 5484
Data columns (total 2 columns):
label      5485 non-null object
content    5485 non-null object
dtypes: object(2)
memory usage: 85.8+ KB


None

Unnamed: 0,label,content
0,earn,champion products ch approves stock split cham...
1,acq,computer terminal systems cpml completes sale ...
2,earn,cobanco inc cbco year net shr cts vs dlrs net ...
3,earn,am international inc am nd qtr jan oper shr lo...
4,earn,brown forman inc bfd th qtr net shr one dlr vs...


Unnamed: 0,label,content
count,5485,5485
unique,8,5423
top,earn,fed sets two billion dlr customer repurchase f...
freq,2840,3


earn        2840
acq         1596
crude        253
trade        251
money-fx     206
interest     190
ship         108
grain         41
Name: label, dtype: int64

In [4]:
display(test.info())
display(test.head())
display(test.describe())

# Labels
display(test['label'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2189 entries, 0 to 2188
Data columns (total 2 columns):
label      2189 non-null object
content    2189 non-null object
dtypes: object(2)
memory usage: 34.3+ KB


None

Unnamed: 0,label,content
0,trade,asian exporters fear damage from u s japan rif...
1,grain,china daily says vermin eat pct grain stocks a...
2,ship,australian foreign ship ban ends but nsw ports...
3,acq,sumitomo bank aims at quick recovery from merg...
4,earn,amatil proposes two for five bonus share issue...


Unnamed: 0,label,content
count,2189,2189
unique,8,2176
top,earn,fhlbb changes short term discount note rates t...
freq,1083,3


earn        1083
acq          696
crude        121
money-fx      87
interest      81
trade         75
ship          36
grain         10
Name: label, dtype: int64

## Glove

In [5]:
# Download dataset
! wget http://nlp.stanford.edu/data/glove.6B.zip -O glove.zip
! unzip glove.zip -d ./glove
! rm -rf glove.zip
! ls glove

--2018-11-04 00:15:01--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2018-11-04 00:15:02--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.zip’


2018-11-04 00:19:46 (2.90 MB/s) - ‘glove.zip’ saved [862182613/862182613]

Archive:  glove.zip
  inflating: ./glove/glove.6B.50d.txt  
  inflating: ./glove/glove.6B.100d.txt  
  inflating: ./glove/glove.6B.200d.txt  
  inflating: ./glove/glove.6B.300d.txt  
glove.6B.100d.txt  glove.6B.200d.txt  glove.6B.300d.txt  glove.6B.50d.txt


In [6]:
class GloveVectorizer:
    def __init__(self):
        word2vec = {}
        embedding = []
        idx2word = []
        with open('glove/glove.6B.50d.txt') as f:
            for line in f:
                values = line.split()
                word = values[0]
                vec = np.asarray(values[1:], dtype='float32')
                word2vec[word] = vec
                embedding.append(vec)
                idx2word.append(word)
        print('Found %s word vectors.' % len(word2vec))

        # save for later
        self.word2vec = word2vec
        self.embedding = np.array(embedding)
        self.word2idx = {v:k for k,v in enumerate(idx2word)}
        self.V, self.D = self.embedding.shape


    def fit(self, data):
        pass


    def transform(self, data):
        X = np.zeros((len(data), self.D))
        n = 0
        emptycount = 0
        for sentence in data:
            tokens = sentence.lower().split()
            vecs = []
            for word in tokens:
                if word in self.word2vec:
                    vec = self.word2vec[word]
                    vecs.append(vec)
            if len(vecs) > 0:
                vecs = np.array(vecs)
                X[n] = vecs.mean(axis=0)
            else:
                emptycount += 1
            n += 1
        print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
        return X


    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)

## Feature Engineering
Glove Vectorizer

In [7]:
vectorizer = GloveVectorizer()

Found 400000 word vectors.


In [8]:
X_train = vectorizer.fit_transform(train['content'])
y_train = train['label']

display(train['content'].shape)
display(X_train.shape)
display(y_train.shape)

Numer of samples with no words found: 0 / 5485


(5485,)

(5485, 50)

(5485,)

In [9]:
X_test = vectorizer.fit_transform(test['content'])
y_test = test['label']

display(test['content'].shape)
display(X_test.shape)
display(y_test.shape)

Numer of samples with no words found: 0 / 2189


(2189,)

(2189, 50)

(2189,)

In [10]:
display(X_train[0])
display(y_train[0])

array([ 2.62985915e-01,  4.34498340e-02,  1.07159220e-01,  2.72644818e-01,
        2.89724290e-01,  1.09210342e-01, -4.20405596e-01, -2.63424784e-01,
       -1.30949691e-02, -1.06222428e-01,  1.33675948e-01, -3.83482501e-02,
       -2.41836533e-01, -2.28609309e-01,  2.99078017e-01,  1.69503644e-01,
       -2.04480574e-01, -5.62423840e-02, -4.67679679e-01, -3.28363597e-01,
        5.72856367e-01, -1.21237598e-01,  5.47765307e-02, -2.59782881e-01,
       -4.34731096e-01, -1.37025249e+00, -1.29700288e-01, -1.98638290e-01,
       -3.12043548e-01,  3.61148454e-03,  3.03834558e+00,  1.59490004e-01,
       -2.09241539e-01,  1.72576327e-02,  1.32028550e-01, -5.04550338e-01,
       -3.79321687e-02,  2.86173113e-02,  1.87277898e-01, -2.71893501e-01,
       -1.60743788e-01, -4.75056320e-02,  1.68933272e-01, -3.97280008e-02,
       -2.22825170e-01,  1.77603111e-01, -3.19965959e-01,  1.98276863e-01,
       -1.14674075e-03, -3.99776883e-02])

'earn'

In [11]:
display(X_test[5])
display(y_test[5])

array([ 0.00628271,  0.15836486,  0.79071522,  0.32796535,  0.04768872,
        0.01413339,  0.23070997, -0.57192236,  0.36161029,  0.14726688,
       -0.08490338,  0.00232739,  0.11899498, -0.70322466,  0.17920066,
        0.00483669, -0.24634524, -0.10109022, -0.33332342,  0.01012549,
        0.59175676, -0.45168051,  0.06631421, -0.14066973, -0.48459437,
       -0.42746809,  0.05004314, -0.39533252, -0.02435627,  0.00988586,
        1.82634044,  0.49018195,  0.32961842,  0.82186329,  0.06027789,
       -0.43832195, -0.20703138, -0.30820474,  0.41553509, -0.68371546,
        0.44538346, -0.14148799,  0.70002156, -0.33031902, -0.09758198,
        0.38192093,  0.07701898,  0.27802876,  0.3531909 ,  0.12002688])

'earn'

## Model

In [12]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=200)
model.fit(X_train, y_train)

  from numpy.core.umath_tests import inner1d


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [13]:
print("train score:", model.score(X_train, y_train))

train score: 0.9992707383773929


## Evaluate

In [14]:
print("test score:", model.score(X_test, y_test))

test score: 0.9333028780264961
