In [1]:
# Load the custom styling used to render this notebook.
import os

# path : remember the notebook's working directory so it can be restored later
path = os.getcwd()

# `formats` is imported while the working directory is ../notebook_format,
# so step into that directory before importing it and loading the style
format_dir = os.path.join('..', 'notebook_format')
os.chdir(format_dir)
from formats import load_style
load_style(css_style = 'custom2.css')

In [2]:
os.chdir(path)
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 8, 6 # change default figure size
plt.rcParams['font.size'] = 12 # and font size

# 1. magic to print version
# 2. magic so that the notebook will reload external python modules
%load_ext watermark
%load_ext autoreload 
%autoreload 2

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,matplotlib,scikit-learn

Ethen 2016-10-20 17:06:42 

CPython 3.5.2
IPython 4.2.0

numpy 1.11.1
pandas 0.18.1
matplotlib 1.5.1
scikit-learn 0.17.1


# LSA

In [5]:
import pickle

# NOTE(review): unpickling can execute arbitrary code, so only load this
# file if it comes from a trusted source.
FILEPATH = os.path.join('data', 'raw_text_dataset.pickle')

# the context manager closes the file handle as soon as loading finishes,
# instead of leaving it open for the lifetime of the kernel
with open(FILEPATH, 'rb') as f:
    raw_text_dataset = pickle.load(f)

# the pickled object is indexed positionally; judging by the variable names
# the order is: train documents, train labels, test documents, test labels
X_train_raw = raw_text_dataset[0]
y_train_labels = raw_text_dataset[1]
X_test_raw = raw_text_dataset[2]
y_test_labels = raw_text_dataset[3]

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline

In [26]:
# Turn the raw documents into a tf-idf feature matrix. The vectorizer:
# - removes english stop words
# - drops terms that occur in more than half of the docs (max_df = 0.5)
# - drops terms that occur in fewer than two docs (min_df = 2)
# - keeps only the 10,000 most frequently occurring terms in the corpus
tfidf_params = {
    'max_df': 0.5,
    'min_df': 2,
    'max_features': 10000,
    'stop_words': 'english'
}
tfidf = TfidfVectorizer(**tfidf_params)

# learn the vocabulary / idf weights and transform the training documents
X_train_tfidf = tfidf.fit_transform(X_train_raw)
print(X_train_tfidf.shape)

(4743, 10000)


After converting the raw documents to a tf-idf feature representation, we project the tf-idf vectors onto the first N components found by truncated SVD (N is a hyperparameter that we have to choose). Although the dimensionality of the resulting features is much lower than that of the original tf-idf vectors, they are considered to be a more representative set of features. After reducing the dimensionality, we once again normalize the features so that every document vector has unit norm.

In [40]:
# truncated SVD works on term count/tf-idf matrices,
# it is known as latent semantic analysis (LSA);
# the default solver is randomized, so the seed is fixed to make the
# components (and everything derived from them) reproducible across runs
svd = TruncatedSVD(n_components = 100, n_iter = 15, random_state = 1234)

# after the projection, rescale every document vector to unit norm
lsa = Pipeline([ ('svd', svd), ('normalize', Normalizer(copy = False)) ])

# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)
print(X_train_lsa.shape)

(4743, 100)


In [47]:
# component matrix of the fitted SVD, one row per component
component_weights = svd.components_
component_weights

array([[  5.16868641e-03,   2.67730482e-01,   2.19358567e-03, ...,
          1.35045340e-04,   2.48934912e-04,   7.16342505e-04],
       [  8.76705946e-03,  -2.05275420e-02,  -5.15322100e-04, ...,
          1.02237316e-03,   1.45021891e-03,   1.04164490e-03],
       [ -7.74525340e-04,  -1.63326764e-01,  -1.90116397e-03, ...,
          6.46334094e-06,  -8.38084996e-05,  -3.44853310e-04],
       ..., 
       [ -1.36943253e-02,   9.30230717e-03,   4.15289349e-03, ...,
          2.80562069e-04,   7.55294373e-04,   2.88838501e-03],
       [ -8.18409277e-02,   2.41325465e-02,   6.83291688e-03, ...,
         -1.44290799e-04,   4.84493144e-04,  -1.59021975e-03],
       [  3.25538715e-02,   3.13933199e-02,   3.95766245e-03, ...,
          7.84372647e-05,  -3.95042967e-04,  -9.91719990e-04]])

In [46]:
# the fitted SVD is reachable through the pipeline's 'svd' step
fitted_svd = lsa.named_steps['svd']
fitted_svd.components_.shape

(100, 10000)

In [36]:
# `lsa` is a Pipeline, which does not expose the SVD's fitted attributes;
# read the explained variance ratios from its 'svd' step instead
lsa.named_steps['svd'].explained_variance_ratio_

array([ 0.03204465,  0.01094008,  0.01495829,  0.00791216,  0.00659883,
        0.00630385,  0.00625253,  0.00564834,  0.0048622 ,  0.0045222 ,
        0.0045885 ,  0.00409799,  0.00381508,  0.00313612,  0.00262854,
        0.00232603,  0.00215106,  0.00200291,  0.00192625,  0.00186778,
        0.00179161,  0.00164662,  0.00158817,  0.00147094,  0.00143266,
        0.00140267,  0.00124479,  0.00122631,  0.00120548,  0.00116956,
        0.00112467,  0.00108339,  0.00105352,  0.00101777,  0.00099316,
        0.00096049,  0.00094521,  0.00093184,  0.00091132,  0.00089602,
        0.00088391,  0.00085753,  0.00085188,  0.00082159,  0.00080896,
        0.00079958,  0.00077914,  0.00076902,  0.00076393,  0.00075293,
        0.00074145,  0.00072395,  0.00070824,  0.00069893,  0.00069274,
        0.00067801,  0.00066727,  0.00065434,  0.00064619,  0.00064026,
        0.00063137,  0.00062124,  0.00061347,  0.00059923,  0.00058311,
        0.00057426,  0.00055699,  0.00055297,  0.00054891,  0.00

In [34]:
# `lsa` is a Pipeline, so the component matrix has to be read from its
# 'svd' step rather than from the pipeline object itself
lsa.named_steps['svd'].components_.shape

(100, 10000)

In [None]:
# Plot the highest-weighted terms of the first few SVD components.
N_COMPONENTS_TO_PLOT = 10
N_TOP_TERMS = 10

# vocabulary terms, looked up by column index of the tf-idf matrix;
# newer scikit-learn releases expose them through get_feature_names_out,
# older ones only through get_feature_names
if hasattr(tfidf, 'get_feature_names_out'):
    feat_names = tfidf.get_feature_names_out()
else:
    feat_names = tfidf.get_feature_names()

# never ask for more components than the fitted SVD actually has
n_plots = min(N_COMPONENTS_TO_PLOT, svd.components_.shape[0])

for compNum in range(n_plots):
    comp = svd.components_[compNum]

    # argsort is ascending, so the last entries hold the largest weights.
    # Keeping them in ascending order puts the strongest term at the top of
    # the chart, because the horizontal bar graph draws the first item at
    # the bottom.
    top_indices = np.argsort(comp)[-N_TOP_TERMS:]
    terms = [feat_names[weightIndex] for weightIndex in top_indices]
    weights = comp[top_indices]
    positions = np.arange(len(top_indices)) + .5    # the bar centers on the y axis

    # Display these terms and their weights as a horizontal bar graph.
    fig, ax = plt.subplots()
    ax.barh(positions, weights, align = 'center')
    ax.set_yticks(positions)
    ax.set_yticklabels(terms)
    ax.set_xlabel('Weight')
    ax.set_title('Strongest terms for component %d' % (compNum))
    ax.grid(True)
    plt.show()