In [1]:
#-----------------------------------------------------------------
# Description: Test for computing WK kernel
# Author: J Manttari
# Notes: clean_input_docs, doc_to_words and remove_nltk_stopwords 
#         can be disregarded if using only skilearn methods.
#-----------------------------------------------------------------
import pandas as pd       
from bs4 import BeautifulSoup             
import re
from nltk.corpus import stopwords # Import the stop word list
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
import pickle

In [2]:
def clean_input_docs(raw_docs): 
    print "Cleaning and parsing the docs...\n"
    clean_docs = []
    num_docs = len(raw_docs)
    for i in xrange( 0, num_docs ):
        # Status message in case it takes a while
        if( (i+1)%1000 == 0 ):
            print "doc %d of %d\n" % ( i+1, num_docs )                                                                    
        clean_docs.append( doc_to_words( raw_docs[i] ))
    return clean_docs

In [3]:
def doc_to_words( raw_doc ):

    doc_text = BeautifulSoup(raw_doc).get_text() 
    
    # Remove non-letters     
    letters_only = re.sub("[^a-zA-Z]", " ", doc_text) 
    
    #Convert to lower case, split into individual words
    words = letters_only.lower().split()                             

    #nltk english stopwords are less picky than sci-kit. Which should we use?
    words = remove_nltk_stopwords(words)
    
    return( " ".join( words ))   

In [4]:
def remove_nltk_stopwords( words ):
    stops = set(stopwords.words("english"))  
    
    # Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    return meaningful_words

In [5]:
docs = pd.read_csv("labeledTrainData.tsv", header=0, \
                    delimiter="\t", quoting=3)
num_docs = docs["review"].size

#clean input either with beautiful soup + regex 
#clean_docs = clean_input_docs(docs["review"][0:100])

# or trust sklearn to do it later
clean_docs = docs["review"][0:100] #slice of 100 is just to test without using to much computation

In [6]:
print "Creating the bag of words...\n"

#defaultanalyzer "word" removes non-chars in preprocessing and tokenizes words. does not remove "markup tokens"
#stop_words should be "english" if not using clean_input_docs()
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = "english",   \
                             max_features = 5000) #paper didn't say anything about max feature count? what should we have?

train_data_features = vectorizer.fit_transform(clean_docs)
train_data_features = train_data_features.toarray()
print train_data_features[0][0]
#print train_data_features
#print "idf..."
transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(train_data_features)
tfidf = tfidf.toarray() 
#print tfidf
print "done"
print tfidf.shape

Creating the bag of words...

0
done
(100, 4176)


In [7]:
#Compute Kernel Matrix. This takes ~15 minutes for all 25 000 doc with 5 000 features
Kmat = np.ones((len(tfidf),len(tfidf)))

for i in xrange( 0, len(tfidf) ):
    if( (i+1)%100 == 0 ):
        print "row %d of %d\n" % ( i+1, num_docs )       
    for j in xrange(0,len(tfidf)):
        Kmat[i][j] = np.dot(tfidf[i], tfidf[j])



row 100 of 25000



In [8]:
#save the Kernel matrix for use later as it can take a while to compute.
with open('WKKernelMat.pickle', 'w') as f:  # Python 3: open(..., 'wb')
    pickle.dump(Kmat, f)