In [88]:
import pickle_util
import nltk
import re
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

%load_ext autoreload
%autoreload 2
%pylab inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Populating the interactive namespace from numpy and matplotlib


In [122]:
# Configuration: which dataset split this run processes.
# g_name is the name handed to pickle_util.load_obj() in the load cell and the
# suffix appended to every artifact name in the save cell.
g_name = "dev"
# g_name = "train"

In [123]:
# Load the pickled object for the configured split and pull out the documents.
# NOTE(review): pickle_util is a project helper not shown here; if it wraps
# pickle.load, only load files from a trusted source.
f_name = g_name
data = pickle_util.load_obj(f_name)
# The documents sit two levels down, under data['hits']['hits'].
hits_section = data['hits']
docs = hits_section['hits']

In [124]:
# English Snowball stemmer; tokenize_and_stem() reads this module-level object.
stemmer = nltk.stem.SnowballStemmer('english')
# NLTK's English stop-word list. The vectorizers in the cells shown here pass
# stop_words='english' or nothing, so this list is not referenced by them.
stopwords = nltk.corpus.stopwords.words('english')

In [125]:
# http://brandonrose.org/clustering

def tokenize_and_stem(text):
    """Return the stems of every letter-bearing word token in ``text``.

    The text is split into sentences and each sentence into word tokens, so
    that punctuation ends up as its own token. Tokens containing no ASCII
    letter (numbers, bare punctuation) are discarded; each remaining token is
    passed through the module-level ``stemmer``.

    Returns a list of stems in document order.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # keep only tokens that contain at least one letter
            if re.search('[a-zA-Z]', word):
                stems.append(stemmer.stem(word))
    return stems


def tokenize_only(text):
    """Return the lower-cased, letter-bearing word tokens of ``text``.

    The text is split into sentences and each sentence into word tokens, so
    that punctuation ends up as its own token. Every token is lower-cased and
    those containing no ASCII letter (numbers, bare punctuation) are dropped.
    No stemming is applied.

    Returns a list of tokens in document order.
    """
    lowered = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [token for token in lowered if re.search('[a-zA-Z]', token)]

In [126]:
# Build the vocabulary lookup: one row per token occurrence across all
# documents, indexed by the token's stem, with the lower-cased token as value.
totalvocab_stemmed = []
totalvocab_tokenized = []

if_bool = True

for d in docs:
    content = d['_source']['content']

    # Both tokenizers drop the same tokens, so the two lists stay aligned.
    allwords_stemmed = tokenize_and_stem(content)
    totalvocab_stemmed += allwords_stemmed

    allwords_tokenized = tokenize_only(content)
    totalvocab_tokenized += allwords_tokenized

vocab = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)
print("vocab len: " + str(len(vocab)))

vocab len: 37685


In [127]:
# TF-IDF matrix over the article bodies (rows follow the order of `docs`).
docs_content = []
for doc in docs:
    docs_content.append(doc['_source']['content'])

# Terms are stemmed 1- to 3-grams produced by tokenize_and_stem. The float
# min_df / max_df values are document-frequency proportions, so only terms
# appearing in between 20% and 80% of the documents are kept.
content_vectorizer = TfidfVectorizer(
    tokenizer=tokenize_and_stem,
    stop_words='english',
    ngram_range=(1,3),
    min_df=0.2,
    max_df=0.8,
    max_features=200000,
    use_idf=True,
)
X_content = content_vectorizer.fit_transform(docs_content)
terms_content = content_vectorizer.get_feature_names()
print(X_content.shape)

(100, 40)


In [128]:
# TF-IDF matrix over each article's signal entities (surface forms).
# For every document the entity surface forms are joined into a single
# space-separated string, stored back on the document under
# 'signal-entities-text', and collected in `docs_entities` (same order as
# `docs`) for vectorization.
docs_entities = []
for doc_index, d in enumerate(docs):
    try:
        entities = d['_source']['signal-entities']
        d['_source']['signal-entities-text'] = " ".join(
            [entity['surface-form'] for entity in entities])
    except Exception as err:
        # Best effort: a document whose entities are missing or malformed
        # contributes an empty string instead of aborting the run. Report the
        # actual error and the offending document so the problem is traceable
        # (previously only the Exception class itself was printed).
        print("except doc %d: %r" % (doc_index, err))
        d['_source']['signal-entities-text'] = ""

    docs_entities.append(d['_source']['signal-entities-text'])

# Unigrams and bigrams over the joined surface forms, default tokenization.
entity_vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1,2))
X_entities = entity_vectorizer.fit_transform(docs_entities)
terms_entities = entity_vectorizer.get_feature_names()
print(X_entities.shape)

(100, 5082)


In [129]:
# Persist the artifacts built above, each under '<prefix>' + g_name:
# the vocabulary frame, both TF-IDF matrices and their term lists.
artifacts = [
    ('vocab_', vocab),
    ('X_content_', X_content),
    ('X_entities_', X_entities),
    ('terms_content_', terms_content),
    ('terms_entities_', terms_entities),
]
for prefix, artifact in artifacts:
    pickle_util.save_obj(artifact, prefix+g_name)