In [2]:
import numpy as np
import pandas as pd
import cppimport

import nltk
import nltk.stem
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer as stemmer
from nltk.stem import WordNetLemmatizer

# Download the WordNet and stopwords corpora (used by WordNetLemmatizer and
# stopwords.words further down); a no-op when they are already up to date.
nltk.download('wordnet')
nltk.download('stopwords')

# Build the English Snowball stemmer through the nltk.stem module instead of
# calling the name `stemmer` itself: once `stemmer` is bound to an instance,
# calling it again without re-running the import would fail.
stemmer = nltk.stem.SnowballStemmer("english")

## Import C++ functions
preproc_cpp = cppimport.imp("hdp_preproc")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ecoronado/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ecoronado/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Functions

In [18]:
def preprocess(doc):
    '''Clean, lemmatize (as verbs) and stem the text of one document.

       Input -> indexable record whose first element (doc[0]) is the raw text
       Output -> list with one stemmed lemma per token returned by
                 preproc_cpp.text_cleanup, in the same order'''

    # One lemmatizer for the whole document instead of a new one per word
    lemmatizer = WordNetLemmatizer()

    return [stemmer.stem(lemmatizer.lemmatize(w, pos='v'))
            for w in preproc_cpp.text_cleanup(doc[0])]

def full_preprocess(doc, st_words):
    '''Lemmatize/stem one document, then drop its stopwords.

       doc is handed to preprocess() unchanged; the resulting tokens go to
       preproc_cpp.rm_stops_n_shorts together with st_words and the constant 3.'''
    tokens = preprocess(doc)
    return preproc_cpp.rm_stops_n_shorts(tokens, st_words, 3)


def tf(in_docs):
    '''Build the filtered term-frequency table and its matching vocabulary.

       Term frequencies are computed for every vocabulary word over a list of
       document texts, then filtered by preproc_cpp.filter_tf_cpp according to
       frequency criteria meant to keep shared yet low-occurrence words.

       Output -> (vocab, table): the vocabulary entries selected by the filter
                 and the filtered table from its second column onward'''

    full_vocab = preproc_cpp.generate_vocab(in_docs)
    counts = preproc_cpp.tf_cpp(in_docs, full_vocab)

    # The filtered table carries the surviving vocab indexes in its last column
    filtered = preproc_cpp.filter_tf_cpp(counts)
    kept_idx = filtered[:, -1].astype(int)
    kept_vocab = np.array(full_vocab)[kept_idx]

    # NOTE(review): this slice drops column 0 and keeps the trailing index
    # column; confirm that `[:, :-1]` was not the intended slice.
    return kept_vocab, filtered[:, 1:]



def get_docs(df):
    '''Return one sublist per document holding that document's unique word ids.

       Thin wrapper: df is passed unchanged to preproc_cpp.get_docs.'''

    word_ids_per_doc = preproc_cpp.get_docs(df)
    return word_ids_per_doc

    
def data_preproc(file_path):
    '''Run the whole pre-processing pipeline on a CSV of documents.

       Input -> path or URL of a CSV file where each row is a document text
       Output -> (vocab, docs): the filtered vocabulary from tf() and the
                 result of get_docs() on the filtered term-frequency table'''

    raw = pd.read_csv(file_path)
    english_stops = set(stopwords.words('english'))

    # Clean, lemmatize and filter every row before counting terms
    cleaned = []
    for row in raw.values:
        cleaned.append(full_preprocess(row, english_stops))

    vocab, tf_table = tf(cleaned)
    return vocab, get_docs(tf_table)


# TimeIt profiling

In [None]:
# Load the test corpus into a DataFrame for manual inspection
df = pd.read_csv("tm_test_data.csv")
    
# Map each row position to the value in that row's first column, coerced to str
in_docs = {k: str(txt[0]) for k,txt in enumerate(df.values)}

In [None]:
# Time the whole pipeline end to end; -r2 limits timeit to two repeats
%timeit -r2  data_preproc("tm_test_data.csv")

In [19]:
# Run the full pipeline once and keep the vocabulary and per-document output
vocab, docs = data_preproc("tm_test_data.csv")

[59,
 104,
 269,
 380,
 388,
 396,
 488,
 547,
 592,
 617,
 637,
 675,
 729,
 811,
 854,
 957,
 978,
 988,
 1045,
 1111,
 1275,
 1309,
 1321,
 1487,
 1501,
 1544]

# Prun profiling

In [None]:
# Show only the top 5% of profiled entries (-l 0.05), sorted by cumulative time (-s cumtime)
%prun -l 0.05 -s cumtime data_preproc("tm_test_data.csv")

In [None]:
# Load the compiled C++ helpers again (same call as in the setup cell at the top)
import cppimport


preproc_cpp = cppimport.imp("hdp_preproc")



In [4]:
# Reload the raw test data for the step-by-step walk through the pipeline
df = pd.read_csv("tm_test_data.csv")
    
#in_docs = {k: str(txt[0]) for k,txt in enumerate(df.values)}
# English stopword list as a set, passed to the stopword-removal step below
st_words = set(stopwords.words('english'))

In [None]:
# Lemmatize and stem the first document through the shared helper rather than
# an inline copy of its logic; preprocess() reads element 0 of the row itself.
test = preprocess(df.values[0])

In [None]:
# Apply the stopword-removal step to the tokens of the sample document
# NOTE(review): full_preprocess passes a third argument (3) to rm_stops_n_shorts;
# confirm the two-argument form used here is still accepted.
preproc_cpp.rm_stops_n_shorts(test, st_words)

In [5]:
# Pre-process every row of the DataFrame (same comprehension as inside data_preproc)
in_docs = [full_preprocess(d, st_words) for d in df.values]

In [6]:
# Build the vocabulary from the pre-processed documents
v = preproc_cpp.generate_vocab(in_docs)

In [7]:
tf = preproc_cpp.tf_cpp(in_docs, v)

In [8]:
tf.shape

(4714, 622)

In [9]:
filt_df = preproc_cpp.filter_tf_cpp(tf)

In [10]:
# Shape of the filtered table returned by filter_tf_cpp
filt_df.shape

(1555, 623)

In [11]:
# The vocabulary indexes sit in the last column of filt_df, so address it as -1
# instead of a hard-coded position that only fits this data set's column count.
vocab = np.array(v)[filt_df[:, -1].astype(int)]

In [17]:
# Per-document word ids computed from the filtered table
# NOTE(review): data_preproc passes filt_df[:, 1:] (the second return value of tf())
# to get_docs, while this cell passes the whole table; confirm which input is intended.
test = preproc_cpp.get_docs(filt_df)

In [16]:
# Look up the vocabulary entries for the word ids in the first sublist
vocab[test[0]]

array(['algorithm', 'analyz', 'avail', 'brief', 'code', 'comparison',
       'current', 'dataset', 'demonstr', 'dialog', 'discuss', 'featur',
       'final', 'high', 'homogen', 'interact', 'introduc', 'introduct',
       'larg', 'lisp', 'main', 'make', 'miss', 'modif', 'object', 'order',
       'orient', 'overview', 'paper', 'perform', 'plot', 'presenc',
       'present', 'produc', 'program', 'small', 'techniqu', 'userfriend',
       'well'], dtype='<U38')