In [314]:
import string 
import nltk
import numpy as np
import pandas as pd
import cython

from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer as stemmer
from nltk.corpus import stopwords
import nltk.stem

from collections import Counter

nltk.download('wordnet')
nltk.download('stopwords')
stemmer = stemmer("english")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ecoronado/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ecoronado/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Table of Contents

This is broken down as
- Functions
- Timeit profiling
- Prun profiling
- Cython profiling

# Functions

In [296]:
def preprocess(doc):
    '''Function that lemmatizes words in abstract by verbs'''
    
    return [stemmer.stem(WordNetLemmatizer().lemmatize(w, pos='v')) 
            for w in doc.translate(str.maketrans('','', string.punctuation)).lower().split(' ')]


def rm_stopwords_and_short_words(words, st_words):
    '''Function removes stop words and those with length < 3'''
    results = []
    for i in words:
        if not i in st_words and len(i)  > 3:
            results.append(i)
    return results

def full_preprocess(doc, st_words):
    '''Performs word lemmatization and stopword removal'''
    return rm_stopwords_and_short_words(preprocess(doc), st_words)


def tf(docs, st_words):
    '''Term frequency matrix function, calculates the term frequencies of word from an text-document paired dictionary input. 
       The output is a term frequency table '''
    
    # generate counts per document
    counts = {k: Counter(full_preprocess(txt, st_words)) for k, txt in docs.items()}
    tf_df = pd.DataFrame.from_dict(counts).fillna(0).astype(int) # build pandas df, fill empty vals with 0s
    
    return(tf_df)


def token_filtering(tf_df):
    '''Filters out tokens that appear in fewer than 3 abstracts and tokens that appear in more than half the abstracts '''
    filtered_df = tf_df[(tf_df.sum(axis=1) > 3)]
    filtered_df = filtered_df[(filtered_df.astype(bool).sum(axis=1) / tf_df.shape[1] < 0.5)]
    
    return filtered_df
    
def get_docs(df):
    '''quickly build a dictionary based on filtered dataframe, get words w/ unique ids'''
    df.reset_index(inplace=True)
    filt_words = pd.DataFrame.to_dict(df.drop(columns='index'))
    
    return [[word for word, cnt in words.items() if cnt!=0] for dkeys, words in filt_words.items()]
    
    
def data_preproc(file_path):
    '''Data pre-processing function
       Input -> url to data in CSV format where each row is a document text'''
    
    df = pd.read_csv(file_path)
    
    in_docs = {k: str(txt[0]) for k,txt in enumerate(df.values)}
    
    st_words = stopwords.words('english')
    
    tf_df = tf(in_docs, st_words)
    
    filtered_df = token_filtering(tf_df)
    
    vocab = filtered_df.index.values
        
    docs = get_docs(filtered_df)
    
    return [vocab, docs]


# TimeIt profiling

In [305]:
%timeit -r2  data_preproc("tm_test_data.csv")

2.37 s ± 24.1 ms per loop (mean ± std. dev. of 2 runs, 1 loop each)


# Prun profiling

In [312]:
# Get top 10 percent of what takes longest
%prun -l 0.05 -s cumtime data_preproc("tm_test_data.csv")

 

         7161593 function calls (7160832 primitive calls) in 3.516 seconds

   Ordered by: cumulative time
   List reduced from 646 to 32 due to restriction <0.05>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    3.554    3.554 {built-in method builtins.exec}
        1    0.004    0.004    3.554    3.554 <string>:1(<module>)
        1    0.017    0.017    3.550    3.550 <ipython-input-296-778cce2bb34d>:47(data_preproc)
        1    0.003    0.003    3.169    3.169 <ipython-input-296-778cce2bb34d>:21(tf)
        1    0.002    0.002    2.667    2.667 <ipython-input-296-778cce2bb34d>:26(<dictcomp>)
      622    0.002    0.000    2.652    0.004 <ipython-input-296-778cce2bb34d>:16(full_preprocess)
      622    0.002    0.000    2.472    0.004 <ipython-input-296-778cce2bb34d>:1(preprocess)
      622    0.101    0.000    2.454    0.004 <ipython-input-296-778cce2bb34d>:4(<listcomp>)
    82463    0.928    0.000    1.782    0.000 snowball.py

In [310]:
%prun -l preprocess -s cumtime data_preproc("tm_test_data.csv")

 

         7161593 function calls (7160832 primitive calls) in 3.496 seconds

   Ordered by: cumulative time
   List reduced from 646 to 3 due to restriction <'preprocess'>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      622    0.002    0.000    2.649    0.004 <ipython-input-296-778cce2bb34d>:16(full_preprocess)
      622    0.002    0.000    2.469    0.004 <ipython-input-296-778cce2bb34d>:1(preprocess)
        1    0.000    0.000    0.000    0.000 managers.py:1970(_preprocess_slice_or_indexer)

# Cython profiling


In [316]:
%load_ext cython

In [321]:
%%cython -a 

import string 
import nltk
import numpy as np
import pandas as pd
import cython

from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer as stemmer
from nltk.corpus import stopwords
import nltk.stem

from collections import Counter

nltk.download('wordnet')
nltk.download('stopwords')
stemmer = stemmer("english")


def preprocess(doc):
    '''Function that lemmatizes words in abstract by verbs'''
    
    return [stemmer.stem(WordNetLemmatizer().lemmatize(w, pos='v')) 
            for w in doc.translate(str.maketrans('','', string.punctuation)).lower().split(' ')]


def rm_stopwords_and_short_words(words, st_words):
    '''Function removes stop words and those with length < 3'''
    results = []
    for i in words:
        if not i in st_words and len(i)  > 3:
            results.append(i)
    return results

def full_preprocess(doc, st_words):
    '''Performs word lemmatization and stopword removal'''
    return rm_stopwords_and_short_words(preprocess(doc), st_words)
    


df = pd.read_csv("tm_test_data.csv")

in_docs = {k: str(txt[0]) for k,txt in enumerate(df.values)}

st_words = stopwords.words('english')

counts = {k: Counter(full_preprocess(txt, st_words)) for k, txt in in_docs.items()}

tf_df = pd.DataFrame.from_dict(counts).fillna(0).astype(int) 

filtered_df = tf_df[(tf_df.sum(axis=1) > 3)]

filtered_df = filtered_df[(filtered_df.astype(bool).sum(axis=1) / tf_df.shape[1] < 0.5)]

vocab = filtered_df.index.values

df.reset_index(inplace=True)
filt_words = pd.DataFrame.to_dict(df.drop(columns='index'))
    
docs = [[word for word, cnt in words.items() if cnt!=0] for dkeys, words in filt_words.items()]




[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ecoronado/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ecoronado/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
