In [1]:
import string 
import nltk
import numpy as np
import pandas as pd
import cython

from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer as stemmer
from nltk.corpus import stopwords
import nltk.stem

from sklearn.feature_extraction.text import CountVectorizer

from collections import Counter

nltk.download('wordnet')
nltk.download('stopwords')
stemmer = stemmer("english")


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ecoronado/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ecoronado/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def preprocess(doc):
    '''Function that lemmatizes words in abstract by verbs'''
    
    return [stemmer.stem(WordNetLemmatizer().lemmatize(w, pos='v')) 
            for w in doc.translate(str.maketrans('','', string.punctuation)).lower().split(' ')]


def rm_stopwords_and_short_words(words, st_words):
    '''Function removes stop words and those with length < 3'''
    results = []
    for i in words:
        if not i in st_words and len(i)  > 3:
            results.append(i)
    return results

def full_preprocess(doc, st_words):
    '''Performs word lemmatization and stopword removal'''
    return rm_stopwords_and_short_words(preprocess(doc), st_words)


def tf(docs, st_words):
    '''Term frequency matrix function, calculates the term frequencies of word from an text-document paired dictionary input. 
       The output is a term frequency table '''
    
    # generate counts per document
    counts = {k: Counter(full_preprocess(txt, st_words)) for k, txt in docs.items()}
    tf_df = pd.DataFrame.from_dict(counts).fillna(0).astype(int) # build pandas df, fill empty vals with 0s
    
    return(tf_df)


def token_filtering(tf_df):
    '''Filters out tokens that appear in fewer than 3 abstracts and tokens that appear in more than half the abstracts '''
    filtered_df = tf_df[(tf_df.sum(axis=1) > 3)]
    filtered_df = filtered_df[(filtered_df.astype(bool).sum(axis=1) / tf_df.shape[1] < 0.5)]
    
    return filtered_df
    
def get_docs(df):
    '''quickly build a dictionary based on filtered dataframe, get words w/ unique ids'''
    df.reset_index(inplace=True)
    filt_words = pd.DataFrame.to_dict(df.drop(columns='index'))
    
    return [[word for word, cnt in words.items() if cnt!=0] for dkeys, words in filt_words.items()]
    
    
def data_preproc(file_path):
    '''Data pre-processing function
       Input -> url to data in CSV format where each row is a document text'''
    
    df = pd.read_csv(file_path)
    
    in_docs = {k: str(txt[0]) for k,txt in enumerate(df.values)}
    
    st_words = stopwords.words('english')
    
    tf_df = tf(in_docs, st_words)
    
    filtered_df = token_filtering(tf_df)
    
    vocab = filtered_df.index.values
        
    docs = get_docs(filtered_df)
    
    return [vocab, docs]


# TimeIt profiling

In [3]:
%timeit -r2  data_preproc("tm_test_data.csv")

2.37 s ± 58.5 ms per loop (mean ± std. dev. of 2 runs, 1 loop each)


# Prun profiling

In [7]:
# Get top 10 percent of what takes longest
%prun -l 0.05 -s cumtime data_preproc("tm_test_data.csv")

 

         7090720 function calls (7089987 primitive calls) in 3.180 seconds

   Ordered by: cumulative time
   List reduced from 528 to 26 due to restriction <0.05>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    3.180    3.180 {built-in method builtins.exec}
        1    0.002    0.002    3.180    3.180 <string>:1(<module>)
        1    0.001    0.001    3.178    3.178 <ipython-input-6-3c5676dfe741>:47(data_preproc)
        1    0.003    0.003    3.148    3.148 <ipython-input-6-3c5676dfe741>:21(tf)
        1    0.002    0.002    2.645    2.645 <ipython-input-6-3c5676dfe741>:26(<dictcomp>)
      622    0.002    0.000    2.630    0.004 <ipython-input-6-3c5676dfe741>:16(full_preprocess)
      622    0.002    0.000    2.452    0.004 <ipython-input-6-3c5676dfe741>:1(preprocess)
      622    0.097    0.000    2.435    0.004 <ipython-input-6-3c5676dfe741>:4(<listcomp>)
    82463    0.912    0.000    1.770    0.000 snowball.py:1406(stem)
