In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
from gensim.models import Word2Vec
from gensim import utils

In [15]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [37]:
def process_doc(doc, top_start=8, short_block=60,
                strip_line_nos=True,
                reduce_times=True, reduce_dates=True,
                legal_abbr=True, min_sent_len=6):
    """
    Generic document processor.
    
    Args: doc -> str: A document expressed in a single
                 string
                 
          top_start -> int: The first carriage return-
                    delimited block of text to consider
                      
          short_block -> int: Length that blocks of text
                         in the document must be longer
                         than to be retained
                         
          min_sent_len -> int: Minimum sentence lenth in
                       characters
                              
          <flag> -> bool: A series of largely self-
                 explanitary Boolean flags
                 
    Returns -> str: Single string containing the processed text
    
    Notes: 1. A lot of this is probably getting done in subsequent
              calls to functions in the gensim library.
    
    TODO:
           DO: Review stop words and consider removing others that
               express negation.
               
           DO: Review how gensim processes text more closely. It
               is likely that the numbers I'm removing get cut
               during the creation of the embeddings.
               
           DO: Consider the effectiveness of the legal tokens.
               They frequently preserve trailing numbers.  This
               May be useful to preserve key citations, or it
               may confuse things.
               
           MAYBE: Tokenize dollar amounts as 'sumtoken'.
    """
    # Lowercase
    doc = doc.lower()
    
    # Break document into carriage return-delimited blocks
    blocks = doc.split('\n')[top_start:]
    long_blocks = [block for block in blocks if len(block) > short_block][:-1]
    text = ' '.join(long_blocks)
    
    # Remove a host of line and paragraph numbers.
    if strip_line_nos:
        text = re.sub("\[[0-9]\]", " ", text)
        text = re.sub("\[[0-9]\}", " ", text)
        text = re.sub("\[[0-9][0-9]\]", " ", text)
        text = re.sub(" [a-z]\) ", " ", text)
        text = re.sub(" [0-9]. ", " ", text)

    # Reduce times to a '[TIME]' token
    if reduce_times:
        text = re.sub("[0-9].[0-9][0-9]pm", "timetoken", text)
        text = re.sub("[0-9].[0-9][0-9]am", "timetoken", text)
        
    # Reduce dates to a '[DATE]' token
    if reduce_dates:
        text = re.sub(r'(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(t)?(tember)?|oct(ober)?|nov(ember)?|dec(ember)?)([\s]{1,3})?([0-9]{1,2})(.{1,3})?((,)|(.))?([\s]{1,3})?([0-9]{4})|([0-9]{1,2})(.{1,3})?([\s]{1,3})?(day)?([\s]{1,3})?(of)?([\s]{1,3})?(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(t)?(tember)?|oct(ober)?|nov(ember)?|dec(ember)?)((,)|(.))?(\s{1,3})?([0-9]{4})|(first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|twenty-first|twenty-second|twenty-third|twenty-fourth|twenty-fifth|twenty-sixth|twenty-seventh|twenty-eighth|twenty-ninth|thirtieth|thirty-first)([\s]{1,3})?(day)?([\s]{1,3})?(of)?([\s]{1,3})?(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(t)?(tember)?|oct(ober)?|nov(ember)?|dec(ember)?)((,)|(.))?(\s{1,3})?([0-9]{4})|(\b[0-9]{1,2}(\-|\/)[0-9]{1,2}(\-|\/)[0-9]{2,4}\b)|(\b[0-9]{2,4}(\-|\/)[0-9]{1,2}(\-|\/)[0-9]{1,2}\b)', 'datetoken', text)

    # Handle some specific legal abbreviations
    if legal_abbr:
        text = re.sub("Cap\.", "capabbr", text)
        text = re.sub("Crim\.", "crimabbr", text)
        text = re.sub("App\.", "appabbr", text)
        text = re.sub("No\.", "number", text)
    
    # Get rid of tabs
    text = re.sub("\t", " ", text)

    # Reduce extraneous whitespace to a single space
    text = ' '.join(text.split())

    # Remove short headers and the like
    sentences = nltk.tokenize.sent_tokenize(text)
    sentences = [("startsent " + x + " endsent") for x in sentences if len(x) > min_sent_len]

    return ' '.join(sentences)


def process_corpus(corpus_path, doc_col='contents',
                   exclude_test=True, test_path=None,
                   rm_stopwords=True):
    """
    Applies `process_doc` to a corpus contained in a csv.
    
    Args: corpus_path -> str
    
          doc_col -> str: The column in the csv that
                  contains the documents
                 
    Returns -> list: List of processed documents
    """
    data = pd.read_csv(corpus_path)
    
    if exclude_test:
        print(len(data))
        test_data = pd.read_csv(test_path)
        print(len(test_data))
        data = data[~data.docid.isin(test_data.docid)]
        
    corpus = data[doc_col]
    print('Corpus length - ' + str(len(corpus)))
    
    # Remove stop words using NLTK's English list
    if rm_stopwords:
        documents = []
        stop_words = set(stopwords.words('english'))
        # Unfortunately, NLTK's list includes 'no'.
        # Exclude this to preserve negation.
        stop_words.remove('no')
        for doc in corpus:
            processed_doc = utils.simple_preprocess(process_doc(doc))
            keep_words = []
            for word in processed_doc:
                if word not in stop_words:
                    keep_words.append(word)
            if len(doc) > 0:
                documents.append(keep_words)
    else:
        documents = [utils.simple_preprocess(process_doc(doc)) for doc in corpus]
            
    return documents


def build_model(corpus, dims=300, workers=8):
    WV = Word2Vec(sentences=corpus, size=dims, workers=8)
    vocab = WV.wv.index2word
    vocab_len = len(vocab)
    embeddings = np.array([WV.wv.get_vector(word) for word in vocab])
    text_to_token = {word: i for word, i in zip(vocab, range(vocab_len))}
    token_to_text = {i: word for word, i in zip(vocab, range(vocab_len))}
    model = {'embeddings': embeddings,
             'text_to_token': text_to_token,
             'token_to_text': token_to_text,
             'vocab': vocab,
             'vocab_len': len(vocab)}
    return model

    
def tokenize_doc(doc, model):
    tokens = []
    for word in doc:
        try:
            tokens.append(model['text_to_token'][word])
        except:
            pass
    return(tokens)


def embed_tokens(doc, model):
    tokenized_doc = tokenize_doc(doc, model)
    embeddings = []
    for token in tokenized_doc:
        embeddings.append(model['embeddings'][token])
    return np.stack(embeddings)
    
    
def embed_doc(doc, model):
    embeddings = embed_tokens(doc, model)
    return embeddings.mean(axis=0)


def embed_corpus(corpus, model):
    embedded_corpus = [embed_doc(doc, model) for doc in corpus if len(doc) > 0]
    return np.stack(embedded_corpus)

In [16]:
%%time
corpus = process_corpus(corpus_path='../data/ICAAD_FIJI.csv', test_path='../data/test.csv',
                        doc_col='contents')

13384
162
Corpus length - 13292
CPU times: user 3min 46s, sys: 780 ms, total: 3min 46s
Wall time: 3min 47s


In [38]:
foo = build_model(corpus, 300, 8)

In [40]:
foo['embeddings'].shape

(33300, 300)

In [42]:
foo['text_to_token']['bread']

4861

In [44]:
bread = foo['embeddings'][4861,:]

In [45]:
bread

array([-0.4425457 ,  0.16182797, -1.0246683 ,  0.631806  , -0.8348473 ,
       -0.46839136,  0.5780923 , -0.20096205, -0.46400702,  0.2968429 ,
       -0.4325057 ,  0.6767891 , -0.03729293,  0.9588748 , -0.0384476 ,
       -0.01007176,  0.33341667, -1.7993168 , -1.8348106 ,  0.5617358 ,
       -0.24403776, -0.01262853,  0.33668387, -0.08049419,  0.04921189,
       -1.7788202 , -0.73616195,  0.24110165,  0.81963897,  0.7799278 ,
       -0.66614276,  0.16142029,  0.38969666, -0.27847752,  0.26986897,
        0.33329672,  1.2911944 , -1.0098555 ,  0.4762035 ,  0.1025757 ,
        0.29415452, -0.64394087, -0.29753718,  0.04163301, -0.295754  ,
        0.1606689 , -0.6477798 ,  0.31304285, -0.25657564, -0.9719063 ,
       -0.31546354, -0.5486953 , -0.28487316, -0.06243191,  0.16549812,
        0.12494312, -0.03370988,  0.15589952,  0.02093038, -0.27809498,
        0.1289665 , -0.8324646 ,  0.3968013 , -0.16357818,  0.26121345,
       -0.68512166, -0.31802827, -0.28400838,  0.17055993,  0.36

In [46]:
from sklearn.metrics.pairwise import cosine_similarity

In [61]:
cosine_similarity(bread.reshape(1,-1), foo['embeddings']).argsort()

array([[1530, 2229, 5881, ..., 5068, 6335, 4861]])

In [65]:
foo['token_to_text'][5068]

'breadwinner'

In [53]:
foo.shape

AttributeError: 'dict' object has no attribute 'shape'

In [55]:
type(foo['embeddings'])

numpy.ndarray

In [60]:
bread.reshape(-1,1).shape

(300, 1)