# Prepare Prediction Inputs

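Load the extracted paper text files, clean and tokenize each paper, and collect one label per paper from its file name.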

In [1]:
import warnings
import itertools

# Suppress every warning category for the remainder of the kernel session.
warnings.simplefilter('ignore')

In [2]:
import glob
import os
import re

import inflect
import numpy as np
import pandas as pd
from gensim.models import Word2Vec, KeyedVectors, Phrases
from gensim.parsing.preprocessing import strip_short,strip_punctuation,\
                                         strip_numeric, strip_multiple_whitespaces
from gensim.test.utils import get_tmpfile
from nltk import tokenize
from nltk.corpus import stopwords

In [3]:
def _clean_article_text(text):
    """Normalize the raw text of one article into space-separated lowercase words.

    The steps run in a fixed order; each one operates on the output of the
    previous one.

    Input: raw article text
    Output: cleaned text containing only words of three or more characters
    """
    text = text.lower()
    # Everything except ASCII letters, digits, parentheses, underscore and
    # hyphen becomes a space. This also removes newlines and sentence
    # punctuation, so an article normally reaches the tokenizer as one sentence.
    text = re.sub(r"[^a-zA-Z0-9()_-]", ' ', text)
    # Re-join words that were split with '- '.
    text = re.sub(r"- ", "", text)
    # Drop bracketed content. Square brackets were already turned into spaces
    # by the step above, so in practice only '(...)' spans are affected.
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)
    # Drop tokens that begin with 'x'.
    text = re.sub(r'\bx.*?\b', '', text)
    # Drop tokens that begin with 'y' followed by a non-vowel character.
    text = re.sub(r'\by[^aeiou].*?\b', '', text)
    # Drop tokens that contain a digit, plus the whitespace after them.
    text = re.sub(r'\w*\d\w*\s*', '', text)
    # Drop these standalone tokens, plus the whitespace after them.
    for token in ('max', 'min', 'sup', 'lim', 'exp', 'eqz'):
        text = re.sub(r'\b' + token + r'\b\s*', '', text)
    # A word immediately repeated is removed together with its repetition
    # (both occurrences are deleted, not collapsed into one).
    text = re.sub(r'\b(\w+)\s+\1\b\s*', '', text)
    # Cut everything after the last occurrence of 'reference'. Earlier
    # occurrences of that substring are removed as well, because the pieces
    # are joined back together without it.
    if 'reference' in text:
        text = ''.join(text.split('reference')[:-1])
    text = strip_multiple_whitespaces(text)
    text = strip_punctuation(text)
    # Discard words shorter than three characters.
    text = strip_short(text, minsize=3)
    return text


def docs_to_index(file_path):
    """Read every '*.txt' file in a directory and turn each one into a word list.

    Input: file_path, directory that contains the text files
    Output: tuple (word_lists, labels)
        word_lists: one list of tokens per readable file, with stop words
                    removed and the bigram phrase transform applied
        labels:     one string per readable file: the file name up to its
                    first '.', without its first five characters
                    (presumably a fixed file-name prefix; verify against the
                    naming of the input files)

    Both lists have the same length and the same order (files are processed
    in sorted path order). A file that cannot be read or decoded as UTF-8 is
    reported and left out of both lists.
    """
    articles = []
    labels = []
    # Sorted so that the output order does not depend on the file system.
    for path in sorted(glob.glob(os.path.join(file_path, '*.txt'))):
        try:
            with open(path, encoding='utf-8') as paper:
                text = paper.read()
        except (OSError, UnicodeDecodeError) as err:
            print('Skipping {}: {}'.format(path, err))
            continue
        articles.append(text)
        labels.append(os.path.basename(path).split('.')[0][5:])

    cleaned_articles = [_clean_article_text(article) for article in articles]

    stop_words = set(stopwords.words('english') + ['within', 'however'])
    # Tokens are gathered per article, not per sentence, so that an article
    # which cleans down to an empty string still produces an (empty) entry and
    # the result stays aligned with `labels`.
    papers_tokens = [
        [word
         for sentence in tokenize.sent_tokenize(article)
         for word in tokenize.word_tokenize(sentence)
         if word not in stop_words]
        for article in cleaned_articles
    ]
    # NOTE(review): the phrase model is fitted on the documents processed by
    # this call; confirm this matches how the training inputs were prepared.
    bigram_transformer = Phrases(papers_tokens)
    return list(bigram_transformer[papers_tokens]), labels

In [4]:
# Tokenize every paper in the test directory; labels are derived from the file names.
papers_word_lists, papers_labels = docs_to_index('./extracted_papers/test/')
# The word lists and the labels are written to separate .npy files further
# down, so position is the only link between them: fail early if they diverge.
assert len(papers_word_lists) == len(papers_labels), \
    'number of word lists does not match number of labels'

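Map each token to its 1-based position in the vocabulary, drop tokens that are not in the vocabulary, then truncate or zero-pad every paper to 5000 indices.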

In [5]:
# Location of the saved word vectors.
filename = "./word vectors.kv"
# Load with mmap='r' so the stored arrays are opened read-only from disk.
model = KeyedVectors.load(filename, mmap='r')
keyed_vectors = model.wv
# Copy the vectors into a regular in-memory array.
word_embedding = np.array(keyed_vectors.vectors)
# Vocabulary words in the iteration order of the vocab mapping.
# NOTE(review): confirm this order matches the row order of `word_embedding`
# wherever positions in `vocab` are used to look up embedding rows.
vocab = list(keyed_vectors.vocab)

In [6]:
# Build the word -> position lookup once instead of scanning the `vocab` list
# twice for every token. Positions start at 1 so that 0 remains available as
# the padding value. setdefault keeps the first position of a word, which is
# what list.index() returned.
word_to_index = {}
for position, word in enumerate(vocab, start=1):
    word_to_index.setdefault(word, position)

# Words that are not in the vocabulary are marked with -1.
papers_word_index_list_with_undefined = [
    [word_to_index.get(word, -1) for word in words]
    for words in papers_word_lists
]
# Drop the -1 markers and keep at most the first 5000 known words of each paper.
papers_word_index_list_cleaned = [
    [index for index in indices if index > 0][:5000]
    for indices in papers_word_index_list_with_undefined
]

In [7]:
# Right-pad every index list with zeros up to a length of 5000.
papers_word_index_list_padded = [
    indices + [0] * (5000 - len(indices))
    for indices in papers_word_index_list_cleaned
]
# Stack the padded lists into a single array.
papers_word_index_list = np.array(papers_word_index_list_padded)

In [8]:
# Write the padded word-index array to disk.
word_index_output_path = './word_index_list_test.npy'
np.save(word_index_output_path, papers_word_index_list)

In [9]:
# Write the labels to disk, in the same order as the word-index rows.
labels_array = np.array(papers_labels)
np.save('./labels_test.npy', labels_array)