In [10]:
import re
import numpy as np
import pickle

In [3]:
#acknowledgement
#https://github.com/nsanghi/HSE-NLP-Coursera/blob/master/week1/week1-MultilabelClassification-Solution.ipynb

In [4]:
def load_corpus(file_path):
    """
        Read a text corpus from disk.

        file_path: the text file path of the corpus
        return: list with one string per line of the file; each string is
                kept exactly as read (line endings are not stripped).
    """
    with open(file_path, 'r') as corpus_file:
        # Materialising the file object yields the same per-line strings
        # as iterating over it and appending them one at a time.
        return list(corpus_file)
            
def construct_vocab_counts(list_of_lines):
    """
        Tally how often each token occurs across the corpus.

        list_of_lines: list of tokenized lines (tokens separated by a single space)

        return: Counter mapping token -> number of occurrences
    """
    from collections import Counter
    words_counts = Counter()
    for line in list_of_lines:
        # Split on the literal space character only: consecutive spaces
        # produce empty-string tokens and a trailing newline stays attached
        # to the last token of the line.
        words_counts.update(line.split(' '))
    return words_counts

def construct_vocab_list(words_counts, min_thres=0):
    """
        Build the vocabulary from token counts.

        words_counts: counter of words (mapping word -> number of occurrences)
        min_thres: words occurring fewer than min_thres times will not be in the vocab
        return: list of the kept words, in the iteration order of words_counts,
                containing the '<UNK>' placeholder exactly once (it is appended
                at the end when it is not already among the kept words)
    """
    vocab_list = [word for word, count in words_counts.items() if count >= min_thres]
    # Test for the very token that gets appended. Checking for 'UNK' instead
    # would duplicate '<UNK>' when the corpus already contains it, and would
    # leave the placeholder out when the corpus contains a literal 'UNK'.
    if '<UNK>' not in vocab_list:
        vocab_list.append('<UNK>')
    return vocab_list

def construct_vocab_dict(vocab_list):
    """
        Derive the word <-> index lookup tables for a vocabulary.

        vocab_list: list of vocabularies
        return: (word_to_idx_dict, idx_to_word_dict); a word's index is its
                position in vocab_list (the last position wins if a word is
                listed more than once), and the second dict is the inverse
                of the first.
    """
    word_to_idx_dict = {}
    for position, token in enumerate(vocab_list):
        word_to_idx_dict[token] = position

    # Invert the finished mapping rather than re-enumerating the list, so the
    # two dictionaries always describe the same word/index pairs.
    idx_to_word_dict = dict(zip(word_to_idx_dict.values(), word_to_idx_dict.keys()))

    return word_to_idx_dict, idx_to_word_dict

            

In [8]:
def corpus_to_word_indices(list_of_lines, word_to_idx_dict, train=True):
    """
        Encode a corpus as one flat list of vocabulary indices.

        list_of_lines: list of tokenized lines (tokens separated by a single space)
        word_to_idx_dict: mapping word -> index
        train: when truthy, every token is looked up directly, so a token
               missing from the mapping raises KeyError; otherwise a missing
               token is replaced by the index stored under '<UNK>'
        return: list of indices for all tokens of all lines, in reading order
    """
    result = []
    for line in list_of_lines:
        for word in line.split(' '):
            if train or word in word_to_idx_dict:
                result.append(word_to_idx_dict[word])
            else:
                # '<UNK>' is fetched only when an unknown token actually
                # shows up, so a mapping without it still works as long as
                # every token is known.
                result.append(word_to_idx_dict['<UNK>'])

    return result

def write_to_file(result, file_name):
    """
        Pickle an object to disk.

        result: any picklable object (in this notebook, the list of word indices)
        file_name: path of the output file; it is created or overwritten
    """
    # The context manager closes the handle as soon as the dump finishes (or
    # fails), so the data is flushed deterministically instead of whenever
    # the file object happens to be garbage-collected.
    with open(file_name, 'wb') as out_file:
        pickle.dump(result, out_file)


In [18]:
# End-to-end run: read a corpus file, build its vocabulary, encode every token
# as a vocabulary index and pickle the resulting index list.
infile = '../group4.test.txt'
outfile = '../testing_step2.p'
# Mode label; it is compared with 'train' below to choose the lookup branch
# of corpus_to_word_indices. With 'test' the comparison is False, so the
# branch with the '<UNK>' fallback is used.
train='test'
print("loading file...")
lines = load_corpus(infile)
print("counting words...")
words_counts = construct_vocab_counts(lines)
print("producing vocabulary list...")
# min_thres=0 keeps every token that occurs in the file.
vocab_list = construct_vocab_list(words_counts, min_thres=0)
print("vocan list contains {} words".format(len(vocab_list)))
wi_dict, iw_dict = construct_vocab_dict(vocab_list)
# The vocabulary was built from these same lines with no frequency cut-off,
# so every token has an index and the '<UNK>' fallback is never taken here.
# NOTE(review): the indices therefore follow this file's own vocabulary;
# confirm that is intended if the output is used alongside an encoding of
# another corpus.
corpus_word_indices = corpus_to_word_indices(lines,wi_dict, train=='train')
print("showing the first ten indices:")
print(corpus_word_indices[:10])
# Round-trip check: map the first indices back to their tokens.
print("reconstruct token from the above indices:")
print(' '.join([iw_dict[i] for i in corpus_word_indices[:10]]))
print("writing corpus word indices to file...")
write_to_file(corpus_word_indices, outfile)
print("done")




loading file...
counting words...
producing vocabulary list...
vocan list contains 17863 words
showing the first ten indices:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
reconstruct token from the above indices:
<s> Kick @-@ Ass is a 2010 superhero black comedy
writing corpus word indices to file...
done


In [13]:
# Peek at the first 20 encoded indices; a repeated index means a repeated token.
corpus_word_indices[:20]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 13, 17, 18]

In [15]:
# Decode the first 40 indices back into tokens to eyeball the round trip.
' '.join([iw_dict[i] for i in corpus_word_indices[:40]])

'<s> Kick @-@ Ass is a 2010 superhero black comedy film based on the comic book of the same name by Mark Millar and John Romita , Jr . </s> <s> which was published by Marvel Comics . </s> <s>'