# PREPROCESS TEXT

In [1]:
import os
import collections
from six.moves import cPickle
import numpy as np
print ("PACKAGES LOADED")

PACKAGES LOADED


# LOAD TEXT

In [2]:
data_dir    = "data/linux_kernel"
save_dir    = "data/linux_kernel"
input_file  = os.path.join(data_dir, "input.txt")
with open(input_file, "r") as f:
    data = f.read()
print ("TYPE OF DATA IS %s" % (type(data)))
print ("TEXT LOADED FROM [%s]" % (input_file))

TYPE OF DATA IS <class 'str'>
TEXT LOADED FROM [data/linux_kernel/input.txt]


# COUNT CHARACTERS

In [3]:
counter = collections.Counter(data)
print ("TYPE OF 'COUNTER.ITEM()' IS [%s] AND LENGTH IS [%d]" 
       % (type(counter.items()), len(counter.items()))) 
for i in range(5):
    print ("[%d]TH ELEMENT IS [%s]" % (i, counter.items()[i]))

TYPE OF 'COUNTER.ITEM()' IS [<class 'dict_items'>] AND LENGTH IS [98]


TypeError: 'dict_items' object does not support indexing

# SORT CHARACTER COUNTS

In [4]:
count_pairs = sorted(counter.items(), key=lambda x: -x[1]) 
print ("TYPE OF 'COUNT_PAIRS' IS [%s] AND LENGTH IS [%d]" 
       % (type(count_pairs), len(count_pairs))) 
for i in range(5):
    print ("[%d]TH ELEMENT IS [%s]" % (i, count_pairs[i]))

TYPE OF 'COUNT_PAIRS' IS [<class 'list'>] AND LENGTH IS [98]
[0]TH ELEMENT IS [(' ', 171222)]
[1]TH ELEMENT IS [('e', 113021)]
[2]TH ELEMENT IS [('t', 102154)]
[3]TH ELEMENT IS [('r', 76185)]
[4]TH ELEMENT IS [('i', 75486)]


# MAKE DICTIONARY
## : CHARS & VOCAB

In [5]:
chars, counts = zip(*count_pairs)
vocab = dict(zip(chars, range(len(chars))))
print ("TYPE OF 'CHARS' IS [%s] AND LENGTH IS [%d]" 
    % (type(chars), len(chars))) 
print ("TYPE OF 'COUNTS' IS [%s] AND LENGTH IS [%d]" 
    % (type(counts), len(counts))) 
print ("TYPE OF 'VOCAB' IS [%s] AND LENGTH IS [%d]" 
    % (type(vocab), len(vocab))) 

TYPE OF 'CHARS' IS [<class 'tuple'>] AND LENGTH IS [98]
TYPE OF 'COUNTS' IS [<class 'tuple'>] AND LENGTH IS [98]
TYPE OF 'VOCAB' IS [<class 'dict'>] AND LENGTH IS [98]


# USAGE OF 'CHARS' AND 'VOCAB

In [6]:
# CHARS: NUMBER -> CHAR
print ("==========CHARS USAGE==========")
for i in range(5):
    print (" [%d/%d]" % (i, 3)), # COMMA STOPS LINE CHANGE
    print ("CHARS[%d] IS [%s]" % (i, chars[i]))
# VOCAB: CHAR -> NUMBER
print ("==========VOCAB USAGE==========")
for i in range(5):
    print (" [%d/%d]" % (i, 3)), # <= This comma remove '\n'
    print ("VOCAB[%s] IS [%s]" % (chars[i], vocab[chars[i]]))

 [0/3]
CHARS[0] IS [ ]
 [1/3]
CHARS[1] IS [e]
 [2/3]
CHARS[2] IS [t]
 [3/3]
CHARS[3] IS [r]
 [4/3]
CHARS[4] IS [i]
 [0/3]
VOCAB[ ] IS [0]
 [1/3]
VOCAB[e] IS [1]
 [2/3]
VOCAB[t] IS [2]
 [3/3]
VOCAB[r] IS [3]
 [4/3]
VOCAB[i] IS [4]


# SAVE CHARS AND VOCAB

In [7]:
save_name = os.path.join(save_dir, 'chars_vocab.pkl')
with open(save_name, 'wb') as fsave:
    cPickle.dump((chars, vocab), fsave)
    print ("CHARS AND VOCAB ARE SAVED TO [%s]" % (save_name))

# LOAD 
load_name = os.path.join(save_dir, 'chars_vocab.pkl')
with open(load_name, 'rb') as fload:
    chars2, vocab2 = cPickle.load(fload)
    print ("CHARS AND VOCAB ARE LOADED FROM [%s]" % (load_name))
# CHARS: NUMBER -> CHAR
print ("==========CHARS2==========")
for i in range(5):
    print (" [%d/%d]" % (i, 3)), # COMMA STOPS LINE CHANGE
    print ("CHARS2[%d] IS [%s]" % (i, chars2[i]))
# VOCAB: CHAR -> NUMBER
print ("==========VOCAB2==========")
for i in range(5):
    print (" [%d/%d]" % (i, 3)), # <= This comma remove '\n'
    print ("VOCAB2[%s] IS [%s]" % (chars2[i], vocab2[chars2[i]]))

CHARS AND VOCAB ARE SAVED TO [data/linux_kernel/chars_vocab.pkl]
CHARS AND VOCAB ARE LOADED FROM [data/linux_kernel/chars_vocab.pkl]
 [0/3]
CHARS2[0] IS [ ]
 [1/3]
CHARS2[1] IS [e]
 [2/3]
CHARS2[2] IS [t]
 [3/3]
CHARS2[3] IS [r]
 [4/3]
CHARS2[4] IS [i]
 [0/3]
VOCAB2[ ] IS [0]
 [1/3]
VOCAB2[e] IS [1]
 [2/3]
VOCAB2[t] IS [2]
 [3/3]
VOCAB2[r] IS [3]
 [4/3]
VOCAB2[i] IS [4]


# DATA => CORPUS

In [8]:
corpus = np.array(list(map(vocab.get, data)))
print ("TYPE OF 'DATA' IS [%s] AND LENGTH IS [%d]" %(type(data), len(data)))
print ("TYPE OF 'CORPUS' IS [%s] AND LENGTH IS [%d]" %(type(corpus), len(data)))

print ("============DATA LOOKS LIKE============")
print (data[:50])
print ("============CORPUS LOOKS LIKE============")
print (corpus[:50])

TYPE OF 'DATA' IS [<class 'str'>] AND LENGTH IS [1708870]
TYPE OF 'CORPUS' IS [<class 'numpy.ndarray'>] AND LENGTH IS [1708870]
/*
 *  linux/kernel/acct.c
 *
 *  BSD Process Acco
[36 22  7  0 22  0  0 13  4  8 14 40 36 24  1  3  8  1 13 36  9 12 12  2 35
 12  7  0 22  7  0 22  0  0 64 42 56  0 50  3 11 12  1  5  5  0 48 12 12 11]


# SAVE

In [9]:
save_name = os.path.join(save_dir, 'corpus_data.pkl')
with open(save_name, 'wb') as fsave:
    cPickle.dump((corpus, data), fsave)
    print ("CORPUS AND DATA ARE SAVED TO [%s]" % (save_name)) 

CORPUS AND DATA ARE SAVED TO [data/linux_kernel/corpus_data.pkl]


In [None]:
test complete; Gopal