In [1]:
import pandas as pd

# Relative path of the plain-text corpus file that the MAIN cell passes to
# corpus_prepare().
corpus_file = "../data/sample_corpus.txt"

In [2]:
#CORPUS PROCESSING:
def corpus_prepare(fdir, voc=None):
  """Read a whitespace-tokenised text file into one flat list of tokens.

  Each line of the file contributes, in order: one '' marker, the line's
  tokens, and one closing '' marker. Blank lines therefore contribute just
  the two markers.

  NOTE(review): the sentence-boundary marker and the out-of-vocabulary
  replacement are both the empty string '' in this source. It looks as if
  distinct special-token literals may have been lost -- confirm before
  relying on them being distinguishable.

  Args:
    fdir: path of the corpus file (opened in text mode with the platform's
      default encoding).
    voc: optional vocabulary dict keyed by token. When it is given and
      non-empty, every token that is not a key of `voc` is replaced by ''.
      `None` or an empty dict disables the replacement.

  Returns:
    A `(corpus, longest)` tuple. `corpus` is the flat token list described
    above; `longest` is the largest per-line value of
    `number_of_tokens + 1` (the opening marker is counted, the closing one
    is not), or 0 when the file has no lines.
  """
  corpus = []
  longest = 0
  print("Preparing corpus...")

  # The context manager closes the handle even if reading raises part-way.
  with open(fdir, "r") as f:
    for aline in f:
      # split() with no argument already ignores leading/trailing whitespace.
      tokens = aline.split()

      corpus.append('')
      if voc:
        # Align the corpus to the vocabulary: unknown tokens become ''.
        corpus.extend(tok if tok in voc else '' for tok in tokens)
      else:
        corpus.extend(tokens)
      corpus.append('')

      # Opening marker + tokens; the closing marker is not counted.
      longest = max(longest, len(tokens) + 1)

  return corpus, longest

In [3]:
#2) Vocabulary extractor (tokens and integers):
def get_vocab(corpus):
  """Build the token->index and index->token lookup tables for a corpus.

  Args:
    corpus: iterable of tokens (e.g. the list returned by corpus_prepare).

  Returns:
    A `(vocab, i2w)` tuple. `vocab` maps each distinct token to an integer;
    corpus tokens are numbered from 4 upwards in order of first appearance.
    `i2w` is the inverse mapping (integer -> token).
  """
  print("Extracting vocabulary from corpus...")

  # Reserved entries are registered before any corpus token.
  # NOTE(review): all four assignments below use the same '' key, so only the
  # last one (index 3) survives and indices 0-2 never appear in either table.
  # Presumably four distinct special tokens were intended -- confirm.
  w2i = {}
  w2i[''] = 0
  w2i[''] = 1
  w2i[''] = 2
  w2i[''] = 3

  # Hand out the next free index to every token not seen before.
  next_index = 4
  for token in corpus:
    if token in w2i:
      continue
    w2i[token] = next_index
    next_index += 1

  # Invert the table so indices can be mapped back to tokens.
  i2w = {index: token for token, index in w2i.items()}

  return w2i, i2w

In [4]:
#converts the full dataset to an integer vector on the basis of the vocabulary:
def vectorize(voc, fromfile, fdir=None, corpus=None):
  """Map a corpus to the list of its vocabulary indices.

  Args:
    voc: vocabulary dict mapping token -> integer index.
    fromfile: selects the input source. When true, the corpus is read from
      `fdir` via corpus_prepare(fdir, voc) and `corpus` is ignored; when
      false, the `corpus` argument is used directly.
    fdir: path of the dataset file (file mode only).
    corpus: iterable of tokens (corpus mode only). An empty corpus is valid
      and produces an empty vector.

  Returns:
    * file mode: `(vector, longest)`, where `longest` is the second value
      returned by corpus_prepare;
    * corpus mode: `vector` alone;
    * `-1` when the input required by the selected mode is missing
      (`fdir` empty/None in file mode, `corpus` None in corpus mode).

    Tokens that are not keys of `voc` are encoded as index 1.
    NOTE(review): index 1 is hard-coded as the out-of-vocabulary index here;
    verify that the vocabulary in use actually reserves index 1 for it.
  """
  longest = None

  if fromfile:
    # File mode: a path is mandatory.
    if not fdir:
      return -1
    corpus, longest = corpus_prepare(fdir, voc)
  elif corpus is None:
    # Corpus mode: only a missing corpus is an error. An empty one is not.
    return -1

  # Unknown tokens fall back to the out-of-vocabulary index 1.
  vectd = [voc.get(tok, 1) for tok in corpus]

  if fromfile:
    return vectd, longest
  return vectd

In [5]:
#MAIN:
# Load the training corpus; trn_sl is the longest-sentence value returned by
# corpus_prepare. Preview the first tokens.
trn_corpus, trn_sl = corpus_prepare(corpus_file)
print(trn_corpus[:10])
print(trn_sl)

# voc doubles as the word->index (w2i) table; i2w is its inverse.
voc, i2w = get_vocab(trn_corpus)

# Parallel lists of the vocabulary's tokens and indices, plus its size.
voc_w, voc_i = list(voc), list(voc.values())
vocl = len(voc_i)

print(list(voc.items())[:10])
print(vocl)
print(list(i2w.items())[:10])

Preparing corpus...
['', 'who', "'s", 'in', 'star', 'wars', 'episode', 'four', '', '']
23
Extracting vocabulary from corpus...
[('', 3), ('who', 4), ("'s", 5), ('in', 6), ('star', 7), ('wars', 8), ('episode', 9), ('four', 10), ('was', 11), ('Apollo', 12)]
1040
[(3, ''), (4, 'who'), (5, "'s"), (6, 'in'), (7, 'star'), (8, 'wars'), (9, 'episode'), (10, 'four'), (11, 'was'), (12, 'Apollo')]


In [6]:
#Vector converter from integers to tokens:
def deindexer(vect, i2w):
  """Translate a sequence of vocabulary indices back into tokens.

  Args:
    vect: iterable of integer indices.
    i2w: dict mapping index -> token.

  Returns:
    A new list with the token for each index, in the same order.

  Raises:
    KeyError: if an index in `vect` is not a key of `i2w`.
  """
  return [i2w[index] for index in vect]

In [9]:
# Encode the in-memory training corpus as vocabulary indices ...
trn_vect = vectorize(voc, False, corpus=trn_corpus)
print(trn_vect[:50])

# ... and decode it again so the two previews can be compared by eye.
trn_w = deindexer(trn_vect, i2w)
print(trn_w[:50])

#MAX SEQUENCE SIZE:
print(trn_sl)
print("MAX SENTENCE LENGTH: {}".format(trn_sl))

[3, 4, 5, 6, 7, 8, 9, 10, 3, 3, 4, 11, 6, 12, 13, 3, 3, 4, 11, 12, 13, 5, 14, 3, 3, 15, 16, 17, 18, 19, 14, 20, 21, 22, 23, 13, 3, 3, 24, 25, 26, 27, 28, 29, 18, 19, 14, 20, 21, 22]
['', 'who', "'s", 'in', 'star', 'wars', 'episode', 'four', '', '', 'who', 'was', 'in', 'Apollo', 'thirteen', '', '', 'who', 'was', 'Apollo', 'thirteen', "'s", 'cast', '', '', 'search', 'for', 'information', 'about', 'the', 'cast', 'and', 'crew', 'of', 'appolo', 'thirteen', '', '', 'i', 'would', 'like', 'to', 'know', 'more', 'about', 'the', 'cast', 'and', 'crew', 'of']
23
MAX SENTENCE LENGTH: 23
