### Data source: Stanford IMDB Movie Review dataset (aclImdb)

In [14]:

import re
import os
def get_full_data(directory, max_reviews):
  """Read review files from *directory* and return them as lists of words.

  Each non-empty line of each file is stripped, has punctuation removed,
  is lower-cased, and is split on whitespace into a list of word tokens.

  Args:
    directory: path of a folder containing the review text files.
    max_reviews: maximum number of files to read from the folder; files
      are taken in the order returned by os.listdir().

  Returns:
    A list of word lists, one entry per non-empty line read.
  """
  remove_characters = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~"
  punc_table = {ord(char): None for char in remove_characters}

  reviews = []
  for ctr, txt_file in enumerate(os.listdir(directory), start=1):
    if ctr > max_reviews:
      break
    curr_file = os.path.join(directory, txt_file)
    # 'with' guarantees the handle is closed even if reading/decoding fails
    with open(curr_file, "r", encoding="utf8") as f:
      for line in f:
        line = line.strip()
        if not line:
          continue
        line = line.translate(punc_table).lower()
        # collapse runs of whitespace to single spaces; the separator must
        # be a space so the split below yields one token per word
        line = " ".join(line.split())
        reviews.append(line.split(" "))
  return reviews

In [12]:
def generate_file(reviews_lists, outpt_file, w_or_a, 
  vocab_dict, max_review_len, label_char):
  """Write reviews to *outpt_file* as fixed-length rows of integer indices.

  Each output line holds exactly max_review_len space-separated indices
  followed by the label: leading "0" values pad short reviews, a word
  missing from vocab_dict is written as "2", and any other word is
  written as its vocab_dict value plus 3. Reviews longer than
  max_review_len are skipped entirely.

  Args:
    reviews_lists: sequence of reviews, each a list of word strings.
    outpt_file: path of the file to write.
    w_or_a: file mode passed to open(), "w" to create or "a" to append.
    vocab_dict: maps a word to its integer value (its rank, per the caller).
    max_review_len: number of index fields per output line.
    label_char: label string written at the end of each line ('0' or '1').
  """
  offset = 3  # Keras offset: 'the' = 1 (most frequent) 1+3 = 4

  # 'with' closes the file even if a lookup or write raises part-way
  with open(outpt_file, w_or_a, encoding="utf8") as fout:
    for curr_review in reviews_lists:
      n_pad = max_review_len - len(curr_review)  # number of 0s to prepend
      if n_pad < 0:
        continue  # review too long: write nothing for it

      fields = ["0"] * n_pad
      for word in curr_review:
        # a word in test set might not have been in train set
        if word not in vocab_dict:
          fields.append("2")  # out-of-vocab index
        else:
          fields.append("%d" % (vocab_dict[word] + offset))
      fields.append(label_char)

      fout.write(" ".join(fields) + "\n")

In [10]:
def run_all():
  """Build the encoded IMDB train and test files, then print the train file.

  Loads up to 12500 positive and 12500 negative reviews for each of the
  train and test splits from data/aclImdb, builds a vocabulary from the
  training reviews only, and writes the encoded reviews to
  imdb_train_<N>w.txt and imdb_test_<N>w.txt in the current directory,
  where N is max_review_len.

  NOTE(review): make_vocab is not defined in this section; it is assumed
  to return a dict mapping word -> frequency rank -- confirm.
  """
  max_review_len = 500  # exact fixed length
  # derive both file names from the length so the file that is written is
  # the same file that is read back at the end
  train_file = "imdb_train_%dw.txt" % max_review_len
  test_file = "imdb_test_%dw.txt" % max_review_len

  print("Loading all reviews into memory ")
  pos_train_reviews = get_full_data("data/aclImdb/train/pos", 12500)
  neg_train_reviews = get_full_data("data/aclImdb/train/neg", 12500)
  pos_test_reviews = get_full_data("data/aclImdb/test/pos", 12500)
  neg_test_reviews = get_full_data("data/aclImdb/test/neg", 12500)

  print("Analyzing reviews and making vocabulary ")
  vocab_dict = make_vocab([pos_train_reviews, 
    neg_train_reviews])  # key = word, value = word rank
  v_len = len(vocab_dict)  
  # need this value, plus 4, for Embedding: 129888+4 = 129892
  print("Vocab size = %d -- use this +4 for "
    "Embedding nw " % v_len)

  print("Generating training file len %d words or less " \
    % max_review_len)

  # positives create the file ("w"), negatives are appended ("a")
  generate_file(pos_train_reviews, train_file, 
    "w", vocab_dict, max_review_len, "1")
  generate_file(neg_train_reviews, train_file,
    "a", vocab_dict, max_review_len, "0")

  print("Generating test file with len %d words or less " \
    % max_review_len)

  generate_file(pos_test_reviews, test_file, 
    "w", vocab_dict, max_review_len, "1")
  generate_file(neg_test_reviews, test_file, 
    "a", vocab_dict, max_review_len, "0")

  # inspect a generated file
  # vocab_dict was used indirectly (offset)

  print("Displaying encoded training file: \n")
  with open(train_file, "r", encoding="utf8") as f:
    for line in f: 
      print(line, end="")




In [None]:
# Run the preprocessing pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
  run_all()