In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import sys
import os
import math
import spacy
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

sys.path.append("../../python/")

In [2]:
from workbench.data.dataframe import TextDataFrame
from workbench.tf.data.text_dataset import *
from workbench.data.nlp.spacy import *

# Data Path

In [3]:
# Raw train/test splits, read from the local `input/` directory.
original_train_data, original_test_data = (
    pd.read_csv("input/train.csv"),
    pd.read_csv("input/test.csv"),
)

# Tokenize with spaCy

In [4]:
# Tokenization only runs when no cached CSV is on disk yet; otherwise the cell is a no-op.
# NOTE(review): to_csv() also writes the index, which is presumably where the
# "Unnamed: 0" column seen after reloading comes from — confirm before changing.
train_tokenized_csv = "input/train_tokenized.csv"
if not os.path.exists(train_tokenized_csv):
    nlp_processed_train_data = tokenize(original_train_data, "text")
    nlp_processed_train_data.to_csv(train_tokenized_csv)

In [5]:
# Same caching rule for the test split: tokenize only when the cached CSV is missing.
test_tokenized_csv = "input/test_tokenized.csv"
if not os.path.exists(test_tokenized_csv):
    nlp_processed_test_data = tokenize(original_test_data, "text")
    nlp_processed_test_data.to_csv(test_tokenized_csv)

# Prepare our TextDataFrame 

In [6]:
# Name of the DataFrame column used as model text input (TextDataFrame, TF-IDF and vocabulary cells below).
text_col = "nlp_processed"

In [7]:
# Wrap the cached tokenized CSVs; `author` is the label column.
dataset_df = TextDataFrame(
    train_df_path="input/train_tokenized.csv",
    test_df_path="input/test_tokenized.csv",
    text_col=text_col,
    category_col='author',
)

Fitting LabelEncoder and LabelBinarizer...
Done!
Splitting the data set(stratified sampling)...
Done!


In [8]:
# Pull the train and test frames out of the TextDataFrame wrapper.
train_df, test_df = dataset_df.get_train_df(), dataset_df.get_test_df()

In [9]:
# Sanity check: number of non-null values per column of the training frame.
train_df.count()

Unnamed: 0       19579
id               19579
text             19579
author           19579
nlp_processed    19579
dtype: int64

In [10]:
%time
from sklearn.decomposition import TruncatedSVD
import pickle

if os.path.exists("train_svd.pickle"):
    train_svd = pickle.load(open("train_svd.pickle", "rb"))
    test_svd =  pickle.load(open("test_svd.pickle", "rb"))
    eval_svd =  pickle.load(open("eval_svd.pickle", "rb"))
    print("Done!!! Reading pickled file...")
else:    
    print("Fit transform the tfidf vectorizer ###")
    tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
    full_tfidf = tfidf_vec.fit_transform(train_df[text_col].values.tolist() + test_df[text_col].values.tolist())

    train_tfidf = tfidf_vec.transform(dataset_df._get_train_data())
    test_tfidf = tfidf_vec.transform(dataset_df._get_test_data())
    eval_tfidf = tfidf_vec.transform(dataset_df._get_val_data())

    n_comp = 300
    svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
    svd_obj.fit(full_tfidf)

    train_svd = svd_obj.transform(train_tfidf)
    test_svd = svd_obj.transform(test_tfidf)
    eval_svd = svd_obj.transform(eval_tfidf)
    pickle.dump(train_svd, open("train_svd.pickle", "wb"))
    pickle.dump(test_svd, open("test_svd.pickle", "wb"))
    pickle.dump(eval_svd, open("eval_svd.pickle", "wb"))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 4.53 µs
Done!!! Reading pickled file...


In [11]:
# from sklearn.feature_selection import SelectKBest
# selector = SelectKBest(k=300)
# selector_model = selector.fit(full_tfidf, )

# train_tfidf = selector_model.transform(train_tfidf)
# test_tfidf = selector_model.transform(test_tfidf)
# eval_tfidf = selector_model.transform(eval_tfidf)


# TensorFlow environment setup

In [12]:
# Emit DEBUG-level TensorFlow log output
tf.logging.set_verbosity(tf.logging.DEBUG)

# Handle to the parsed command-line flags
FLAGS = tf.app.flags.FLAGS

# Default output/data directories, registered as string flags in this order.
# NOTE(review): model_dir defaults to './fast_text' although a BiLstm model is built
# further down — confirm different models are not meant to share one checkpoint directory.
for _flag_name, _flag_default, _flag_doc in (
    ('model_dir', './fast_text', 'Output directory for model and training stats.'),
    ('data_dir', './spooky_data', 'Directory to download the data to.'),
):
    tf.app.flags.DEFINE_string(
        flag_name=_flag_name,
        default_value=_flag_default,
        docstring=_flag_doc)

BATCH_SIZE = 64


In [13]:
# Document-length cap passed to save_vocab() and to the model as max_doc_length.
MAX_DOCUMENT_LENGTH = 150
# Disabled check: how many documents exceed MAX_DOCUMENT_LENGTH and would therefore be left out?
# train_df[train_df["length"] > MAX_DOCUMENT_LENGTH].count()[0]

# Generate Vocab 

In [14]:
%time
VOCAB_SIZE = save_vocab(dataset_df.train_df[text_col].as_matrix(), 
                       outfilename='horror_vocab.tsv', 
                       MAX_DOCUMENT_LENGTH=MAX_DOCUMENT_LENGTH) #

VOCAB_SIZE

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 16.2 µs



25140 words into horror_vocab.tsv


25140

# Prepare Input Graphs for TF Experiment

In [15]:
# Training input pipeline: text, SVD features and one-hot labels, batched under the 'train-data' scope.
train_input_fn, train_input_hook = setup_input_graph2(
    dataset_df._get_train_data(),
    train_svd,
    dataset_df._get_one_hot_train_label(),
    batch_size=BATCH_SIZE,
    scope='train-data',
)

Size of train data: 2.389MB
Labels and their document counts based on author
EAP    6320
HPL    4508
MWS    4835
Name: author, dtype: int64
INFO:tensorflow:text_features.shape: (15663,)
INFO:tensorflow:numeric_features.shape: (15663, 300)
INFO:tensorflow:labels.shape: (15663, 3)


In [16]:
# Validation input pipeline: same layout as the training one, under the 'eval-data' scope.
eval_input_fn, eval_input_hook = setup_input_graph2(
    dataset_df._get_val_data(),
    eval_svd,
    dataset_df._get_one_hot_val_label(),
    batch_size=BATCH_SIZE,
    scope='eval-data',
)

Size of validation data: 0.601MB
Labels and their document counts based on author
EAP    1580
HPL    1127
MWS    1209
Name: author, dtype: int64
INFO:tensorflow:text_features.shape: (3916,)
INFO:tensorflow:numeric_features.shape: (3916, 300)
INFO:tensorflow:labels.shape: (3916, 3)


In [17]:
# from workbench.tf.text_classification.multi_class_cnn_rnn import TextCNNRNN
# model = TextCNNRNN("horror_vocab.tsv", 
#                  VOCAB_SIZE, 
#                  train_input_fn, 
#                  train_input_hook, 
#                  eval_input_fn, 
#                  eval_input_hook,
#                    train_steps=4500,
#                    min_eval_frequency=100,
#                   max_doc_length=MAX_DOCUMENT_LENGTH)

In [18]:
from workbench.tf.text_classification.bilstm import BiLstm

# BiLstm text classifier wired to the vocabulary file and the train/eval input pipelines.
model = BiLstm(
    "horror_vocab.tsv",
    VOCAB_SIZE,
    train_input_fn,
    train_input_hook,
    eval_input_fn,
    eval_input_hook,
    train_steps=7000,
    min_eval_frequency=300,
    max_doc_length=MAX_DOCUMENT_LENGTH,
)

In [19]:
# from workbench.tf.text_classification.fast_text import FastText
# tf.reset_default_graph()

# model = FastText("horror_vocab.tsv", 
#                  VOCAB_SIZE, 
#                  train_input_fn, 
#                  train_input_hook, 
#                  eval_input_fn, 
#                  eval_input_hook,
#                    train_steps=2000,
#                    min_eval_frequency=50,
#                   max_doc_length=MAX_DOCUMENT_LENGTH)

In [20]:
%time
# try:
model.run()
# except:
#     print('Trinaing done!')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 11.7 µs
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fe91977f860>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 128, '_save_checkpoints_secs': None, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': 300, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': './fast_text'}
Instructions for updating:
Monitors are deprecated. Please use tf.train.SessionRunHook.
DEBUG:tensorflow:text_features -----> Tensor("train-data/IteratorGetNext:1", shape=(?,), dtype=string, device=/device:CPU:0)
DEBUG:tensorflow:numeric_features -----> Tensor("train-data/IteratorGetNext:0", shape=(

INFO:tensorflow:Evaluation [106/300]
INFO:tensorflow:Evaluation [107/300]
INFO:tensorflow:Evaluation [108/300]
INFO:tensorflow:Evaluation [109/300]
INFO:tensorflow:Evaluation [110/300]
INFO:tensorflow:Evaluation [111/300]
INFO:tensorflow:Evaluation [112/300]
INFO:tensorflow:Evaluation [113/300]
INFO:tensorflow:Evaluation [114/300]
INFO:tensorflow:Evaluation [115/300]
INFO:tensorflow:Evaluation [116/300]
INFO:tensorflow:Evaluation [117/300]
INFO:tensorflow:Evaluation [118/300]
INFO:tensorflow:Evaluation [119/300]
INFO:tensorflow:Evaluation [120/300]
INFO:tensorflow:Evaluation [121/300]
INFO:tensorflow:Evaluation [122/300]
INFO:tensorflow:Evaluation [123/300]
INFO:tensorflow:Evaluation [124/300]
INFO:tensorflow:Evaluation [125/300]
INFO:tensorflow:Evaluation [126/300]
INFO:tensorflow:Evaluation [127/300]
INFO:tensorflow:Evaluation [128/300]
INFO:tensorflow:Evaluation [129/300]
INFO:tensorflow:Evaluation [130/300]
INFO:tensorflow:Evaluation [131/300]
INFO:tensorflow:Evaluation [132/300]
I

INFO:tensorflow:table info: <tensorflow.python.ops.lookup_ops.IdTableWithHashBuckets object at 0x7fe9150b3780>
DEBUG:tensorflow:words_embed=Tensor("embed-layer/EmbedSequence/embedding_lookup:0", shape=(?, ?, 64), dtype=float32, device=/device:CPU:0)
INFO:tensorflow:encoding: ------> Tensor("lstm-layer/concat:0", shape=(?, 128), dtype=float32)
INFO:tensorflow:logits: ------> Tensor("logits-layer/dense/BiasAdd:0", shape=(?, 3), dtype=float32)
INFO:tensorflow:predicted_class: ------> Tensor("output-layer/class_output:0", shape=(?,), dtype=int64)
INFO:tensorflow:predicted_probabilities: ------> Tensor("output-layer/softmax_output:0", shape=(?, 3), dtype=float32)
INFO:tensorflow:Summary name lstm-layer/concat:0 is illegal; using lstm-layer/concat_0 instead.
INFO:tensorflow:Summary name output-layer/softmax_output:0 is illegal; using output-layer/softmax_output_0 instead.
INFO:tensorflow:Starting evaluation at 2017-11-08-06:12:40
INFO:tensorflow:Restoring parameters from ./fast_text/model.ck

INFO:tensorflow:Evaluation [198/300]
INFO:tensorflow:Evaluation [199/300]
INFO:tensorflow:Evaluation [200/300]
INFO:tensorflow:Evaluation [201/300]
INFO:tensorflow:Evaluation [202/300]
INFO:tensorflow:Evaluation [203/300]
INFO:tensorflow:Evaluation [204/300]
INFO:tensorflow:Evaluation [205/300]
INFO:tensorflow:Evaluation [206/300]
INFO:tensorflow:Evaluation [207/300]
INFO:tensorflow:Evaluation [208/300]
INFO:tensorflow:Evaluation [209/300]
INFO:tensorflow:Evaluation [210/300]
INFO:tensorflow:Evaluation [211/300]
INFO:tensorflow:Evaluation [212/300]
INFO:tensorflow:Evaluation [213/300]
INFO:tensorflow:Evaluation [214/300]
INFO:tensorflow:Evaluation [215/300]
INFO:tensorflow:Evaluation [216/300]
INFO:tensorflow:Evaluation [217/300]
INFO:tensorflow:Evaluation [218/300]
INFO:tensorflow:Evaluation [219/300]
INFO:tensorflow:Evaluation [220/300]
INFO:tensorflow:Evaluation [221/300]
INFO:tensorflow:Evaluation [222/300]
INFO:tensorflow:Evaluation [223/300]
INFO:tensorflow:Evaluation [224/300]
I

INFO:tensorflow:Evaluation [65/300]
INFO:tensorflow:Evaluation [66/300]
INFO:tensorflow:Evaluation [67/300]
INFO:tensorflow:Evaluation [68/300]
INFO:tensorflow:Evaluation [69/300]
INFO:tensorflow:Evaluation [70/300]
INFO:tensorflow:Evaluation [71/300]
INFO:tensorflow:Evaluation [72/300]
INFO:tensorflow:Evaluation [73/300]
INFO:tensorflow:Evaluation [74/300]
INFO:tensorflow:Evaluation [75/300]
INFO:tensorflow:Evaluation [76/300]
INFO:tensorflow:Evaluation [77/300]
INFO:tensorflow:Evaluation [78/300]
INFO:tensorflow:Evaluation [79/300]
INFO:tensorflow:Evaluation [80/300]
INFO:tensorflow:Evaluation [81/300]
INFO:tensorflow:Evaluation [82/300]
INFO:tensorflow:Evaluation [83/300]
INFO:tensorflow:Evaluation [84/300]
INFO:tensorflow:Evaluation [85/300]
INFO:tensorflow:Evaluation [86/300]
INFO:tensorflow:Evaluation [87/300]
INFO:tensorflow:Evaluation [88/300]
INFO:tensorflow:Evaluation [89/300]
INFO:tensorflow:Evaluation [90/300]
INFO:tensorflow:Evaluation [91/300]
INFO:tensorflow:Evaluation [

INFO:tensorflow:Evaluation [288/300]
INFO:tensorflow:Evaluation [289/300]
INFO:tensorflow:Evaluation [290/300]
INFO:tensorflow:Evaluation [291/300]
INFO:tensorflow:Evaluation [292/300]
INFO:tensorflow:Evaluation [293/300]
INFO:tensorflow:Evaluation [294/300]
INFO:tensorflow:Evaluation [295/300]
INFO:tensorflow:Evaluation [296/300]
INFO:tensorflow:Evaluation [297/300]
INFO:tensorflow:Evaluation [298/300]
INFO:tensorflow:Evaluation [299/300]
INFO:tensorflow:Evaluation [300/300]
INFO:tensorflow:Finished evaluation at 2017-11-08-06:15:52
INFO:tensorflow:Saving dict for global step 601: Accuracy = 0.812865, Precision = 0.901005, Recall = 0.840901, global_step = 601, loss = 0.436159
INFO:tensorflow:Validation (step 900): Accuracy = 0.812865, Precision = 0.901005, Recall = 0.840901, loss = 0.436159, global_step = 601
INFO:tensorflow:Saving checkpoints for 901 into ./fast_text/model.ckpt.
INFO:tensorflow:global_step/sec: 1.22534
INFO:tensorflow:loss = 0.108577, step = 901 (81.609 sec)
INFO:ten

INFO:tensorflow:Evaluation [156/300]
INFO:tensorflow:Evaluation [157/300]
INFO:tensorflow:Evaluation [158/300]
INFO:tensorflow:Evaluation [159/300]
INFO:tensorflow:Evaluation [160/300]
INFO:tensorflow:Evaluation [161/300]
INFO:tensorflow:Evaluation [162/300]
INFO:tensorflow:Evaluation [163/300]
INFO:tensorflow:Evaluation [164/300]
INFO:tensorflow:Evaluation [165/300]
INFO:tensorflow:Evaluation [166/300]
INFO:tensorflow:Evaluation [167/300]
INFO:tensorflow:Evaluation [168/300]
INFO:tensorflow:Evaluation [169/300]
INFO:tensorflow:Evaluation [170/300]
INFO:tensorflow:Evaluation [171/300]
INFO:tensorflow:Evaluation [172/300]
INFO:tensorflow:Evaluation [173/300]
INFO:tensorflow:Evaluation [174/300]
INFO:tensorflow:Evaluation [175/300]
INFO:tensorflow:Evaluation [176/300]
INFO:tensorflow:Evaluation [177/300]
INFO:tensorflow:Evaluation [178/300]
INFO:tensorflow:Evaluation [179/300]
INFO:tensorflow:Evaluation [180/300]
INFO:tensorflow:Evaluation [181/300]
INFO:tensorflow:Evaluation [182/300]
I

INFO:tensorflow:Evaluation [22/300]
INFO:tensorflow:Evaluation [23/300]
INFO:tensorflow:Evaluation [24/300]
INFO:tensorflow:Evaluation [25/300]
INFO:tensorflow:Evaluation [26/300]
INFO:tensorflow:Evaluation [27/300]
INFO:tensorflow:Evaluation [28/300]
INFO:tensorflow:Evaluation [29/300]
INFO:tensorflow:Evaluation [30/300]
INFO:tensorflow:Evaluation [31/300]
INFO:tensorflow:Evaluation [32/300]
INFO:tensorflow:Evaluation [33/300]
INFO:tensorflow:Evaluation [34/300]
INFO:tensorflow:Evaluation [35/300]
INFO:tensorflow:Evaluation [36/300]
INFO:tensorflow:Evaluation [37/300]
INFO:tensorflow:Evaluation [38/300]
INFO:tensorflow:Evaluation [39/300]
INFO:tensorflow:Evaluation [40/300]
INFO:tensorflow:Evaluation [41/300]
INFO:tensorflow:Evaluation [42/300]
INFO:tensorflow:Evaluation [43/300]
INFO:tensorflow:Evaluation [44/300]
INFO:tensorflow:Evaluation [45/300]
INFO:tensorflow:Evaluation [46/300]
INFO:tensorflow:Evaluation [47/300]
INFO:tensorflow:Evaluation [48/300]
INFO:tensorflow:Evaluation [

INFO:tensorflow:Evaluation [246/300]
INFO:tensorflow:Evaluation [247/300]
INFO:tensorflow:Evaluation [248/300]
INFO:tensorflow:Evaluation [249/300]
INFO:tensorflow:Evaluation [250/300]
INFO:tensorflow:Evaluation [251/300]
INFO:tensorflow:Evaluation [252/300]
INFO:tensorflow:Evaluation [253/300]
INFO:tensorflow:Evaluation [254/300]
INFO:tensorflow:Evaluation [255/300]
INFO:tensorflow:Evaluation [256/300]
INFO:tensorflow:Evaluation [257/300]
INFO:tensorflow:Evaluation [258/300]
INFO:tensorflow:Evaluation [259/300]
INFO:tensorflow:Evaluation [260/300]
INFO:tensorflow:Evaluation [261/300]
INFO:tensorflow:Evaluation [262/300]
INFO:tensorflow:Evaluation [263/300]
INFO:tensorflow:Evaluation [264/300]
INFO:tensorflow:Evaluation [265/300]
INFO:tensorflow:Evaluation [266/300]
INFO:tensorflow:Evaluation [267/300]
INFO:tensorflow:Evaluation [268/300]
INFO:tensorflow:Evaluation [269/300]
INFO:tensorflow:Evaluation [270/300]
INFO:tensorflow:Evaluation [271/300]
INFO:tensorflow:Evaluation [272/300]
I

INFO:tensorflow:Evaluation [114/300]
INFO:tensorflow:Evaluation [115/300]
INFO:tensorflow:Evaluation [116/300]
INFO:tensorflow:Evaluation [117/300]
INFO:tensorflow:Evaluation [118/300]
INFO:tensorflow:Evaluation [119/300]
INFO:tensorflow:Evaluation [120/300]
INFO:tensorflow:Evaluation [121/300]
INFO:tensorflow:Evaluation [122/300]
INFO:tensorflow:Evaluation [123/300]
INFO:tensorflow:Evaluation [124/300]
INFO:tensorflow:Evaluation [125/300]
INFO:tensorflow:Evaluation [126/300]
INFO:tensorflow:Evaluation [127/300]
INFO:tensorflow:Evaluation [128/300]
INFO:tensorflow:Evaluation [129/300]
INFO:tensorflow:Evaluation [130/300]
INFO:tensorflow:Evaluation [131/300]
INFO:tensorflow:Evaluation [132/300]
INFO:tensorflow:Evaluation [133/300]
INFO:tensorflow:Evaluation [134/300]
INFO:tensorflow:Evaluation [135/300]
INFO:tensorflow:Evaluation [136/300]
INFO:tensorflow:Evaluation [137/300]
INFO:tensorflow:Evaluation [138/300]
INFO:tensorflow:Evaluation [139/300]
INFO:tensorflow:Evaluation [140/300]
I

INFO:tensorflow:encoding: ------> Tensor("lstm-layer/concat:0", shape=(?, 128), dtype=float32)
INFO:tensorflow:logits: ------> Tensor("logits-layer/dense/BiasAdd:0", shape=(?, 3), dtype=float32)
INFO:tensorflow:predicted_class: ------> Tensor("output-layer/class_output:0", shape=(?,), dtype=int64)
INFO:tensorflow:predicted_probabilities: ------> Tensor("output-layer/softmax_output:0", shape=(?, 3), dtype=float32)
INFO:tensorflow:Summary name lstm-layer/concat:0 is illegal; using lstm-layer/concat_0 instead.
INFO:tensorflow:Summary name output-layer/softmax_output:0 is illegal; using output-layer/softmax_output_0 instead.
INFO:tensorflow:Starting evaluation at 2017-11-08-06:24:23
INFO:tensorflow:Restoring parameters from ./fast_text/model.ckpt-1801
INFO:tensorflow:
INFO:tensorflow:Evaluation [1/300]
INFO:tensorflow:Evaluation [2/300]
INFO:tensorflow:Evaluation [3/300]
INFO:tensorflow:Evaluation [4/300]
INFO:tensorflow:Evaluation [5/300]
INFO:tensorflow:Evaluation [6/300]
INFO:tensorflow

INFO:tensorflow:Evaluation [205/300]
INFO:tensorflow:Evaluation [206/300]
INFO:tensorflow:Evaluation [207/300]
INFO:tensorflow:Evaluation [208/300]
INFO:tensorflow:Evaluation [209/300]
INFO:tensorflow:Evaluation [210/300]
INFO:tensorflow:Evaluation [211/300]
INFO:tensorflow:Evaluation [212/300]
INFO:tensorflow:Evaluation [213/300]
INFO:tensorflow:Evaluation [214/300]
INFO:tensorflow:Evaluation [215/300]
INFO:tensorflow:Evaluation [216/300]
INFO:tensorflow:Evaluation [217/300]
INFO:tensorflow:Evaluation [218/300]
INFO:tensorflow:Evaluation [219/300]
INFO:tensorflow:Evaluation [220/300]
INFO:tensorflow:Evaluation [221/300]
INFO:tensorflow:Evaluation [222/300]
INFO:tensorflow:Evaluation [223/300]
INFO:tensorflow:Evaluation [224/300]
INFO:tensorflow:Evaluation [225/300]
INFO:tensorflow:Evaluation [226/300]
INFO:tensorflow:Evaluation [227/300]
INFO:tensorflow:Evaluation [228/300]
INFO:tensorflow:Evaluation [229/300]
INFO:tensorflow:Evaluation [230/300]
INFO:tensorflow:Evaluation [231/300]
I

INFO:tensorflow:Evaluation [72/300]
INFO:tensorflow:Evaluation [73/300]
INFO:tensorflow:Evaluation [74/300]
INFO:tensorflow:Evaluation [75/300]
INFO:tensorflow:Evaluation [76/300]
INFO:tensorflow:Evaluation [77/300]
INFO:tensorflow:Evaluation [78/300]
INFO:tensorflow:Evaluation [79/300]
INFO:tensorflow:Evaluation [80/300]
INFO:tensorflow:Evaluation [81/300]
INFO:tensorflow:Evaluation [82/300]
INFO:tensorflow:Evaluation [83/300]
INFO:tensorflow:Evaluation [84/300]
INFO:tensorflow:Evaluation [85/300]
INFO:tensorflow:Evaluation [86/300]
INFO:tensorflow:Evaluation [87/300]
INFO:tensorflow:Evaluation [88/300]
INFO:tensorflow:Evaluation [89/300]
INFO:tensorflow:Evaluation [90/300]
INFO:tensorflow:Evaluation [91/300]
INFO:tensorflow:Evaluation [92/300]
INFO:tensorflow:Evaluation [93/300]
INFO:tensorflow:Evaluation [94/300]
INFO:tensorflow:Evaluation [95/300]
INFO:tensorflow:Evaluation [96/300]
INFO:tensorflow:Evaluation [97/300]
INFO:tensorflow:Evaluation [98/300]
INFO:tensorflow:Evaluation [

INFO:tensorflow:Evaluation [295/300]
INFO:tensorflow:Evaluation [296/300]
INFO:tensorflow:Evaluation [297/300]
INFO:tensorflow:Evaluation [298/300]
INFO:tensorflow:Evaluation [299/300]
INFO:tensorflow:Evaluation [300/300]
INFO:tensorflow:Finished evaluation at 2017-11-08-06:27:01
INFO:tensorflow:Saving dict for global step 2101: Accuracy = 0.822448, Precision = 0.897646, Recall = 0.860817, global_step = 2101, loss = 0.647516
INFO:tensorflow:Validation (step 2400): Accuracy = 0.822448, Precision = 0.897646, Recall = 0.860817, loss = 0.647516, global_step = 2101
INFO:tensorflow:Saving checkpoints for 2401 into ./fast_text/model.ckpt.
INFO:tensorflow:global_step/sec: 1.60036
INFO:tensorflow:loss = 0.10326, step = 2401 (62.486 sec)
INFO:tensorflow:global_step/sec: 3.13255
INFO:tensorflow:loss = 0.0305876, step = 2501 (31.924 sec)
INFO:tensorflow:global_step/sec: 3.32027
INFO:tensorflow:loss = 0.0179907, step = 2601 (30.117 sec)


KeyboardInterrupt: 

# Test Data

In [None]:
# Prediction input over the test text, one document per batch.
# NOTE(review): only text is passed here, whereas the training inputs also carried SVD
# features (test_svd is never used) — confirm the model does not need them at prediction time.
test_input_fn = test_inputs(
    dataset_df._get_test_data(),
    batch_size=1,
    scope='test-data',
)

In [None]:
# Prediction results for the test input; iterated record by record in the next cell.
predictions_fn = model.predict(test_input_fn)

In [None]:
%time
predictions = []
classes = []

for r in predictions_fn:
    predictions.append(r['probabilities'])
    classes.append(r['classes'])

In [None]:
# Build the submission frame: the document id followed by one probability column per author.
# NOTE(review): the column order assumes the model's class indices follow EAP, HPL, MWS —
# confirm against the label encoder.
ids = dataset_df.test_df['id']
results = pd.DataFrame(predictions, columns=['EAP', 'HPL','MWS'])
# Insert the ids by position (.values). `results` has a fresh 0..n-1 index, so inserting the
# Series itself would align on index labels and produce NaN or misplaced ids whenever the
# test frame's index is not exactly 0..n-1; a length mismatch now raises instead of passing silently.
results.insert(0, "id", ids.values)


In [None]:
# Preview the first rows of the prediction frame before writing it out.
results.head()


In [None]:
# Write the test-set predictions to CSV without the DataFrame index.
# NOTE(review): the file name says "fast_text" although the active model above is BiLstm — confirm the intended name.
results.to_csv("fast_text_tokenized.csv", index=False)

# Train Data Prediction for ensembling

In [None]:
# Prediction input over the full training set (used for ensembling), one document per batch.
# Reads `text_col` — the column the vocabulary was built from — instead of the raw 'text'
# column, so the model sees the same preprocessing it was trained with.
# `.values` replaces Series.as_matrix(), which is deprecated and removed in pandas 1.0.
train_input_fn_2 = test_inputs(dataset_df._get_full_train_df()[text_col].values,
                               batch_size=1,
                               scope='test-data')

In [None]:
# Prediction results for the full training set; iterated record by record in the next cell.
predictions_fn2 = model.predict(train_input_fn_2)

In [None]:
%time
predictions = []
classes = []

for r in predictions_fn2:
    predictions.append(r['probabilities'])
    classes.append(r['classes'])

In [None]:
# Build the train-set prediction frame: the document id followed by one probability column per author.
# NOTE(review): the column order assumes the model's class indices follow EAP, HPL, MWS —
# confirm against the label encoder.
ids = dataset_df._get_full_train_df()['id']
results = pd.DataFrame(predictions, columns=['EAP', 'HPL','MWS'])
# Insert the ids by position (.values). `results` has a fresh 0..n-1 index, so inserting the
# Series itself would align on index labels and produce NaN or misplaced ids whenever the
# training frame's index is not exactly 0..n-1.
results.insert(0, "id", ids.values)
results.head()

In [None]:
# Write the train-set predictions (for ensembling) to CSV without the DataFrame index.
# NOTE(review): the file name mentions "fast_text" and "pos_bigrams", which do not match the cells above — confirm the intended name.
results.to_csv("trainset_fast_text_text_pos_bigrams.csv", index=False)

In [None]:
# Final look at the first rows of the frame that was just written.
results.head()