In [1]:
import json
import os
import random
import pandas as pd
import numpy as np

In [2]:
from bert_serving.client import BertClient
bc = BertClient()

In [3]:
DATA_DIR = "data/"

def get_datasets(type='medium'):
    if type not in ['medium', 'all', 'mini']:
        raise Exception("Invalid type!")
    cols =  ['requester_received_pizza', 'request_text']
    train_df= pd.read_csv(DATA_DIR + f'train_{type}.csv', usecols = cols).reindex(columns=cols)
    train_df[cols[0]] = (train_df[cols[0]] == True ).astype(int)
    dev_df = pd.read_csv(DATA_DIR + f'val_{type}.csv', usecols = cols).reindex(columns=cols)
    dev_df[cols[0]] = (dev_df[cols[0]] == True ).astype(int)
    return train_df, dev_df

In [4]:
train_df, dev_df = get_datasets(type='medium')

In [5]:
def get_encodes(df):
    samples = list(df['request_text'])
    text = [s[:50] + s[-50:] for s in samples]
    features = bc.encode(text)
    return features
  

In [6]:
import tensorflow as tf
from tensorflow.python.estimator.canned.dnn import DNNClassifier
from tensorflow.python.estimator.run_config import RunConfig
from tensorflow.python.estimator.training import TrainSpec, EvalSpec, train_and_evaluate


classifier = DNNClassifier(
    hidden_units=[64, 32, 16 ],
    dropout= 0.1,
    optimizer=tf.train.AdamOptimizer(1e-4),
    feature_columns=[tf.feature_column.numeric_column('feature', shape=(768,))],
    n_classes=2)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/c1/qyvzj96j1bj6cl7g51yg09pw0000gn/T/tmp6bwornrs', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x135374b38>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [7]:
def input(df):
    features = get_encodes(df)
    return features, np.array(df['requester_received_pizza'].astype(np.int32))

In [8]:
train_input_fn = tf.estimator.inputs.numpy_input_fn(
 x={"feature": input(train_df)[0]},
 y=input(train_df)[1],
 num_epochs=3,
 batch_size=32,
 shuffle=True
)

In [9]:
classifier.train(input_fn=train_input_fn, steps=10)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Saving checkpoints for 0 into /var/folders/c1/qyvzj96j1bj6cl7g51yg09pw0000gn/T/tmp6bwornrs/model.ckpt.
INFO:tensorflow:loss = 22.694159, step = 1
INFO:tensorflow:Saving checkpoints for 10 into /var/folders/c1/qyvzj96j1bj6cl7g51yg09pw0000gn/T/tmp6bwornrs/model.ckp

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifier at 0x10ca4c630>

In [10]:
test_input_fn = tf.estimator.inputs.numpy_input_fn(
 x={"feature": input(dev_df)[0]},
 y=input(dev_df)[1],
 num_epochs=1,
 shuffle=False
)

In [11]:
accuracy_score = classifier.evaluate(input_fn=test_input_fn)["accuracy"]

INFO:tensorflow:Calling model_fn.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-11-15T05:49:04Z
INFO:tensorflow:Graph was finalized.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from /var/folders/c1/qyvzj96j1bj6cl7g51yg09pw0000gn/T/tmp6bwornrs/model.ckpt-10
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-11-15-05:49:05
INFO:tensorflow:Saving dict for global step 10: accuracy = 0.7181818, accuracy_baseline = 0.75454545, auc = 0.49464527, auc_precision_recall = 0.24579881, average_loss = 0.6465891, global_step = 10, label/mean = 0.24545455, loss = 71.1248, precision = 0.3, prediction/mean = 0.44629392, recall = 0.11111111
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 10: /var/folders/c1/qyvzj96j1bj6cl

In [12]:
print("\nTest Accuracy: {0:f}%\n".format(accuracy_score*100))



Test Accuracy: 71.818179%



In [13]:
batch_size = 100
n_iters = 3000
epochs = n_iters / (len(train_df) / batch_size)
input_dim = 768
output_dim = 2
lr_rate = 0.001

In [14]:
from sklearn.linear_model import LogisticRegression

train_features, train_y = input(train_df)
logreg = LogisticRegression()
logreg.fit(train_features, train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
dev_features, dev_y = input(dev_df)

In [16]:
y_pred = logreg.predict(dev_features)

In [23]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

score = accuracy_score(dev_y,y_pred)

In [22]:
print (score)

0.5814368585452924


In [24]:
print (score)

0.7454545454545455
