In [1]:
# Example of simple model load and evaluate

# ===============LICENSE_START=======================================================
# Apache-2.0
# ===================================================================================
# Copyright (C) 2019 AT&T Intellectual Property  All rights reserved.
# ===================================================================================
# This software file is distributed by AT&T
# under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# This file is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ===============LICENSE_END=========================================================


In [2]:
import numpy as np
import pandas as pd
import os,sys,shutil  # file checks
import dill as pickle   # serialize functions and data as compressed binary 
import gzip  # compression 
import yaml   # configuration file
import time  # time tracking

import threading  # threaded process evals

from acumos.wrapped import load_model
from acumos.modeling import Model, List, Dict, create_namedtuple, create_dataframe
from acumos.session import AcumosSession, Requirements

import util_call
import util_review

# load our configuration (abort early with a readable message when it is missing)
config_path = 'config.yaml'
if not os.path.isfile(config_path):
    print("Sorry, can't find the configuration file {}, aborting.".format(config_path))
    sys.exit(-1)
# read through a context manager so the file handle is closed as soon as parsing is done
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)

# Load Raw Data
Load the raw test data and double-check the schema of the data with a random sample.

In [3]:
## PART 1 - load the raw ETL data and peek at its schema
# https://pypi.org/project/acumos/#using-dataframes-with-scikit-learn

# the ETL output is stored as a gzip-compressed pickle (dill);
# NOTE: unpickling executes code, so only load files produced by this project
etl_path = config["path"]["etl"]
with gzip.open(etl_path, mode='rb') as etl_file:
    df = pickle.load(etl_file)

# list the available columns, then show one random row turned sideways for readability
X_test_raw = df["X_test"]
print(X_test_raw.columns)
print(X_test_raw.sample(1).T)

Index(['helpful', 'reviewText', 'summary', 'unixReviewTime', 'categories',
       'description'],
      dtype='object')
row_id                                       -2253507452453936056
helpful                                                    [0, 1]
reviewText      Easy out of the box setup and connection to th...
summary                      Great printer, easy to setup and use
unixReviewTime                                         1405900800
categories                  [office products, office electronics]
description                                                      


# Create wrapped model prototype
Future versions of the API are addressing this issue, but for now we need to mock up what the call structure looks like for a given model. For example, see the `Model Prototype Definition` section of the previous notebook for additional discussion.

*NOTE*: The most natural way to get a model's signature and connection data is to find it on the marketplace and download the required files (e.g. protobuf definition, etc.) from there directly.

Looking at a few example models for text-based sentiment processing we see a few common types there as well.

* **text-to-float** pattern: a textual string is input for the output of class probabilities
> TextIn = create_namedtuple('TextIn', [("TextIn", str)])
  FloatOut = create_namedtuple('FloatOut', [("FloatOut", List[float])])

* **text-to-float** pattern: a textual string is input for the output of class probabilities
> TextIn = create_namedtuple('TextIn', [("TextIn", str)])
  FloatOut = create_namedtuple('FloatOut', [("FloatOut", List[float])])



In [4]:
# --- model template: the call signature mocked up for the remote sentiment models ---
# input wraps a single text string; output wraps a list of floats
TextIn = create_namedtuple('TextIn', [("TextIn", str)])
FloatOut = create_namedtuple('FloatOut', [("FloatOut", List[float])])


def sent_predict(df: TextIn) -> FloatOut:
    '''Dummy function for prediction of a sentence; always returns an empty FloatOut.'''
    return FloatOut([])


# register the same stub under two method names (`sent_predict` and `classify`)
model = Model(sent_predict=sent_predict, classify=sent_predict)

# dump the stub model to disk so that it can be loaded and run locally
session = AcumosSession()
model_dump = "_".join([config["publish"]["name_model3"], "text-to-float"])
path_dump = os.path.join('data', model_dump)
if os.path.exists(path_dump):
    # remove any dump left over from a previous run before writing a new one
    shutil.rmtree(path_dump)
session.dump(model, model_dump, 'data')  # written under data/<model_dump>, the path re-loaded later



# Load & Evaluate a Sentiment Model
Now that we have the model prototype, we can evaluate the sentiment models with the following steps:

1. Iterate through which text models (the shared ones) we want to analyze

2. For the raw training data and test data (the places where we have raw textual reviews), run the following steps, conveniently wrapped in the helper function `call_sentiment_helper`:
    1. Load the right stubbed model template (the one we just saved to disk above)
    2. Sub-sample the raw input data if a max number of items was provided (this speeds up the local demo)
    3. Call our model at a remote URL with the input data
    4. Depending on the model template (the call pattern), pull out specific floating values to keep (flatten)
    5. Return results

3. With the above results, write them to disk if it was a full dataset (because it takes a while) or display them to verify that we're doing the right thing!

4. Finally, collate the different results from each sentiment processor into a final data dictionary that other notebook scripts can utilize.

In [5]:
# first, we define a helper function that will load a model and call it against data
def _expand_list_column(df_result, col):
    """Replace column `col` (one list of floats per row) with numbered scalar columns.

    The new columns are named `<col>_000`, `<col>_001`, ... and there are as many of
    them as the longest list found in `col`; a plain scalar value counts as one feature.
    """
    values = df_result[col].values.tolist()
    # measure the width from the data itself (not from the number of sampled rows)
    feat_n = max((len(v) if isinstance(v, (list, tuple, np.ndarray)) else 1 for v in values),
                 default=1)
    col_expanded = ["{}_{:03d}".format(col, idx) for idx in range(feat_n)]
    df_expand = pd.DataFrame(values, columns=col_expanded, index=df_result.index)
    df_result = df_result.join(df_expand)   # join new expanded columns
    del df_result[col]  # delete old singular column
    return df_result


def call_sentiment_helper(model_name, df_eval, col_process, max_process_items, config, wrapped_model=None):
    """Score one text column of `df_eval` against a remote sentiment model.

    Args:
        model_name: key under `config["sentiment"]` holding the model's
            `style`, `api` and `port` settings.
        df_eval: DataFrame containing the text column to score.
        col_process: name of the column in `df_eval` to send for scoring.
        max_process_items: number of randomly chosen rows to score; 0 scores every row.
        config: configuration dictionary (reads `sentiment` and `publish`).
        wrapped_model: optional pre-loaded wrapped model; when None it is loaded from
            `data/<name_model3>_<style>`.

    Returns:
        DataFrame indexed like the scored rows of `df_eval`, with columns named
        `<model_name>_<col_process>_NNN` for the "text-to-float" style. An empty
        DataFrame is returned when no result came back.
    """
    model_remote_param = config["sentiment"][model_name]

    # we allow the model to be passed because (a) they're all the same, (b) threading breaks with sessions
    if wrapped_model is None:
        model_dump = config["publish"]["name_model3"]+"_"+model_remote_param["style"]
        wrapped_model = load_model(os.path.join('data', model_dump))

    # although there are a few text columns, we'll just send the column `col_process` in for analysis
    # NOTE: we're "wrapping" the one column as well for standard calling structure
    #    nd_sample = [ [text1], [text2], ... ]
    idx_access = list(range(len(df_eval)))
    if max_process_items != 0:   # 0 special case for EVERYTHING
        np.random.shuffle(idx_access)
        idx_access = idx_access[:max_process_items]   # slicing already clamps to the list length
    print("Started processing data... ({} of {} samples)".format(len(idx_access), len(df_eval)))
    nd_sample = [[wrap_item] for wrap_item in df_eval.iloc[idx_access][col_process].values.tolist()]
    # NOTE(review): score_model presumably returns the results plus the positions (into
    # nd_sample) of the samples that were scored -- confirm against util_call
    list_result, list_idx = util_call.score_model(wrapped_model, nd_sample, False,
                        name_function=model_remote_param["api"],
                        url_remote="{}:{}".format(
                            config["sentiment"]["deploy_host"], model_remote_param["port"]))
    index_df = [idx_access[i] for i in list_idx]  # remap our index in case anything was missed!
    df_result = pd.DataFrame(list_result, index=df_eval.index[index_df])
    if df_result.empty:
        # nothing was scored (e.g. every remote call failed); there is no column to unpack
        return df_result

    # now pull out the interesting parts according to known style/output
    col_new = ["{}_{}".format(model_name, c) for c in [col_process]]  # TODO: rework for multi-column?
    if model_remote_param["style"] == "text-to-float":
        df_result = pd.DataFrame(df_result["FloatOut"].values, columns=col_new, index=df_result.index)
    # TODO: other styles....

    # each cell may hold several floats, so spread every column into numbered scalar columns
    for c in list(df_result.columns):
        df_result = _expand_list_column(df_result, c)
    return df_result

def helper_thread(model_name, wrapped_model=None):
    """Score the raw reviews with one sentiment model and print a small preview.

    Reads the notebook-level names `config`, `df_raw`, `sentiment` and
    `max_process_items`. Scoring is skipped when a full run is requested
    (`max_process_items == 0`) and the model's output CSV already exists; the CSV
    is only written for full runs.

    Args:
        model_name: key under `config["sentiment"]` identifying the model.
        wrapped_model: optional pre-loaded wrapped model passed on to
            `call_sentiment_helper`.
    """
    print("=== Started processing for model '{}'... === ".format(model_name))
    path_sentiment = config["sentiment"][model_name]["path"]
    if not os.path.exists(path_sentiment) or max_process_items!=0:  # run training if overall missing or for demo
        df_scored = call_sentiment_helper(model_name, df_raw, 
            sentiment["col_sentiment"], max_process_items, config, wrapped_model=wrapped_model)
        if max_process_items==0:    # only write full datasets
            df_scored.to_csv(path_sentiment, index=True, header=True)
    
        # show a preview of what was just done...
        print("... sample for model '{}'".format(model_name))
        df_preview = df_scored.join(df_raw)
        # never ask for more rows than were scored (sampling 3 of fewer rows raises ValueError)
        n_preview = min(3, len(df_preview))
        print(df_preview.sample(n_preview) if n_preview else df_preview)

# notebook-level settings that `helper_thread` reads when it runs
sentiment = {"col_sentiment": "reviewText"}   # text column handed to the sentiment models

# truncate range for faster evaluation;
# set to 0 for everything (warning it might take a while)
max_process_items = 10

# actual evaluation code...
thread_list = []        # started threads, joined in the next cell
thread_utilize = True   # False evaluates the models one after another in this thread
# load model (WARNING: if you needed another model style than what was created, you may need to rework this)
wrapped_model = load_model(os.path.join('data', model_dump))
# score the test and the raw training reviews together
df_raw = pd.concat([df["X_test"], df["X_train_raw"]])
# evaluate models that are activated/available
for model_name in config["sentiment"]["active_model"]:
    if not thread_utilize:
        helper_thread(model_name, wrapped_model)
        continue
    # creating thread: one per model, all sharing the same wrapped model
    t1 = threading.Thread(target=helper_thread, args=(model_name, wrapped_model))
    t1.start()
    thread_list.append(t1)

=== Started processing for model 'yelp'... === 
Started processing data... (25374 of 25374 samples)
=== Started processing for model 'care'... === 
Started processing data... (25374 of 25374 samples)
=== Started processing for model 'twitter'... === 
Started processing data... (25374 of 25374 samples)


In [6]:
# block until every started scoring thread has finished (nothing to do when the list is empty)
for worker in thread_list:
    worker.join()

Sample 250...
Sample 250...
Sample 500...
Sample 500...
Sample 250...
Sample 750...
Sample 750...
Sample 1000...
Sample 1000...
Output error (http://acumos-gpu.research.att.com:8763), exception (500 Server Error: INTERNAL SERVER ERROR for url: http://acumos-gpu.research.att.com:8763/classify)
Sample 500...
Sample 1250...
Sample 1250...
Sample 1500...
Sample 1500...
Sample 750...
Sample 1750...
Sample 1750...
Sample 2000...
Sample 2000...
Sample 2250...
Sample 1000...
Sample 2500...
Sample 2250...
Sample 2750...
Sample 2500...
Sample 1250...
Sample 3000...
Sample 2750...
Sample 3250...
Sample 3000...
Sample 1500...
Sample 3500...
Sample 3750...
Sample 3250...
Sample 4000...
Sample 1750...
Sample 3500...
Sample 4250...
Sample 3750...
Sample 4500...
Sample 2000...
Sample 4000...
Sample 4750...
Sample 5000...
Sample 4250...
Sample 5250...
Sample 2250...
Sample 4500...
Sample 5500...
Sample 4750...
Sample 5750...
Sample 2500...
Sample 5000...
Sample 6000...
Sample 6250...
Sample 5250...
Sam

Sample 14000...
Sample 14250...
Sample 14500...
Sample 14750...
Output error (http://acumos-gpu.research.att.com:8763), exception (500 Server Error: INTERNAL SERVER ERROR for url: http://acumos-gpu.research.att.com:8763/classify)
Sample 15000...
Sample 15250...
Sample 15500...
Sample 15750...
Sample 16000...
Sample 16250...
Sample 16500...
Sample 16750...
Sample 17000...
Sample 17250...
Sample 17500...
Sample 17750...
Sample 18000...
Sample 18250...
Sample 18500...
Sample 18750...
Sample 19000...
Sample 19250...
Sample 19500...
Sample 19750...
Sample 20000...
Sample 20250...
Sample 20500...
Sample 20750...
Sample 21000...
Sample 21250...
Sample 21500...
Sample 21750...
Sample 22000...
Sample 22250...
Sample 22500...
Sample 22750...
Output error (http://acumos-gpu.research.att.com:8763), exception (500 Server Error: INTERNAL SERVER ERROR for url: http://acumos-gpu.research.att.com:8763/classify)
Sample 23000...
Sample 23250...
Sample 23500...
Sample 23750...
Sample 24000...
Sample 24250

In [7]:
print("Combining and writing combined features to output ETL file...")
# start from empty frames that carry only the row index of each split
sentiment["X_test"] = pd.DataFrame([], index=df["X_test"].index)
sentiment["X_train"] = pd.DataFrame([], index=df["X_train_raw"].index)

# now read the processed samples into our main dataframe
for model_name in config["sentiment"]["active_model"]:
    path_sentiment = config["sentiment"][model_name]["path"]
    if not os.path.exists(path_sentiment):
        # scored CSVs are only written by full runs (max_process_items == 0); say so
        # instead of silently producing an output without this model's features
        print("WARNING: no scored output for model '{}' at '{}', skipping".format(
            model_name, path_sentiment))
        continue
    df_read = pd.read_csv(path_sentiment, index_col="row_id")
    # left-join on the row index; rows the model did not score get 0
    for split in ("X_train", "X_test"):
        sentiment[split] = sentiment[split].join(df_read).fillna(0)
    print("Model '{}' read sentiment dimensions {}".format(model_name, df_read.shape))

print("Combined training dimensions {}, test dimensions {}".format(sentiment["X_train"].shape, sentiment["X_test"].shape))
with gzip.open(config["path"]["sentiment"], 'wb') as f:
    pickle.dump(sentiment, f)


Combining and writing combined features to output ETL file...
Model 'yelp' read sentiment dimensions (25374, 1)
Model 'care' read sentiment dimensions (25367, 1)
Model 'twitter' read sentiment dimensions (25374, 1)
Combined training dimensions (20299, 3), test dimensions (5075, 3)
