In [26]:
# Import Libraries

import os
import pandas as pd
import joblib
import io
import json

from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer


In [27]:
# Load the persisted training artefacts: column names, imputer and model

def _load_from_env(env_var):
    """Read a file path from the environment variable `env_var` and load it with joblib.

    Returns the ``(path, loaded_object)`` pair. A missing environment
    variable raises ``KeyError``, exactly as a direct ``os.environ[...]``
    lookup does.
    """
    path = os.environ[env_var]
    return path, joblib.load(path)


colname_path, colnames = _load_from_env('COLNAME_PATH')
imputer_path, imputer = _load_from_env('IMPUTER_PATH')
model_path, clf = _load_from_env('MODEL_PATH')



In [28]:
# Create test data as JSON and turn it into a pandas DataFrame

json_input = """
{
  "products": [
    {
      "title": "Lembrancinha"
    },
    {
      "title": "Carrinho de Bebê"
    }
  ]
}"""


def products_to_frame(products):
    """Build a DataFrame with one row per product record.

    Parameters
    ----------
    products : list of dict
        Parsed JSON records, e.g. ``[{"title": "Lembrancinha"}, ...]``.

    Returns
    -------
    pandas.DataFrame
        One column per key found in the records, values kept as parsed.

    The frame is built directly from the parsed records rather than by
    re-serialising them for ``pd.read_json``: ``read_json`` applies dtype
    inference by default, which would turn a purely numeric title such as
    "2018" into a number and break the text vectorisation that follows.
    """
    return pd.DataFrame(products)


test_data = products_to_frame(json.loads(json_input)['products'])

test_data


Unnamed: 0,title
0,Lembrancinha
1,Carrinho de Bebê


In [29]:
# Create a transformation function

def generate_word_count_frame(column, max_features=1000):
    """Turn a text column into a DataFrame of per-word counts.

    Parameters
    ----------
    column : pandas.Series
        Text values. Missing entries are replaced by empty strings before
        vectorising. The Series name is used as the prefix of the
        generated column names.
    max_features : int, optional
        Vocabulary size limit passed to ``CountVectorizer``. Defaults to
        1000 (the most frequent words only, for memory and speed reasons).

    Returns
    -------
    pandas.DataFrame
        One row per entry of `column`, carrying the same index, and one
        column per vocabulary word named ``<column.name>_<word>``.
    """
    cv = CountVectorizer(max_features=max_features)

    column = column.fillna('')
    tf = cv.fit_transform(column)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back to the old name on older installations.
    get_names = getattr(cv, 'get_feature_names_out', None)
    if get_names is None:
        get_names = cv.get_feature_names

    # Reuse the input index so that a later join/concat on the source frame
    # lines rows up correctly even when the index is not 0..n-1.
    word_count_frame = pd.DataFrame(
        tf.toarray(),
        index=column.index,
        columns=[column.name + '_' + str(word) for word in get_names()],
    )

    return word_count_frame


In [30]:
# Replace the raw `title` text column with its per-word count columns

word_counts = generate_word_count_frame(test_data['title'])
test_data = test_data.drop(columns=['title']).join(word_counts)

test_data

Unnamed: 0,title_bebê,title_carrinho,title_de,title_lembrancinha
0,0,0,0,1
1,1,1,1,0


In [31]:
# Build input data with the same columns, in the same order, as the training data

# A single reindex keeps the columns this request produced, drops any column
# that is not in `colnames`, and adds every missing training column filled
# with NaN. It replaces one DataFrame.join per training column, which
# re-copied the growing frame on every iteration.
# NOTE(review): missing columns are now NaN (float) instead of None (object);
# this assumes the loaded imputer treats NaN as its missing marker - confirm.
input_data = test_data.reindex(columns=list(colnames))

input_data


Unnamed: 0,concatenated_tags_02,concatenated_tags_10,concatenated_tags_100,concatenated_tags_12,concatenated_tags_15,concatenated_tags_15anos,concatenated_tags_18,concatenated_tags_18k,concatenated_tags_1a,concatenated_tags_2018,...,title_vovó,title_vovô,title_xícara,title_água,title_álbum,title_álcool,title_árvore,title_ímã,view_counts,weight
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,


In [32]:
# Pass the aligned frame through the loaded imputer to replace missing values.
# NOTE: `input_data` is rebound to the imputer's output, so re-running this
# cell on its own feeds already-transformed data back into the imputer.

input_data = imputer.transform(input_data)


In [33]:
# Predict a class label for each row of the imputed input with the loaded
# model; the result is the cell's displayed output.

clf.predict(input_data)


array(['Lembrancinhas', 'Bebê'], dtype=object)