In [32]:
%reload_ext autoreload
%autoreload 2
import os
import json
import pickle as pkl
import numpy as np
import pandas as pd
import sklearn.metrics
from modeling import get_model_cls
from analysis_util import load_encoder, read_file, load_all, get_best_trial, load_model

# def shuffle_col(df, col, seed=None):
#     new_df = df.copy()
#     if seed is None:
#         new_df[col] = np.random.permutation(new_df[col])
#     else:
#         np.random.seed(seed)
#         new_df[col] = np.random.permutation(new_df[col])
#     return new_df


In [33]:
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import numpy as np
import time
import collections
import argparse
import os
import json
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from pickle import dump


## check the package version
# print(pd.__version__)

# # Preprocess data steps:

# 1. split dataset
# 2. transfer datetime data
# 3. encode categorical data
# 4. encode boolean type data
# 5. normalize data

def main():
    parser = argparse.ArgumentParser()

    # # parameters for select input data and metedata configure files
    # parser.add_argument('--data_dir', type=str,
    #     # default='/data/home/t-chepan/projects/MS-intern-project/raw_data',
    #     help=('directory to load the raw data.'))

    # parser.add_argument('--data_name', type=str,
    #     # default='kickstarter',
    #     help=('which data will be used? (kickstarter Or indiegogo?)'))

    parser.add_argument('--metadata_file', type=str,
                        # default='metadata.json',
                        help=('which tabular metadata file will be used?'))

    parser.add_argument('--train_file', type=str,
                        help=('which train file will be used?'))

    parser.add_argument('--dev_file', type=str,
                        help=('which dev file will be used?'))

    parser.add_argument('--test_file', type=str,
                        # must passing test file when using BERT model,
                        help=('which test file will be used?'))

    # parameter for using text features
    parser.add_argument('--use_text_features', type=str2bool, nargs='?',
                        const=True, default=False,
                        help=('whether encode the text features or not?'))

    parser.add_argument('--encode_text_with', type=str,
                        # default='tfidf',
                        help=('how to encode the text features? (tfidf, glove)'))

    parser.add_argument('--glove_file', type=str,
                        # default='/data/home/t-chepan/projects/MS-intern-project/raw_data',
                        help=('directory to the GloVe file will be used. (e.g. glove.840B.300d.txt)'))

    parser.add_argument('--max_words', type=int,
                        # default='/data/home/t-chepan/projects/MS-intern-project/raw_data',
                        help=('what is the maximum number of words for encoding text?'))

    parser.add_argument('--max_sequence_length', type=int,
                        # default='/data/home/t-chepan/projects/MS-intern-project/raw_data',
                        help=('what is the maximum sequence length for encoding text?'))

    parser.add_argument('--output_dir', type=str,
                        # default='/data/home/t-chepan/projects/MS-intern-project/raw_data',
                        help=('directory to save the encoded data.'))

    args = parser.parse_args()

    ### load raw data and related metadata configure file
    # if args.data_name is not None and args.data_dir is not None:
    #     path_to_data = os.path.join(args.data_dir, args.data_name)
    #     path_to_save = os.path.join(args.output_dir, args.data_name)
    #     if not os.path.exists(path_to_save):
    #         os.makedirs(path_to_save)

    # elif args.data_name is None and args.data_dir is not None:
    #     path_to_data = args.data_dir
    #     path_to_save = args.output_dir

    # else:
    #     raise argparse.ArgumentTypeError(args.data_name + ' or ' + args.data_dir + " can't be recognized.")

    # if not os.path.exists(path_to_data):
    #     os.makedirs(path_to_data)

    # if not os.path.exists(path_to_save):
    #     os.makedirs(path_to_save)

    # train_path = os.path.join(path_to_data, args.train_file)
    # dev_path = os.path.join(path_to_data, args.dev_file)
    # test_path = os.path.join(path_to_data, args.test_file)

    print("Start to load data...")

    path_to_save = args.output_dir
    if not os.path.exists(path_to_save):
        os.makedirs(path_to_save)

    df_train = read_file(args.train_file)
    df_dev = read_file(args.dev_file)
    df_test = read_file(args.test_file)

    print('*' * 50)
    print('training set size is {}'.format(df_train.shape[0]))
    print('dev set size is {}'.format(df_dev.shape[0]))
    print('test set size is {}'.format(df_test.shape[0]))

    with open(args.metadata_file, 'r') as f:
        metadata = json.load(f)

    print("Processing data...")

    if args.use_text_features:
        mode = args.encode_text_with
        text_config = Mapping()
        text_config.mode = mode
        text_config.max_words = args.max_words

        if mode == 'glove':
            # glove_file_path = os.path.join(args.glove_dir, args.glove_file)
            text_config.maxlen = args.max_sequence_length
            text_config.embeddings_index = open_glove(args.glove_file)
            text_config.embedding_dim = list(text_config.embeddings_index.values())[0].shape[-1]

        if mode != 'glove' and mode != 'tfidf':
            raise argparse.ArgumentTypeError(mode, "can't be recognized.")

    else:
        text_config = None

    encoder = Encoder(metadata, text_config)

    y_train, X_train_struc, X_train_text = encoder.fit_transform(df_train)
    y_dev, X_dev_struc, X_dev_text = encoder.transform(df_dev)
    y_test, X_test_struc, X_test_text = encoder.transform(df_test)

    if encoder.text_config is not None and encoder.text_config.mode == 'glove':
        f_path = os.path.join(path_to_save, 'embedding_matrix.npy')
        text_config.embedding_matrix_path = f_path
        with open(f_path, 'wb') as f:
            np.save(f, encoder.text_config.embedding_matrix)
        del encoder.text_config.embedding_matrix

    path = os.path.join(path_to_save, 'encoder.pkl')
    dump(encoder, open(path, 'wb'))

    metadata_path = os.path.join(path_to_save, 'metadata.json')
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=4)

    if text_config is not None:
        text_config_path = os.path.join(path_to_save, 'text_config.json')
        with open(text_config_path, 'w') as f:
            json.dump(encoder.text_config, f, indent=4)

    ### save the encoded data ###
    output_list = [y_train, X_train_struc, X_train_text, y_dev, X_dev_struc,
                   X_dev_text, y_test, X_test_struc, X_test_text]
    path_name_list = ['y_train', 'X_train_struc', 'X_train_text', 'y_dev', 'X_dev_struc',
                      'X_dev_text', 'y_test', 'X_test_struc', 'X_test_text']

    for i, e in enumerate(output_list):
        if e is not None:
            e_path = os.path.join(path_to_save, '{}.npy'.format(path_name_list[i]))
            np.save(e_path, e)

    print('Saved the encoded text inputs!')


def read_file(path):
    filename, file_extension = os.path.splitext(path)
    if file_extension == '.csv':
        sep = ','
    elif file_extension == '.tsv':
        sep = '\t'
    else:
        raise ValueError('Unknown type of file: {}. Please add .csv or .tsv.'.format(path))
    df = pd.read_csv(path, sep=sep)
    return df


## use dict like object
class Mapping(dict):

    def __getattr__(self, name):
        if name in self:
            return self[name]
        else:
            raise AttributeError("No such attribute: " + name)

    def __setattr__(self, name, value):
        self[name] = value

    def __delattr__(self, name):
        if name in self:
            del self[name]
        else:
            raise AttributeError("No such attribute: " + name)


def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')


def contain_nontext_features(metadata):
    n_dtype = len(metadata.keys())

    if n_dtype == 1 and 'input_text' in metadata.keys():
        return False
    else:
        return True


def separate_input_output_cols(df, metadata):
    """According to the metadata, separate the input features, output features and
        different types of input features.

    Args:
      df: a DataFrame that stores the raw data.
      metadata: a dictionary that stores the detail description for features.
        metadata = {
        'output_type': 'classes' for classification task (or it can be 'numbers' for regression task)
        'input_features': ['TenantId','CreatedDate', ...],
        'output_label': ['AR_exchange_06','AR_sharepoint_06', ...],
        'input_bool': ['HasEXO','HasSPO', ...],
        'input_categorical': ['CountryCode', 'Languange', ...],
        'input_datetime': ['CreatedDate', ...],
        'input_int': [...] ,
        'input_float': [...]
        }
    Returns:
      df_y: a DataFrame that stores the output labels
      df_X_text: a DataFrame that stores the textual input
      df_X_float: a DataFrame that stores the float inputs
      df_X_int: a DataFrame that stores the integer inputs
      df_X_cat: a DataFrame that stores the categorical inputs
      df_X_datetime: a DataFrame that stores the datetime inputs
      df_X_bool: a DataFrame that stores the boolean inputs

    """
    # input_cols = metadata['input_features']
    output_cols = metadata['output_label']
    input_text_cols = metadata['input_text']
    input_float_cols = metadata['input_float']
    input_int_cols = metadata['input_int']
    input_cat_cols = metadata['input_categorical']
    input_datetime_cols = metadata['input_datetime']
    input_bool_cols = metadata['input_bool']

    df_y = df.loc[:, output_cols]
    df_X_text = df.loc[:, input_text_cols]
    df_X_float = df.loc[:, input_float_cols]
    df_X_int = df.loc[:, input_int_cols]
    df_X_cat = df.loc[:, input_cat_cols]
    df_X_datetime = df.loc[:, input_datetime_cols]
    df_X_bool = df.loc[:, input_bool_cols]

    return df_y, df_X_text, df_X_float, df_X_int, df_X_cat, df_X_datetime, df_X_bool


def encode_datetime(df_X_datetime):
    """Encode the datetime inputs from '2/5/2014 5:31:19 AM' format
        to a numerical number of UTC format.

    Args:
      df_: a DataFrame that only stores the datetime inputs.

    Returns:
      X_datetime: a numpy array that contains the encoded datetime inputs.
      datetime_cols: a list that contains the datetime colunms name.

    """

    cols = df_X_datetime.columns
    for i in cols:
        df_X_datetime[i] = pd.to_datetime(df_X_datetime[i], utc=True,
                                          errors='coerce').astype(int, errors='ignore')

    X_datetime = df_X_datetime.to_numpy()

    return X_datetime


def encode_bool(df_X_bool):
    """Encode the numerical and boolean inputs.

    Args:
      df_X_bool: a DataFrame that stores the boolean inputs

    Returns:
      X_bool: a numpy array that contains the encoded boolean inputs.

    """
    X_bool = df_X_bool.astype(int).to_numpy()
    return X_bool


def encode_num(df_X_num):
    """Encode the numerical and boolean inputs.

    Args:
      df_X_num: a DataFrame that stores the numerical inputs

    Returns:
      X_num: a numpy array that contains the float inputs.

    """
    X_num = df_X_num.to_numpy()
    return X_num


def encode_y(metadata, df_y, y_encoder):
    if metadata['output_type'] == 'classes':
        # encode class values as integers
        y_arr = df_y.values
        if y_encoder is None:
            y_encoder = LabelEncoder()
            y = y_encoder.fit_transform(y_arr)
        else:
            y = y_encoder.transform(y_arr)

        if len(y_encoder.classes_) > 2:
            # convert integers to dummy variables (i.e. one hot encoded)
            y = np_utils.to_categorical(y)

    elif metadata['output_type'] == 'numbers':
        y = df_y.to_numpy()
        y_encoder = None

    else:
        raise ValueError('Unknown type of output: {}'.format(metadata['output_type']))

    return y, y_encoder


def encode_strucdata(metadata, df_X_float, df_X_int, df_X_cat, df_X_datetime, df_X_bool, vectorizer, scaler):
    """Encode the meta data part in dataset, such as numerical and categorical data.

    """
    print('Starting to encode structural data...')

    # df_y, _, df_X_float, df_X_int, df_X_cat, df_X_datetime, df_X_bool = separate_input_output_cols(df, metadata)

    X_list = []
    cols_name = []

    if df_X_float.shape[1] > 0:
        X_float = encode_num(df_X_float)
        X_list.append(X_float)
        cols_name += metadata['input_float']

    if df_X_int.shape[1] > 0:
        X_int = encode_num(df_X_int)
        X_list.append(X_int)
        cols_name += metadata['input_int']

    if df_X_datetime.shape[1] > 0:
        X_datetime = encode_datetime(df_X_datetime)
        X_list.append(X_datetime)
        cols_name += metadata['input_datetime']

    if X_list:
        ### normalize all the inputs except boolean, categorical, and text features
        X_arr = np.concatenate(X_list, axis=1)

        if scaler == None:
            scaler = StandardScaler()
            X_struc = scaler.fit_transform(X_arr)
        else:
            X_struc = scaler.transform(X_arr)
        assert len(cols_name) == X_struc.shape[1]
        print('Except boolean, categorical and text input data after encoding, the shape is {}'.format(X_struc.shape))
        print('we have {} columns.'.format(len(cols_name)))
    else:
        X_struc = None

    ### encode boolean columns
    if df_X_bool.shape[1] > 0:
        X_bool = encode_bool(df_X_bool)
        cols_name += metadata['input_bool']
        if X_struc is None:
            X_struc = X_bool
        else:
            X_struc = np.concatenate([X_struc, X_bool], axis=1)

    ### encode the categorical columns
    if df_X_cat.shape[1] > 0:
        X_cat_dict = df_X_cat.to_dict(orient='records')

        if vectorizer == None:
            vectorizer = DictVectorizer(sparse=False)
            X_cat = vectorizer.fit_transform(X_cat_dict)

        else:
            X_cat = vectorizer.transform(X_cat_dict)

        vocab = vectorizer.vocabulary_
        vocab_od = collections.OrderedDict(sorted(vocab.items(), key=lambda x: x[1]))
        cat_encoded_cols = list(vocab_od.keys())
        cols_name += cat_encoded_cols
        if X_struc is None:
            X_struc = X_cat
        else:
            X_struc = np.concatenate([X_struc, X_cat], axis=1)

    assert len(cols_name) == X_struc.shape[1]
    print('Non-text input data after encoding, the shape is {}'.format(X_struc.shape))
    print('We have {} columns.'.format(len(cols_name)))

    return X_struc, vectorizer, scaler


def open_glove(glove_file_path):
    print('Indexing word vectors.')

    embeddings_index = {}
    f = open(glove_file_path, encoding="utf8")
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs
    f.close()

    print('Found %s word vectors.' % len(embeddings_index))

    return embeddings_index


def encode_textdata(df_X_text, tokenizer, mode, max_words, maxlen):
    ## encode text columns, encoded text features should not be normalized.

    print('Starting to encode text inputs...')

    texts = df_X_text.iloc[:, 0].values.astype('U')
    print('Found %s texts.' % len(texts))

    if mode == 'tfidf':
        if tokenizer is None:
            tokenizer = Tokenizer(num_words=max_words)
            tokenizer.fit_on_texts(texts)
        X_text = tokenizer.texts_to_matrix(texts, mode='tfidf')
        print('tfidf X_text shape: {}'.format(X_text.shape))

    elif mode == 'glove':
        # vectorize the text samples into a 2D integer tensor
        if tokenizer is None:
            tokenizer = Tokenizer(num_words=max_words, oov_token='<UNK>')
            tokenizer.fit_on_texts(texts)
            tokenizer.word_index = {e: i for e, i in tokenizer.word_index.items() if i <= max_words}
            # tokenizer.word_index[tokenizer.oov_token] = max_words + 1

        sequences = tokenizer.texts_to_sequences(texts)

        word_index = tokenizer.word_index
        print('Found %s unique tokens.' % len(word_index))

        X_text = pad_sequences(sequences, maxlen=maxlen, padding='post')
    else:
        raise ValueError('Unknown text processing mode: {}'.format(mode))

    return X_text, tokenizer  ### need to save embedding_matrix as well


def encode_dataset(df, metadata, y_encoder=None, vectorizer=None, scaler=None, tokenizer=None, mode=None,
                   max_words=None, maxlen=None):
    print('Starting to encode dataset...')

    df_y, df_X_text, df_X_float, df_X_int, df_X_cat, df_X_datetime, df_X_bool = separate_input_output_cols(df, metadata)

    y, y_encoder = encode_y(metadata, df_y, y_encoder)

    # check if exist non-text data
    if df_X_float.shape[1] + df_X_int.shape[1] + df_X_cat.shape[1] + df_X_datetime.shape[1] + df_X_bool.shape[1] > 0:
        X_struc, vectorizer, scaler = encode_strucdata(metadata, df_X_float, df_X_int, df_X_cat, df_X_datetime,
                                                       df_X_bool, vectorizer, scaler)
    else:
        X_struc, vectorizer, scaler = None, None, None

    print("complete encoding part of structural data!")

    if not metadata['input_text'] or mode == None:
        X_text, tokenizer = None, None
    else:
        X_text, tokenizer = encode_textdata(df_X_text, tokenizer, mode, max_words, maxlen)

    print("complete encoding part of textual data!")
    return y, y_encoder, X_struc, X_text, vectorizer, scaler, tokenizer


class Encoder(object):

    def __init__(self, metadata, text_config):
        self.text_config = text_config
        self.metadata = metadata
        self.has_nontext = contain_nontext_features(metadata)

    def fit_transform(self, df):
        if self.has_nontext and self.text_config is None:
            y, self.y_encoder, X_struc, X_text, self.vectorizer, self.scaler, _ = encode_dataset(df, self.metadata,
                                                                                                 mode=None)

        elif self.text_config.mode == 'tfidf':
            y, self.y_encoder, X_struc, X_text, self.vectorizer, self.scaler, self.tokenizer = encode_dataset(
                df, self.metadata, mode='tfidf', max_words=self.text_config.max_words)

        elif self.text_config.mode == 'glove':
            y, self.y_encoder, X_struc, X_text, self.vectorizer, self.scaler, self.tokenizer = encode_dataset(
                df, self.metadata, mode='glove', max_words=self.text_config.max_words, maxlen=self.text_config.maxlen)

            word_index = self.tokenizer.word_index
            # prepare embedding matrix
            embedding_matrix = np.zeros((len(word_index) + 1, self.text_config.embedding_dim))
            for word, i in word_index.items():
                embedding_vector = self.text_config.embeddings_index.get(word)
                if embedding_vector is not None:
                    # words not found in embedding index will be all-zeros.
                    embedding_matrix[i] = embedding_vector
            self.text_config.embedding_matrix = embedding_matrix
            del self.text_config.embeddings_index

        else:
            raise ValueError('Unknown type of text_config: {}'.format(self.text_config.mode))

        return y, X_struc, X_text

    def transform(self, df):
        if self.text_config is None:
            y, _, X_struc, X_text, _, _, _ = encode_dataset(df, self.metadata, y_encoder=self.y_encoder,
                                                            vectorizer=self.vectorizer, scaler=self.scaler)

        elif self.text_config.mode == 'tfidf':

            y, _, X_struc, X_text, _, _, _ = encode_dataset(
                df, self.metadata, y_encoder=self.y_encoder,
                vectorizer=self.vectorizer, scaler=self.scaler, tokenizer=self.tokenizer, mode='tfidf',
                max_words=self.text_config.max_words)
        elif self.text_config.mode == 'glove':
            y, _, X_struc, X_text, _, _, _ = encode_dataset(
                df, self.metadata, y_encoder=self.y_encoder,
                vectorizer=self.vectorizer, scaler=self.scaler, tokenizer=self.tokenizer,
                mode='glove', max_words=self.text_config.max_words, maxlen=self.text_config.maxlen)
        else:
            raise ValueError('Unknown type of text_config: {}'.format(self.text_config.mode))

        return y, X_struc, X_text


if __name__ == '__main__':
    main()


usage: ipykernel_launcher.py [-h] [--metadata_file METADATA_FILE]
                             [--train_file TRAIN_FILE] [--dev_file DEV_FILE]
                             [--test_file TEST_FILE]
                             [--use_text_features [USE_TEXT_FEATURES]]
                             [--encode_text_with ENCODE_TEXT_WITH]
                             [--glove_file GLOVE_FILE] [--max_words MAX_WORDS]
                             [--max_sequence_length MAX_SEQUENCE_LENGTH]
                             [--output_dir OUTPUT_DIR]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\chenchenpan\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\Roaming\jupyter\runtime\kernel-3d173a3b-40c5-4a7d-b00a-9f7fcb61a83f.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [61]:
output_dir = 'recommend_V2_NN/outputs/nn_outputs/'
data_file = 'recommend_V2_NN/data/raw_data/tenant_data_featurized.csv'

# output_dir = 'recommend_V3/outputs_1031/nn_outputs/'
# data_file = 'recommend_V3/data/raw_data/tenant_data_featurized_joint.csv'
df = read_file(data_file)

In [62]:
df.head()

Unnamed: 0,TenantId,Age,Age Group,AreaName,CountryCode,IndustryName,TotalUsers,TotalUsersWithSkuAssigned,SubscriptionsCount,EXOEnabledUsers,...,PowerPoint_AllUp,OneNote_AllUp,Outlook_AllUp,EXO_AllUp,SPO_AllUp,OD4B_AllUp,Teams_AllUp,SfB_AllUp,SkypeTeams_AllUp,WordExcel_AllUp
0,f431229b-d3d4-482f-99de-949a81db0616,1456,<10yr,APAC,KOR,Mining,7,6,2,0,...,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0
1,0005e37c-da1d-49a3-9411-87f64edb84e1,1475,<10yr,United States,USA,Others,7,2,2,2,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0
2,f43134df-b1e9-4162-a41a-b6253acdad2f,1406,<10yr,Western Europe,BEL,Other Partner Prof Services,3,1,2,0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,000609e0-8e9c-40f8-83d3-5de84bac7aff,675,<3yr,UK,GBR,Others,3,2,1,2,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0
4,f4314553-921a-447d-be35-8cdf761dd6c4,1650,<10yr,United States,USA,Others,5,2,2,2,...,0.0,0.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0,1.0


In [63]:
df.shape

(1539331, 49)

In [64]:
best_trial = get_best_trial(output_dir)
# model, encoder = load_all(best_trial)

best metric: 0.009605477564036846, best_trial: recommend_V2_NN/outputs/nn_outputs/model_3


In [65]:
print(best_trial)

recommend_V2_NN/outputs/nn_outputs/model_3


In [66]:
model_config_file = os.path.join(best_trial, 'model_config.json')

In [67]:
with open(model_config_file, 'r') as f:
    model_config = json.load(f)

In [68]:
encoder_path = 'recommend_V2_NN/data/encoded_data/encoder.pkl'
file = open(encoder_path, 'rb')

# with open(encoder_path, 'rb') as f:
#     encoder = pkl.load(f)

In [69]:
encoder = pkl.load(file)

In [70]:
model = get_model_cls(model_config['model_type'])(encoder.text_config, model_config)
model.load(best_trial)

In [71]:
model_config

{'model_type': 'mlp',
 'output_dir': '/home/chenchenpan/Projects/AutoML-Toolkit/recommend_NewTenant/outputs/nn_outputs/model_3',
 'task_type': 'regression',
 'num_classes': 8,
 'combine': 'concate',
 'n_layers_dense': 2,
 'hidden_size_dense': 48,
 'n_layers_lstm': 2,
 'hidden_size_lstm': 32,
 'dropout_rate_lstm': 0.0,
 'n_layers_output': 2,
 'hidden_size_output': 30,
 'optimizer': 'adam',
 'learning_rate': 0.0017137964331140918,
 'clipnorm': 5.0,
 'patience': 5,
 'n_epochs': 93,
 'batch_size': 128,
 'verbose': 0,
 'encoded_data_dir': '/home/chenchenpan/Projects/AutoML-Toolkit/recommend_NewTenant/data/encoded_data'}

In [72]:
y, X, _ = encoder.transform(df)

Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (1539331, 16)
we have 16 columns.
Non-text input data after encoding, the shape is (1539331, 269)
We have 269 columns.
complete encoding part of structural data!
complete encoding part of textual data!


In [73]:
%time pred = model.predict(X, output_dir=best_trial)

Wall time: 39.8 s


In [74]:
pred.shape

(1539331, 8)

In [75]:
acc = sklearn.metrics.mean_squared_error(y, pred)

In [76]:
acc

0.0776622018421674

## Calculate the rough Coverage rate (workload based)

In [None]:
delta = pred - y

In [None]:
np.sum(delta > 0.1) / (delta.shape[0] * delta.shape[1])

In [54]:
prediction_path = os.path.join(best_trial, 'predictions.npy')
with open(prediction_path, 'rb') as f:
    predictions = np.load(f)

predictions.shape

(1539331, 8)

In [55]:
predictions[:10]

array([[ 5.6258410e-01,  4.8001297e-02,  1.2509432e-01,  2.4061067e-01,
         2.8289899e-01,  3.8563248e-02,  8.3149374e-02,  8.6173630e-01],
       [ 3.6770325e-02, -9.9395588e-04, -9.5082819e-03,  1.0196586e+00,
         6.2753841e-02,  5.7511833e-02, -8.3401799e-03,  9.2455888e-01],
       [ 8.5995263e-01,  2.2960518e-02,  8.9035857e-01,  3.9760172e-01,
         7.9427135e-01, -1.0074703e-03,  5.5240393e-02,  8.3288765e-01],
       [ 3.2278907e-02, -5.8760606e-03, -9.9389777e-03,  1.0331113e+00,
         7.2634585e-02, -8.1720082e-03,  4.7505319e-02,  9.3563837e-01],
       [ 2.1419208e-02,  4.7179945e-03,  9.2222083e-01,  9.8093635e-01,
         8.5940027e-01, -4.4819890e-03, -2.3044020e-02,  9.3730634e-01],
       [ 1.4560264e-02, -1.8729735e-02,  6.0159671e-01,  9.1402030e-01,
         7.6716095e-02, -8.2609830e-03,  1.4683819e-01, -6.5230384e-02],
       [ 4.4051688e-02, -5.1864646e-03,  4.5810994e-03,  8.0772245e-01,
         8.5491222e-01, -2.5453595e-03,  9.7830987e-01,  8

In [57]:
pred[:10]

array([[ 5.6258410e-01,  4.8001297e-02,  1.2509432e-01,  2.4061067e-01,
         2.8289899e-01,  3.8563248e-02,  8.3149374e-02,  8.6173630e-01],
       [ 3.6770325e-02, -9.9395588e-04, -9.5082819e-03,  1.0196586e+00,
         6.2753841e-02,  5.7511833e-02, -8.3401799e-03,  9.2455888e-01],
       [ 8.5995263e-01,  2.2960518e-02,  8.9035857e-01,  3.9760172e-01,
         7.9427135e-01, -1.0074703e-03,  5.5240393e-02,  8.3288765e-01],
       [ 3.2278907e-02, -5.8760606e-03, -9.9389777e-03,  1.0331113e+00,
         7.2634585e-02, -8.1720082e-03,  4.7505319e-02,  9.3563837e-01],
       [ 2.1419208e-02,  4.7179945e-03,  9.2222083e-01,  9.8093635e-01,
         8.5940027e-01, -4.4819890e-03, -2.3044020e-02,  9.3730634e-01],
       [ 1.4560264e-02, -1.8729735e-02,  6.0159671e-01,  9.1402030e-01,
         7.6716095e-02, -8.2609830e-03,  1.4683819e-01, -6.5230384e-02],
       [ 4.4051688e-02, -5.1864646e-03,  4.5810994e-03,  8.0772245e-01,
         8.5491222e-01, -2.5453595e-03,  9.7830987e-01,  8

In [58]:
np.sum(pred - predictions)

0.0

In [97]:
coverage_file_path = os.path.join('recommend_V2_NN/outputs', 'coverage_calculate_1231.csv')

In [86]:
y_column_name = [
    "TenantId",
    "PowerPoint_MAR",
    "OneNote_MAR",
    "Outlook_MAR",
    "EXO_MAR",
    "OD4B_MAR",
    "SPO_MAR",
    "SkypeTeams_MAR",
    "WordExcel_MAR"]
pred_column_name = [
    "PowerPoint_MAR_pred",
    "OneNote_MAR_pred",
    "Outlook_MAR_pred",
    "EXO_MAR_pred",
    "OD4B_MAR_pred",
    "SPO_MAR_pred",
    "SkypeTeams_MAR_pred",
    "WordExcel_MAR_pred"
]

In [85]:
pred_df = pd.DataFrame(predictions, columns=pred_column_name)
pred_df['TenantId'] = df.loc[:,'TenantId'].copy()
pred_df.head()

Unnamed: 0,PowerPoint_MAR_pred,OneNote_MAR_pred,Outlook_MAR_pred,EXO_MAR_pred,OD4B_MAR_pred,SPO_MAR_pred,SkypeTeams_MAR_pred,WordExcel_MAR_pred,TenantId
0,0.562584,0.048001,0.125094,0.240611,0.282899,0.038563,0.083149,0.861736,f431229b-d3d4-482f-99de-949a81db0616
1,0.03677,-0.000994,-0.009508,1.019659,0.062754,0.057512,-0.00834,0.924559,0005e37c-da1d-49a3-9411-87f64edb84e1
2,0.859953,0.022961,0.890359,0.397602,0.794271,-0.001007,0.05524,0.832888,f43134df-b1e9-4162-a41a-b6253acdad2f
3,0.032279,-0.005876,-0.009939,1.033111,0.072635,-0.008172,0.047505,0.935638,000609e0-8e9c-40f8-83d3-5de84bac7aff
4,0.021419,0.004718,0.922221,0.980936,0.8594,-0.004482,-0.023044,0.937306,f4314553-921a-447d-be35-8cdf761dd6c4


In [91]:
y_df = df.loc[:, y_column_name].copy()
y_df.head()

Unnamed: 0,TenantId,PowerPoint_MAR,OneNote_MAR,Outlook_MAR,EXO_MAR,OD4B_MAR,SPO_MAR,SkypeTeams_MAR,WordExcel_MAR
0,f431229b-d3d4-482f-99de-949a81db0616,0.5,0.0,0.0,0.0,0.16,0.0,0.0,0.83
1,0005e37c-da1d-49a3-9411-87f64edb84e1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,f43134df-b1e9-4162-a41a-b6253acdad2f,0.5,0.0,0.5,0.0,0.5,0.0,0.0,0.5
3,000609e0-8e9c-40f8-83d3-5de84bac7aff,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,f4314553-921a-447d-be35-8cdf761dd6c4,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0


In [92]:
coverage_df = y_df.merge(pred_df, how='inner', on='TenantId')
coverage_df.head()

Unnamed: 0,TenantId,PowerPoint_MAR,OneNote_MAR,Outlook_MAR,EXO_MAR,OD4B_MAR,SPO_MAR,SkypeTeams_MAR,WordExcel_MAR,PowerPoint_MAR_pred,OneNote_MAR_pred,Outlook_MAR_pred,EXO_MAR_pred,OD4B_MAR_pred,SPO_MAR_pred,SkypeTeams_MAR_pred,WordExcel_MAR_pred
0,f431229b-d3d4-482f-99de-949a81db0616,0.5,0.0,0.0,0.0,0.16,0.0,0.0,0.83,0.562584,0.048001,0.125094,0.240611,0.282899,0.038563,0.083149,0.861736
1,0005e37c-da1d-49a3-9411-87f64edb84e1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.03677,-0.000994,-0.009508,1.019659,0.062754,0.057512,-0.00834,0.924559
2,f43134df-b1e9-4162-a41a-b6253acdad2f,0.5,0.0,0.5,0.0,0.5,0.0,0.0,0.5,0.859953,0.022961,0.890359,0.397602,0.794271,-0.001007,0.05524,0.832888
3,000609e0-8e9c-40f8-83d3-5de84bac7aff,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.032279,-0.005876,-0.009939,1.033111,0.072635,-0.008172,0.047505,0.935638
4,f4314553-921a-447d-be35-8cdf761dd6c4,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.021419,0.004718,0.922221,0.980936,0.8594,-0.004482,-0.023044,0.937306


In [93]:
coverage_df.shape

(1539331, 17)

In [101]:
coverage_df.to_csv(coverage_file_path, index=False)

In [102]:
print(coverage_file_path)

recommend_V2_NN/outputs\coverage_calculate_1231.csv


In [103]:
test_df = pd.read_csv(coverage_file_path)
test_df.head()

Unnamed: 0,TenantId,PowerPoint_MAR,OneNote_MAR,Outlook_MAR,EXO_MAR,OD4B_MAR,SPO_MAR,SkypeTeams_MAR,WordExcel_MAR,PowerPoint_MAR_pred,OneNote_MAR_pred,Outlook_MAR_pred,EXO_MAR_pred,OD4B_MAR_pred,SPO_MAR_pred,SkypeTeams_MAR_pred,WordExcel_MAR_pred
0,f431229b-d3d4-482f-99de-949a81db0616,0.5,0.0,0.0,0.0,0.16,0.0,0.0,0.83,0.562584,0.048001,0.125094,0.240611,0.282899,0.038563,0.083149,0.861736
1,0005e37c-da1d-49a3-9411-87f64edb84e1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.03677,-0.000994,-0.009508,1.019659,0.062754,0.057512,-0.00834,0.924559
2,f43134df-b1e9-4162-a41a-b6253acdad2f,0.5,0.0,0.5,0.0,0.5,0.0,0.0,0.5,0.859953,0.022961,0.890359,0.397602,0.794271,-0.001007,0.05524,0.832888
3,000609e0-8e9c-40f8-83d3-5de84bac7aff,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.032279,-0.005876,-0.009939,1.033111,0.072635,-0.008172,0.047505,0.935638
4,f4314553-921a-447d-be35-8cdf761dd6c4,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.021419,0.004718,0.922221,0.980936,0.8594,-0.004482,-0.023044,0.937306
