In [1]:
import os
import json
import pickle as pkl
import numpy as np
import pandas as pd
import sklearn.metrics

from analysis_util import read_file, load_all, get_best_trial
from encode_data import Encoder, Mapping

def shuffle_col(df, col, seed=None):
    """Return a copy of ``df`` in which the values of column ``col`` are randomly permuted.

    The input frame is not modified; all other columns of the copy keep their
    original row order.

    Args:
        df: pandas DataFrame to copy.
        col: label of the column whose values are shuffled.
        seed: optional seed. When given, the permutation is reproducible and is
            drawn from a private ``np.random.RandomState(seed)``, so the global
            NumPy random state is left untouched. When ``None``, the global
            NumPy generator is used.

    Returns:
        A new DataFrame with ``col`` shuffled.
    """
    # RandomState(seed) yields the same stream as np.random.seed(seed) followed by
    # np.random.permutation, without reseeding the process-wide generator.
    rng = np.random if seed is None else np.random.RandomState(seed)
    new_df = df.copy()
    # Permute the raw values so the assignment is purely positional.
    new_df[col] = rng.permutation(new_df[col].values)
    return new_df

def evaluate(df, encoder, model):
    """Score ``model`` on ``df`` and return its accuracy.

    ``encoder.transform(df)`` supplies the labels together with the structural
    and text feature arrays; the shape of each array is printed before the
    model predicts on them.

    Args:
        df: data frame to evaluate on.
        encoder: object whose ``transform`` returns ``(y, X_struc, X_text)``.
        model: object whose ``predict(X_struc, X_text)`` returns predictions.

    Returns:
        The value of ``sklearn.metrics.accuracy_score`` for the predictions.
    """
    labels, structural_features, text_features = encoder.transform(df)
    for features in (structural_features, text_features):
        print(features.shape)
    predictions = model.predict(structural_features, text_features)
    return sklearn.metrics.accuracy_score(labels, predictions)


In [2]:
import tensorflow as tf

In [3]:
# Sanity check of tf.keras.metrics.AUC on a tiny example worked out by hand.
m = tf.keras.metrics.AUC(num_thresholds=3)
m.update_state(y_true=[0, 0, 1, 1], y_pred=[0, 0.5, 0.3, 0.9])
# With 3 thresholds the cut points are [0 - 1e-7, 0.5, 1 + 1e-7], which gives
#   tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2]
#   tp_rate (recall) = [1, 0.5, 0], fp_rate = [1, 0, 0]
# so auc = ((1 + 0.5) / 2) * (1 - 0) + ((0.5 + 0) / 2) * (0 - 0) = 0.75
m.result().numpy()

0.75

In [62]:
# Locations of the trained-model outputs and of the dev set.
# The project root can be overridden through the AMPLIFY_AUTOML_ROOT environment
# variable so the notebook is not tied to one machine; when the variable is not
# set, the paths below are exactly the original hard-coded ones.
project_root = os.environ.get(
    'AMPLIFY_AUTOML_ROOT', '/Users/cicipan/projects/Amplify-AutoML-Toolkit')
# The trailing '' keeps the trailing separator of the original output_dir string.
output_dir = os.path.join(project_root, 'demo', 'outputs', 'lr_outputs', '')
data_file = os.path.join(project_root, 'demo', 'data', 'raw_data', 'comb_dev.tsv')
df = read_file(data_file)

In [63]:
# Select the best trial under output_dir (get_best_trial reports the best metric
# and the trial path, see the cell output), then load the (model, encoder) pair
# that the evaluation cells below use.
best_trial = get_best_trial(output_dir)
model, encoder = load_all(best_trial)

best metric: 0.235, best_trial: /Users/cicipan/projects/Amplify-AutoML-Toolkit/demo/outputs/lr_outputs/model_2


In [64]:
# Display the text-encoding mode configured on the loaded encoder.
encoder.text_config.mode

'tfidf'

In [65]:
# Column names of the loaded frame; the permutation-importance loop further down
# iterates over these and skips the label column.
cols = list(df.columns)
cols

['desc_clean', 'desc_len', 'goal', 'category_slug', 'label']

In [66]:
# Preview the first rows of the unshuffled data.
df.head()

Unnamed: 0,desc_clean,desc_len,goal,category_slug,label
0,The RAYGUN GOTHIC ROCKETSHIP is an immers...,19,2500.0,art/conceptual art,0
1,"Decibel Festival is a non-profit, four-day ann...",17,10000.0,art/digital art,0
2,An art project that investigates habitats and ...,21,2000.0,art/sculpture,1
3,I want to give away 300 paintings in three mon...,20,1824.79,art/painting,1
4,Miché Fambro records the long-awaited Jazz Cro...,21,5000.0,music/jazz,0


In [67]:
# Sanity check: shuffle only 'desc_clean' (unseeded) and preview the copy;
# the remaining columns should match the unshuffled preview.
shuffle_col(df, 'desc_clean').head()

Unnamed: 0,desc_clean,desc_len,goal,category_slug,label
0,The premier heavy metal fest in the USA. Bring...,19,2500.0,art/conceptual art,0
1,Two brothers search a swathe of small town bar...,17,10000.0,art/digital art,0
2,I am 15 years old and have managed to get my m...,21,2000.0,art/sculpture,1
3,STEM video games for non-profits is our Maine ...,20,1824.79,art/painting,1
4,BODYlogue:Taking Space is a powerful dance con...,21,5000.0,music/jazz,0


In [68]:
# Baseline: accuracy of the loaded model on the unshuffled data.
original_metrics = evaluate(df, encoder, model)
original_metrics

Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)


  y = column_or_1d(y, warn=True)


0.765

In [69]:
# Permutation feature importance: for every non-label column, shuffle that column
# n_samples times (seeds 0..n_samples-1), re-evaluate the model, and record how far
# the mean shuffled accuracy falls below the accuracy on the unshuffled data.
original_metrics = evaluate(df, encoder, model)

n_samples = 10
label_col = 'label'

feature_importance_dict = {}
for col in (name for name in cols if name != label_col):
    shuffled_scores = [
        evaluate(shuffle_col(df, col, seed=trial), encoder, model)
        for trial in range(n_samples)
    ]
    feature_importance_dict[col] = original_metrics - np.mean(shuffled_scores)

Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)
Starting to encode dataset...
Starting to encode structural data...
Except boolean, categorical and text input data after encoding, the shape is (2000, 2)
we have 2 columns.
Non-text input data after encoding, the shape is (2000, 146)
We have 146 columns.
complete encoding part of structural data!
Starting to encode text inputs...
Found 2000 texts.
tfidf X_text shape: (2000, 1000)
complete encoding part of textual data!
(2000, 146)
(2000, 1000)


  y = column_or_1d(y, warn=True)


In [70]:
# Report features from largest to smallest accuracy drop.
for col in sorted(feature_importance_dict, key=feature_importance_dict.get, reverse=True):
    importance = feature_importance_dict[col]
    print('{}: {}'.format(col, importance))

category_slug: 0.0887500000000001
desc_clean: 0.029449999999999976
goal: 0.006399999999999961
desc_len: 0.005300000000000082
