In [1]:
!pip install modAL

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import os
import csv
import time
import pickle
import pandas as pd
import numpy as np
from functools import partial

import tensorflow as tf
from tensorflow import keras

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from modAL.models import ActiveLearner
from modAL.uncertainty import *
from modAL.batch import uncertainty_batch_sampling

import matplotlib.pyplot as plt
%matplotlib inline

import warnings

In [3]:
filePathName = './RandomForest.pkl'
loaded_model = pickle.load(open(filePathName, 'rb'))



In [4]:
useStats = 1
useAttributeName = 1
useSample1 = 0
useSample2 = 0
## Using descriptive stats and attribute name

In [5]:
dict_label = {
    'numeric': 0,
    'categorical': 1,
    'datetime': 2,
    'sentence': 3,
    'url': 4,
    'embedded-number': 5,
    'list': 6,
    'not-generalizable': 7,
    'context-specific': 8
}

In [6]:
def ProcessStats(data,y):

    data1 = data[['total_vals', 'num_nans', '%_nans', 'num_of_dist_val', '%_dist_val', 'mean', 'std_dev', 'min_val', 'max_val','has_delimiters', 'has_url', 'has_email', 'has_date', 'mean_word_count',
       'std_dev_word_count', 'mean_stopword_total', 'stdev_stopword_total',
       'mean_char_count', 'stdev_char_count', 'mean_whitespace_count',
       'stdev_whitespace_count', 'mean_delim_count', 'stdev_delim_count',
       'is_list', 'is_long_sentence']]
    data1 = data1.reset_index(drop=True)
    data1 = data1.fillna(0)

    y.y_act = y.y_act.astype(float)
    
    return data1

vectorizerName = CountVectorizer(ngram_range=(2, 2), analyzer='char')
vectorizerSample = CountVectorizer(ngram_range=(2, 2), analyzer='char')

def FeatureExtraction(data,data1,flag):

    arr = data['Attribute_name'].values
    arr = [str(x) for x in arr]
    
    arr1 = data['sample_1'].values
    arr1 = [str(x) for x in arr1]
    arr2 = data['sample_2'].values
    arr2 = [str(x) for x in arr2]
    arr3 = data['sample_3'].values
    arr3 = [str(x) for x in arr3]    
    print(len(arr1),len(arr2))
    if flag:
        X = vectorizerName.fit_transform(arr)
        X1 = vectorizerSample.fit_transform(arr1)
        X2 = vectorizerSample.transform(arr2)   
    else:
        X = vectorizerName.transform(arr)
        X1 = vectorizerSample.transform(arr1)
        X2 = vectorizerSample.transform(arr2)        
        
#     print(f"> Length of vectorized feature_names: {len(vectorizer.get_feature_names())}")

    attr_df = pd.DataFrame(X.toarray())
    sample1_df = pd.DataFrame(X1.toarray())
    sample2_df = pd.DataFrame(X2.toarray())
    print(len(data1),len(attr_df),len(sample1_df),len(sample2_df))

    if useSample1: data2 = sample1_df
    if useSample2: data2 = sample2_df    
    
    data2 = pd.concat([data1, attr_df], axis=1, sort=False)
    print(len(data2))
    return data2



In [7]:
def get_data(sim_size = 0.9):
    """
    sim_size is the % of training data that goes into simulation set.
    """
    xtrain = pd.read_csv('./data_train.csv')
    xtest = pd.read_csv('./data_test.csv')

    y_train = xtrain.loc[:,['y_act']]
    y_test = xtest.loc[:,['y_act']]
    y_train['y_act'] = [dict_label[i] for i in y_train['y_act']]
    y_test['y_act'] = [dict_label[i] for i in y_test['y_act']]
    
    xtrain1 = ProcessStats(xtrain,y_train)
    xtest1 = ProcessStats(xtest,y_test)


    X_train = FeatureExtraction(xtrain,xtrain1,1)
    X_test = FeatureExtraction(xtest,xtest1,0)


    X_train_new = X_train.reset_index(drop=True)
    y_train_new = y_train.reset_index(drop=True)
    X_train_new = X_train_new.values
    y_train_new = y_train_new.values
    
    # setting up data
    X_train_AL, X_simulation_AL, y_train_AL, y_simulation_AL = train_test_split(X_train_new, y_train, test_size=sim_size, random_state=4, stratify=y_train, shuffle=True)
    # Start AL loops simulated with a part of the test data
    X_simulation_df = pd.DataFrame(X_simulation_AL)
    y_simulation_df = pd.DataFrame(y_simulation_AL)
    # making copies of existing X_test and y_test data
    X_test_AL = pd.DataFrame(X_test)
    y_test_AL = pd.DataFrame(y_test)
    
    assert len(X_train_AL) == len(y_train_AL)
    assert len(X_simulation_df) == len(y_simulation_df)
    assert len(X_test_AL) == len(y_test_AL)
    return X_train_AL, y_train_AL, X_simulation_df, y_simulation_df, X_test_AL, y_test_AL

X_train, y_train, X_sim, y_sim, X_test, y_test = get_data()
print(f"Training set size: {len(X_train)}, Simulation set size: {len(X_sim)}, Test set size: {len(X_test)}")

7936 7936
7936 7936 7936 7936
7936
1985 1985
1985 1985 1985 1985
1985
Training set size: 5158, Simulation set size: 2778, Test set size: 1985


In [11]:
def grid_parameter_search(X_train, y_train, X_test, y_test, n_estimators_grid, max_depth_grid, query_size):
    best_model_score = 0
    
    preset_batch = partial(uncertainty_batch_sampling, n_instances=query_size)
    
    for ne in n_estimators_grid:
        for md in max_depth_grid:
            learner = ActiveLearner(
                  estimator=RandomForestClassifier(n_estimators=ne, max_depth=md, random_state=100),
                  X_training=X_train, y_training=np.ravel(y_train),
                  query_strategy=preset_batch
                )
            score = learner.score(X_test, y_test)
            print(f"[n_estimator: {ne}, max_depth: {md}, accuracy: {score}]")
            if best_model_score < score:
                best_ne = ne
                best_md = md
                best_model_score = score
                bestPerformingModel = learner
    print(f"best model found at ne:{best_ne}, md:{best_md}, at score:{best_model_score}")
    return learner, best_ne, best_md, best_model_score


learner, best_ne, best_md, best_model_score = grid_parameter_search(X_train, y_train, X_sim, y_sim, [i*100 for i in range(1,11)],[i*100 for i in range(1,11)], 100)
print(f"[BEST OBTAINED RF ESTIMATOR] === [n_estimator: {best_ne}, max_depth: {best_md}, accuracy: {best_model_score}]")

[n_estimator: 100, max_depth: 100, accuracy: 0.9233261339092873]
[n_estimator: 100, max_depth: 200, accuracy: 0.9233261339092873]
[n_estimator: 100, max_depth: 300, accuracy: 0.9233261339092873]
[n_estimator: 100, max_depth: 400, accuracy: 0.9233261339092873]
[n_estimator: 100, max_depth: 500, accuracy: 0.9233261339092873]
[n_estimator: 100, max_depth: 600, accuracy: 0.9233261339092873]
[n_estimator: 100, max_depth: 700, accuracy: 0.9233261339092873]
[n_estimator: 100, max_depth: 800, accuracy: 0.9233261339092873]
[n_estimator: 100, max_depth: 900, accuracy: 0.9233261339092873]
[n_estimator: 100, max_depth: 1000, accuracy: 0.9233261339092873]
[n_estimator: 200, max_depth: 100, accuracy: 0.9251259899208063]
[n_estimator: 200, max_depth: 200, accuracy: 0.9251259899208063]
[n_estimator: 200, max_depth: 300, accuracy: 0.9251259899208063]
[n_estimator: 200, max_depth: 400, accuracy: 0.9251259899208063]
[n_estimator: 200, max_depth: 500, accuracy: 0.9251259899208063]
[n_estimator: 200, max_d

In [9]:
def train_this(n_est, max_dept, query_size, per_iteration_acc_report = True):
    
    # getting data
    X_train, y_train, X_sim, y_sim, X_test, y_test = get_data()
    
    # init tracker lists
    num_queries = []
    time_history = []
    total_time_history = []
    train_acc_history = []
    sim_acc_history = []
    test_acc_history = []
    
    # AL cycle
    t_start = time.time()
    num_of_queries = int(len(X_sim)/query_size) + 1
    for i in range(num_of_queries):
        print(f"\n*********** QUERY {i} ***********remain:{len(X_simulation_df)}")
        t0 = time.time()
        
        #AL selection
        query_idx, query_inst = learner.query(np.array(X_sim))
        print(f"Nodes returned for query in iteration {i}: {query_idx}")
        
        #updating learner
        learner.teach(X = X_simulation_df.iloc[query_idx], y = y_simulation_df.iloc[query_idx])
        feed_to_learner.append(list(query_idx))
        
        X_simulation_df = X_simulation_df.drop(list(query_idx))
        y_simulation_df = y_simulation_df.drop(list(query_idx))

        #df version
        # delete queries that have been looped back into the model
        #X_simulation_df = X_simulation_df.drop(X_simulation_df.index[query_idx])
        #y_simulation_df = y_simulation_df.drop(y_simulation_df.index[query_idx])

        #np version
        # learner.teach(X = pd.Series(X_simulation_np[idx]), y = y_simulation_np[idx])
        # X_simulation_np = np.delete(X_simulation_np, query_index, axis=0)
        # y_simulation_np = np.delete(y_simulation_np, query_index)
        
        # recording history
        t1 = time.time()
        time_history.append(t1 - t0)
        total_time_history.append(t1 - t_start)
        num_queries.append(len(feed_to_learner))

        # Calculate and report our model's accuracy.
        if per_iteration_acc_report:
            test_acc = learner.score(X_test, y_test)
            sim_acc = learner.score(X_sim, y_sim)
            train_acc = learner.score(X_train, y_train)
                
            test_acc_history.append(  test_acc  )
            sim_acc_history.append(   sim_acc   )
            train_acc_history.append( train_acc )
            
    
            print(f"\n[INTERMEDIATE] test_acc:{test_acc}, sim_acc:{sim_acc}, train_acc:{train_acc}")
            
        print(f"\n[INTERMEDIATE] Time taken for query {i}: {time_history[-1]}")
        print(f"\n[INTERMEDIATE] Total time taken {i}: {total_time_history[-1]}")

   

In [None]:
# keep_percent = 0.2
query_sizes = [250,500] #[5, 10, 25, 50, 100]


In [13]:

his_folder = "./his"
img_folder = "./img"
feed_his_folder = "./feed"

test_data_1 = [i for i in range(100)]
test_data_2 = [i*2 for i in range(100)]
test_data_3 = [i*3 for i in range(100)]
test_data_4 = [i*4 for i in range(100)]

data = {"num_queries": test_data_1, "time": test_data_2, "total_time":test_data_3, "test_accuracy":test_data_4}

#his_file_name = "ubs_history_{prefix}__ne{best_ne}_md{best_md}_qs{query_size}.csv"#ubs is uncertainty batch sampling
his_file_name = "test.csv"
with open(f"{his_folder}/{his_file_name}", "wb") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(data.keys())
    writer.writerows(zip(*d.values()))

plt.plot([1] + [i+1 for i in range(1, num_of_queries + 1)], model_accuracies)
plt.xlabel("Queries")
plt.ylabel("Model accuracy on test set")
plt.savefig(f"./img/{prefix}_ne{best_ne}_md{best_md}_qs{query_size}.png")
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: './his/test.csv'