In [1]:
import numpy as np
import fasttext
from collections import defaultdict
import pickle
import csv
from gensim.models import KeyedVectors
from sklearn.preprocessing import normalize
from lifelong_dnn import LifeLongDNN

In [6]:
# Load the Bing entity data (pickled records carrying pre-computed embedding
# vectors) and group the vectors by entity type.

# fasttext_emb = KeyedVectors.load_word2vec_format('/home/weiwya/lifelong-learning/language_detection/data/wiki-news-300d-1M.vec')

# Entity types kept for the experiment; every other type is discarded.
wanted = {'biology.organism_classification', 'book.written_work',
          'film.character', 'tv.series_episode', 'organization.organization',
          'film.film','music.group', 'commerce.consumer_product', 'book.book',
          'music.artist','people.person','film.actor','book.edition', 'media_common.actor',
          'book.author','american_football.player','tv.program', 'business.operation',
          'education.field_of_study','education.school'}

# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
# The context manager makes sure the file handle is closed after loading.
with open('/mnt/c/Users/weiwya/Downloads/llf_training_data.pickle', 'rb') as fh:
    stuff = pickle.load(fh)

entities = defaultdict(list)
for item in stuff:
    # item[1] is a '/'-separated type path; its last component is the type name.
    itype = item[1].split('/')[-1]
    if itype in wanted:
        # item[-1] holds the vector for this record.
        entities[itype].append(item[-1])

# Stack each type's vectors into a single (n_samples, 300) array.
# Only existing keys are reassigned, so the dict size does not change while iterating.
for k, v in entities.items():
    entities[k] = np.array(v).reshape(-1, 300)
    print(k, entities[k].shape)

# Integer class id <-> entity type name, in dict insertion order.
labels = {t:i for i, t in enumerate(entities.keys())}
idToLabels = {v:k for k, v in labels.items()}

biology.organism_classification (19087, 300)
book.written_work (66485, 300)
film.character (15900, 300)
tv.series_episode (20104, 300)
organization.organization (54972, 300)
film.film (52436, 300)
music.group (11748, 300)
commerce.consumer_product (12739, 300)
book.book (44924, 300)
music.artist (33772, 300)
people.person (19482, 300)
film.actor (24186, 300)
book.edition (13636, 300)
media_common.actor (21307, 300)
book.author (19681, 300)
american_football.player (13008, 300)
tv.program (17527, 300)
business.operation (18216, 300)
education.field_of_study (19200, 300)
education.school (30247, 300)


In [9]:
def generate_train_test(entities, tasks, train_size=10000, test_size=1000):
    """Build shuffled train/test splits for the requested classes.

    Parameters
    ----------
    entities : mapping of str -> 2-D array
        Per-class sample arrays; class ids are the positions of the keys in
        ``entities.keys()`` iteration order.
    tasks : iterable of int
        Class ids to include in this split.
    train_size, test_size : int
        Requested number of train / test rows *per class*.

    Returns
    -------
    trains, train_labels, tests, test_labels
        Stacked sample arrays and the matching class-id arrays, each shuffled
        with the global NumPy random state.
    """
    id_to_label = dict(enumerate(entities.keys()))
    trains, tests = [], []
    train_labels, test_labels = [], []
    for t in tasks:
        dd = entities[id_to_label[t]]

        # Use a per-class budget: a small class must not shrink the training
        # size of the classes that come after it.
        n_train = train_size
        if dd.shape[0] < train_size + test_size:
            # Never go negative when the class has fewer rows than test_size.
            n_train = max(dd.shape[0] - test_size, 0)
            print('adjusted train_size:')
            print(n_train)

        # Random permutation of the class rows, then carve train / test from it.
        idx = np.random.choice(dd.shape[0], dd.shape[0], replace=False)
        dd = dd[idx]
        train = dd[:n_train]
        test = dd[n_train: n_train + test_size]

        trains.append(train)
        tests.append(test)

        # Derive label counts from the actual slices so labels always line up
        # with the stacked rows.
        train_labels += [t] * train.shape[0]
        test_labels += [t] * test.shape[0]

    # shuffle orders for good measure
    idx = np.random.choice(len(train_labels), len(train_labels), replace=False)
    trains = np.vstack(trains)[idx]
    train_labels = np.array(train_labels)[idx]

    idx = np.random.choice(len(test_labels), len(test_labels), replace=False)
    tests = np.vstack(tests)[idx]
    test_labels = np.array(test_labels)[idx]
    return trains, train_labels, tests, test_labels
  

def get_score(prediction, labels):
    """Return the fraction of entries in ``prediction`` that equal ``labels``."""
    matches = labels == prediction
    n_correct = matches.sum()
    n_total = labels.shape[0]
    return n_correct / n_total

In [10]:
def run_iter(seed, entities, n_tasks=5, task_per_batch = 4,  train_size=10000, test_size = 1000, do_random_task = False):
    """Train a LifeLongDNN on a sequence of tasks and measure transfer.

    Each iteration builds one task out of ``task_per_batch`` classes, adds a
    new forest for it, and compares predictions made with only that task's
    own representation against predictions made with representation='all'.

    Parameters
    ----------
    seed : int
        Passed to ``LifeLongDNN(acorn=...)`` and ``np.random.seed``.
    entities : mapping of str -> 2-D array
        Per-class sample arrays; class ids are positions in key order.
    n_tasks : int
        Number of tasks (iterations) to run.
    task_per_batch : int
        Number of classes in each task.
    train_size, test_size : int
        Per-class sizes forwarded to ``generate_train_test``.
    do_random_task : bool
        If True, each task draws ``task_per_batch`` distinct classes at random
        (tasks may overlap). Otherwise the classes are split into consecutive
        non-overlapping batches, which requires
        ``len(entities) == n_tasks * task_per_batch``.

    Returns
    -------
    (seed, (forward_acc, forward_eff, reverse_eff))
        ``forward_acc[i]``: accuracy on task i with representation='all',
        measured right after task i was added.
        ``forward_eff[i]``: (own-representation error) / ('all' error) at that
        same moment.
        ``reverse_eff[j]``: list of the same error ratio for earlier task j,
        one entry per later task added.
    """
    #TODO: turn parallel back on once locking due to ram is fixed
    llf = LifeLongDNN(acorn=seed, parallel=False)
    prv_test, prv_labels = {},{}
    np.random.seed(seed)

    # Derive the id -> name map from the entities actually passed in instead
    # of relying on a notebook-global computed in another cell.
    id_to_label = dict(enumerate(entities.keys()))

    # The fixed batch layout is only needed (and only valid) for the
    # non-overlapping mode; building it unconditionally would make the random
    # mode fail whenever len(entities) != n_tasks * task_per_batch.
    batch_idx = None
    if not do_random_task:
        batch_idx = np.arange(len(entities)).reshape(n_tasks, task_per_batch)

    forward_acc = {}
    reverse_eff= defaultdict(list)
    forward_eff= {}

    for iteration in range(n_tasks):
        if do_random_task:
            cc = np.random.choice(len(entities), task_per_batch, replace=False)
        else:
            cc = batch_idx[iteration]
        print(iteration, cc, [id_to_label[x] for x in cc])

        train, train_labels , test, test_labels = generate_train_test(entities, cc, train_size=train_size, test_size=test_size)
        # Keep this task's test split so it can be re-scored after later tasks.
        prv_test[iteration] = test
        prv_labels[iteration] = test_labels

        llf.new_forest(train, train_labels)

        # p0: the task's own representation only; p1: all representations.
        p0 = llf.predict(test, representation=iteration, decider=iteration)
        p1 = llf.predict(test, representation='all', decider=iteration)
        a0 = get_score(p0, test_labels)
        a1 = get_score(p1, test_labels)
        forward_acc[iteration] = a1
        # NOTE(review): divides by zero when a1 == 1 -- confirm desired handling.
        forward_eff[iteration] = (1-a0) / (1-a1)

        # Re-score every earlier task now that a new forest has been added.
        for j in range(iteration):
            p0 = llf.predict(prv_test[j], representation=j, decider=j)
            p1 = llf.predict(prv_test[j], representation='all', decider=j)
            e0 = 1 - get_score(p0, prv_labels[j])
            e1 = 1 - get_score(p1, prv_labels[j])
            # NOTE(review): divides by zero when e1 == 0 -- confirm desired handling.
            eff = e0 / e1
            print ('%i, org_error: %s tran_error: %s tran_eff: %s' %(j, e0, e1, eff ))
            reverse_eff[j].append(eff)
        print()

    return seed, (forward_acc, forward_eff, reverse_eff)

In [12]:
# Non-overlapping task batches: one full run per seed (0, 100, ..., 900).
res = list()
for i in range(10):
    seed = i * 100
    res.append(run_iter(seed, entities))

0 [0 1 2 3] ['biology.organism_classification', 'book.written_work', 'film.character', 'tv.series_episode']

1 [4 5 6 7] ['organization.organization', 'film.film', 'music.group', 'commerce.consumer_product']
0, org_error: 0.4225 tran_error: 0.42500000000000004 tran_eff: 0.9941176470588234

2 [ 8  9 10 11] ['book.book', 'music.artist', 'people.person', 'film.actor']
0, org_error: 0.4225 tran_error: 0.42000000000000004 tran_eff: 1.005952380952381
1, org_error: 0.6225 tran_error: 0.6 tran_eff: 1.0375

3 [12 13 14 15] ['book.edition', 'media_common.actor', 'book.author', 'american_football.player']
0, org_error: 0.4225 tran_error: 0.4275 tran_eff: 0.9883040935672515
1, org_error: 0.6225 tran_error: 0.595 tran_eff: 1.046218487394958
2, org_error: 0.52 tran_error: 0.5175000000000001 tran_eff: 1.004830917874396

4 [16 17 18 19] ['tv.program', 'business.operation', 'education.field_of_study', 'education.school']
0, org_error: 0.4225 tran_error: 0.4375 tran_eff: 0.9657142857142856
1, org_error:

0, org_error: 0.4075 tran_error: 0.41000000000000003 tran_eff: 0.9939024390243901
1, org_error: 0.615 tran_error: 0.585 tran_eff: 1.0512820512820513
2, org_error: 0.5075000000000001 tran_error: 0.4675 tran_eff: 1.0855614973262033
3, org_error: 0.4625 tran_error: 0.42000000000000004 tran_eff: 1.101190476190476

0 [0 1 2 3] ['biology.organism_classification', 'book.written_work', 'film.character', 'tv.series_episode']

1 [4 5 6 7] ['organization.organization', 'film.film', 'music.group', 'commerce.consumer_product']
0, org_error: 0.38249999999999995 tran_error: 0.36750000000000005 tran_eff: 1.040816326530612

2 [ 8  9 10 11] ['book.book', 'music.artist', 'people.person', 'film.actor']
0, org_error: 0.38249999999999995 tran_error: 0.365 tran_eff: 1.047945205479452
1, org_error: 0.5974999999999999 tran_error: 0.625 tran_eff: 0.9559999999999998

3 [12 13 14 15] ['book.edition', 'media_common.actor', 'book.author', 'american_football.player']
0, org_error: 0.38249999999999995 tran_error: 0.3

In [13]:
# Overlapping (randomly drawn) task batches: one full run per seed (0, 100, ..., 900).
# NOTE(review): this rebinds `res`, replacing any results stored under that name earlier.
res = list()
for i in range(10):
    seed = i * 100
    res.append(run_iter(seed, entities, do_random_task=True))

0 [18  1 19  8] ['education.field_of_study', 'book.written_work', 'education.school', 'book.book']

1 [ 9  8 15  0] ['music.artist', 'book.book', 'american_football.player', 'biology.organism_classification']
0, org_error: 0.3225 tran_error: 0.31999999999999995 tran_eff: 1.0078125000000002

2 [ 0  9 12  2] ['biology.organism_classification', 'music.artist', 'book.edition', 'film.character']
0, org_error: 0.3225 tran_error: 0.3075 tran_eff: 1.048780487804878
1, org_error: 0.27249999999999996 tran_error: 0.265 tran_eff: 1.0283018867924527

3 [1 2 0 3] ['book.written_work', 'film.character', 'biology.organism_classification', 'tv.series_episode']
0, org_error: 0.3225 tran_error: 0.31000000000000005 tran_eff: 1.040322580645161
1, org_error: 0.27249999999999996 tran_error: 0.26249999999999996 tran_eff: 1.0380952380952382
2, org_error: 0.4325 tran_error: 0.38 tran_eff: 1.138157894736842

4 [ 3  0 17 10] ['tv.series_episode', 'biology.organism_classification', 'business.operation', 'people.pe

0, org_error: 0.37749999999999995 tran_error: 0.365 tran_eff: 1.0342465753424657
1, org_error: 0.21999999999999997 tran_error: 0.1875 tran_eff: 1.1733333333333331

3 [ 9 10 14  4] ['music.artist', 'people.person', 'book.author', 'organization.organization']
0, org_error: 0.37749999999999995 tran_error: 0.37250000000000005 tran_eff: 1.013422818791946
1, org_error: 0.21999999999999997 tran_error: 0.18000000000000005 tran_eff: 1.2222222222222217
2, org_error: 0.4475 tran_error: 0.44499999999999995 tran_eff: 1.00561797752809

4 [ 4  7 17  2] ['organization.organization', 'commerce.consumer_product', 'business.operation', 'film.character']
0, org_error: 0.37749999999999995 tran_error: 0.36 tran_eff: 1.048611111111111
1, org_error: 0.21999999999999997 tran_error: 0.1875 tran_eff: 1.1733333333333331
2, org_error: 0.4475 tran_error: 0.42500000000000004 tran_eff: 1.052941176470588
3, org_error: 0.52 tran_error: 0.5175000000000001 tran_eff: 1.004830917874396

0 [13 11 12 18] ['media_common.actor