In [None]:
#carga de datasets
from utils.DatasetStorage import Dataset
from utils.paths import *

#clasificadores
from utils.clasificacion import *

#adaptacion
from sklearn.model_selection import train_test_split
from utils.adaptacion import create_SDA

from keras.layers import Input, Dense, Dropout
from keras.models import Model

#otros
import os
import numpy as np
import pandas as pd
from sklearn.externals import joblib
import itertools

# Experiment type; used further down as a sub-directory when saving the scores CSV.
# NOTE(review): `pruebas` is not defined in this notebook - presumably it comes
# from one of the star imports above (utils.paths / utils.clasificacion); confirm.
tipo = pruebas[4]

In [None]:
def split_src_tgt(X_src, X_tgt, test_size=0.2):
    """Build the training / validation matrices used to fit the SDA.

    Source and target samples are split independently (each with a fixed
    ``random_state=42`` so the split is reproducible) and the resulting
    pieces are concatenated, source rows first.

    Parameters
    ----------
    X_src : array-like, rows are samples
        Source-domain samples.
    X_tgt : array-like, rows are samples
        Target-domain samples.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split`` for both domains; the share (or
        absolute number) of samples held out for validation.

    Returns
    -------
    (X_train, X_val)
        Concatenated training rows and concatenated validation rows.
    """
    # The split is unsupervised, so no (dummy) label array is required:
    # train_test_split accepts a single array and, for a given random_state,
    # the row partition only depends on the number of samples.
    X_tr_src, X_val_src = train_test_split(X_src, test_size=test_size, random_state=42)
    X_tr_tgt, X_val_tgt = train_test_split(X_tgt, test_size=test_size, random_state=42)

    X_train = np.concatenate((X_tr_src, X_tr_tgt))
    X_val = np.concatenate((X_val_src, X_val_tgt))

    return X_train, X_val

# Pruebas con el dataset Amazon (3000 Dimensiones)

In [None]:
# Number of leading feature columns kept from every data matrix
# (the loops below slice with `[:, :dims]`).
dims = 3000
# NOTE(review): `datasets` is not defined in this notebook - presumably it comes
# from a star import; index 0 is taken to be the Amazon dataset per the section
# title. Confirm.
dataset_name = datasets[0]

In [None]:
# Echo the configuration of this run.
# Single-argument print(...) behaves identically under Python 2 and Python 3.
print(tipo)
print(dataset_name)
print(dims)
print(data_path)

In [None]:
# Load the dataset.
# NOTE(review): the file has a .pkl extension; if Dataset.load unpickles it,
# only trusted files should be loaded - confirm in utils.DatasetStorage.
dataset_path = os.path.join(data_path, dataset_name+'.pkl')
dataset_object = Dataset().load(dataset_path)

# Presumably this creates the 'X_tr'/'y_tr' and 'X_ts'/'y_ts' entries that are
# read from `labeled` further down, holding out 20% for testing - TODO confirm.
dataset_object.split_dataset(test_size=0.2)

# Per-domain labeled data (indexed by domain name below) and the list of domains.
labeled = dataset_object.labeled
domains = dataset_object.domains

In [None]:
# Grid of SDA settings to explore: noise values, hidden-layer size lists
# (derived from `dims`) and epoch counts. They are later passed to
# create_SDA / autoencoder.fit.
parametros = dict(
    noises=[0.3, 0.5, 0.8],
    layers=[
        [dims // 2],
        [dims // 2, dims // 4],
    ],
    epochs=[50],
)

# Cartesian product of the grid: one (noise, layer, epoch) tuple per configuration.
noise_layer_epoch = list(itertools.product(
    parametros['noises'],
    parametros['layers'],
    parametros['epochs'],
))

In [None]:
# Percentages of the target test set on which every model is scored.
porcentajes = [25, 50, 100]

# Every ordered (source, target) pair of domains.
pairs = list(itertools.permutations(domains, 2))

# modelos[(src, tgt)][ptje] -> [best error so far, encoder that achieved it].
# Every slot starts at [100, None], i.e. "nothing selected yet".
modelos = {}
for (src, tgt), ptje in itertools.product(pairs, porcentajes):
    modelos.setdefault((src, tgt), {})[ptje] = [100, None]

In [None]:
# Results table: one row per (task, percentage) holding the best SDA
# configuration found for that combination.
df = pd.DataFrame(columns=dataframe_columns+['Porcentaje'])

porcentajes = [25, 50, 100]
pairs = list(itertools.permutations(domains, 2))
tareas = len(pairs)

# `df` is rebuilt from scratch here, so the best-so-far bookkeeping has to be
# reset as well; otherwise re-running this cell would compare against stale
# errors from a previous run and leave rows of `df` unfilled.
modelos = {pair: {ptje: [100, None] for ptje in porcentajes} for pair in pairs}

for i, (src, tgt) in enumerate(pairs):
    print("Tarea %d de %d" % (i+1, tareas))
    tarea = src[0]+'->'+tgt[0]

    # training data (first `dims` features only)
    X_src = labeled[src]['X_tr'][:, :dims].todense()
    y_src = np.asarray(labeled[src]['y_tr'].todense()).argmax(axis=1)

    X_tgt = labeled[tgt]['X_tr'][:, :dims].todense()
    y_tgt = np.asarray(labeled[tgt]['y_tr'].todense()).argmax(axis=1)

    # unlabeled source+target rows used to train / validate the autoencoder
    X_train, X_val = split_src_tgt(X_src, X_tgt, test_size=0.2)

    # test data (target domain)
    X_tgt_ts = labeled[tgt]['X_ts'][:, :dims].todense()
    y_tgt_ts = np.asarray(labeled[tgt]['y_ts'].todense()).argmax(axis=1)

    # baseline in-domain error
    model_name = "%d_%s.pkl" % (dims, tgt)
    model_path = os.path.join(models_path, dataset_name, "indomain", model_name)

    # A classifier is obtained for the target domain, with its parameters
    # estimated through cross-validation (per the original note; see
    # utils.clasificacion.load_best_score).
    svc = load_best_score(model_path, X_tgt, y_tgt)
    b_error = 1-svc.score(X_tgt_ts, y_tgt_ts)

    # Evaluation subsets of the target test set, drawn once per task with a
    # fixed seed so that every SDA configuration is scored on exactly the same
    # samples and the run is reproducible.
    test_subsets = {}
    for porcentaje in porcentajes:
        if porcentaje == 100:
            test_subsets[porcentaje] = (X_tgt_ts, y_tgt_ts)
        else:
            X_test_pc, _, y_test_pc, _ = train_test_split(
                X_tgt_ts, y_tgt_ts,
                train_size=porcentaje/float(100),
                random_state=42)
            test_subsets[porcentaje] = (X_test_pc, y_test_pc)

    for k, (noise, layer, epoch) in enumerate(noise_layer_epoch):
        print("\tEntrenando modelo de adaptacion %d" % k)
        # train an adaptation model
        autoencoder, encoder = create_SDA(dims, layer, noise)
        autoencoder.fit(X_train, X_train,
                        epochs=epoch,
                        batch_size=256,
                        shuffle=True,
                        verbose=0,
                        validation_data=(X_val, X_val))

        # train a classifier on the adapted (encoded) source data
        X_src_a = encoder.predict(X_src)
        clf = get_best_score(X_src_a, y_src, classifier='SVC', n_jobs=4)

        # evaluate on each percentage of the target test set
        for offset, porcentaje in enumerate(porcentajes):
            X_test_pc, y_test_pc = test_subsets[porcentaje]

            X_test_pc_a = encoder.predict(X_test_pc)
            t_error = 1-clf.score(X_test_pc_a, y_test_pc)

            # NOTE(review): the best configuration is chosen using the target
            # test data itself, so the reported transfer error is optimistic.
            if t_error <= modelos[(src, tgt)][porcentaje][0]:
                modelos[(src, tgt)][porcentaje] = [t_error, encoder]
                t_loss = t_error - b_error
                # one fixed row per (task, percentage); overwritten whenever a
                # better (or equally good) configuration is found
                fila = i*len(porcentajes) + offset
                df.loc[fila] = ['SDA', tarea, src, tgt, b_error*100, t_error*100, t_loss*100, porcentaje]

In [None]:
# Display the results table filled in by the loop above.
df

In [None]:
# Destination: <scores_path>/<dataset_name>/<tipo>/inductive_<dims>.csv
new_scores_path = os.path.join(scores_path,dataset_name, tipo, "inductive_%d.csv" % (dims))

# Make sure the output directory exists so the results of the (long) training
# run are not lost to a missing-directory error.
scores_dir = os.path.dirname(new_scores_path)
if not os.path.isdir(scores_dir):
    os.makedirs(scores_dir)

# Single-argument print(...) behaves identically under Python 2 and Python 3.
print("Guardando en %s" % new_scores_path)
df.to_csv(new_scores_path, columns=df.columns)
print("Resultados guardados.")

# Pruebas con el dataset Twitter (2000 Dimensiones)

In [None]:
# Number of leading feature columns kept from every data matrix
# (the loops below slice with `[:, :dims]`).
dims = 2000
# NOTE(review): `datasets` is not defined in this notebook - presumably it comes
# from a star import; index 1 is taken to be the Twitter dataset per the section
# title. Confirm.
dataset_name = datasets[1]

In [None]:
# Echo the configuration of this run.
# Single-argument print(...) behaves identically under Python 2 and Python 3.
print(tipo)
print(dataset_name)
print(dims)
print(data_path)

In [None]:
# Load the dataset.
# NOTE(review): the file has a .pkl extension; if Dataset.load unpickles it,
# only trusted files should be loaded - confirm in utils.DatasetStorage.
dataset_path = os.path.join(data_path, dataset_name+'.pkl')
dataset_object = Dataset().load(dataset_path)

# Presumably this creates the 'X_tr'/'y_tr' and 'X_ts'/'y_ts' entries that are
# read from `labeled` further down, holding out 20% for testing - TODO confirm.
dataset_object.split_dataset(test_size=0.2)

# Per-domain labeled data (indexed by domain name below) and the list of domains.
labeled = dataset_object.labeled
domains = dataset_object.domains

In [None]:
# Grid of SDA settings to explore: noise values, hidden-layer size lists
# (derived from `dims`) and epoch counts. They are later passed to
# create_SDA / autoencoder.fit.
parametros = dict(
    noises=[0.3, 0.5, 0.8],
    layers=[
        [dims // 2],
        [dims // 2, dims // 4],
    ],
    epochs=[50],
)

# Cartesian product of the grid: one (noise, layer, epoch) tuple per configuration.
noise_layer_epoch = list(itertools.product(
    parametros['noises'],
    parametros['layers'],
    parametros['epochs'],
))

In [None]:
# Percentages of the target test set on which every model is scored.
porcentajes = [25, 50, 100]

# Every ordered (source, target) pair of domains.
pairs = list(itertools.permutations(domains, 2))

# modelos[(src, tgt)][ptje] -> [best error so far, encoder that achieved it].
# Every slot starts at [100, None], i.e. "nothing selected yet".
modelos = {}
for (src, tgt), ptje in itertools.product(pairs, porcentajes):
    modelos.setdefault((src, tgt), {})[ptje] = [100, None]

In [None]:
# Results table: one row per (task, percentage) holding the best SDA
# configuration found for that combination.
df = pd.DataFrame(columns=dataframe_columns+['Porcentaje'])

porcentajes = [25, 50, 100]
pairs = list(itertools.permutations(domains, 2))
tareas = len(pairs)

# `df` is rebuilt from scratch here, so the best-so-far bookkeeping has to be
# reset as well; otherwise re-running this cell would compare against stale
# errors from a previous run and leave rows of `df` unfilled.
modelos = {pair: {ptje: [100, None] for ptje in porcentajes} for pair in pairs}

for i, (src, tgt) in enumerate(pairs):
    print("Tarea %d de %d" % (i+1, tareas))
    tarea = src[0]+'->'+tgt[0]

    # training data (first `dims` features only)
    X_src = labeled[src]['X_tr'][:, :dims].todense()
    y_src = np.asarray(labeled[src]['y_tr'].todense()).argmax(axis=1)

    X_tgt = labeled[tgt]['X_tr'][:, :dims].todense()
    y_tgt = np.asarray(labeled[tgt]['y_tr'].todense()).argmax(axis=1)

    # unlabeled source+target rows used to train / validate the autoencoder
    X_train, X_val = split_src_tgt(X_src, X_tgt, test_size=0.2)

    # test data (target domain)
    X_tgt_ts = labeled[tgt]['X_ts'][:, :dims].todense()
    y_tgt_ts = np.asarray(labeled[tgt]['y_ts'].todense()).argmax(axis=1)

    # baseline in-domain error
    model_name = "%d_%s.pkl" % (dims, tgt)
    model_path = os.path.join(models_path, dataset_name, "indomain", model_name)

    # A classifier is obtained for the target domain, with its parameters
    # estimated through cross-validation (per the original note; see
    # utils.clasificacion.load_best_score).
    svc = load_best_score(model_path, X_tgt, y_tgt)
    b_error = 1-svc.score(X_tgt_ts, y_tgt_ts)

    # Evaluation subsets of the target test set, drawn once per task with a
    # fixed seed so that every SDA configuration is scored on exactly the same
    # samples and the run is reproducible.
    test_subsets = {}
    for porcentaje in porcentajes:
        if porcentaje == 100:
            test_subsets[porcentaje] = (X_tgt_ts, y_tgt_ts)
        else:
            X_test_pc, _, y_test_pc, _ = train_test_split(
                X_tgt_ts, y_tgt_ts,
                train_size=porcentaje/float(100),
                random_state=42)
            test_subsets[porcentaje] = (X_test_pc, y_test_pc)

    for k, (noise, layer, epoch) in enumerate(noise_layer_epoch):
        print("\tEntrenando modelo de adaptacion %d" % k)
        # train an adaptation model
        autoencoder, encoder = create_SDA(dims, layer, noise)
        autoencoder.fit(X_train, X_train,
                        epochs=epoch,
                        batch_size=256,
                        shuffle=True,
                        verbose=0,
                        validation_data=(X_val, X_val))

        # train a classifier on the adapted (encoded) source data
        X_src_a = encoder.predict(X_src)
        clf = get_best_score(X_src_a, y_src, classifier='SVC', n_jobs=4)

        # evaluate on each percentage of the target test set
        for offset, porcentaje in enumerate(porcentajes):
            X_test_pc, y_test_pc = test_subsets[porcentaje]

            X_test_pc_a = encoder.predict(X_test_pc)
            t_error = 1-clf.score(X_test_pc_a, y_test_pc)

            # NOTE(review): the best configuration is chosen using the target
            # test data itself, so the reported transfer error is optimistic.
            if t_error <= modelos[(src, tgt)][porcentaje][0]:
                modelos[(src, tgt)][porcentaje] = [t_error, encoder]
                t_loss = t_error - b_error
                # one fixed row per (task, percentage); overwritten whenever a
                # better (or equally good) configuration is found
                fila = i*len(porcentajes) + offset
                df.loc[fila] = ['SDA', tarea, src, tgt, b_error*100, t_error*100, t_loss*100, porcentaje]

In [None]:
# Display the results table filled in by the loop above.
df

In [None]:
# Destination: <scores_path>/<dataset_name>/<tipo>/inductive_<dims>.csv
new_scores_path = os.path.join(scores_path,dataset_name, tipo, "inductive_%d.csv" % (dims))

# Make sure the output directory exists so the results of the (long) training
# run are not lost to a missing-directory error.
scores_dir = os.path.dirname(new_scores_path)
if not os.path.isdir(scores_dir):
    os.makedirs(scores_dir)

# Single-argument print(...) behaves identically under Python 2 and Python 3.
print("Guardando en %s" % new_scores_path)
df.to_csv(new_scores_path, columns=df.columns)
print("Resultados guardados.")