In [1]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [14]:
from utils.DatasetStorage import Dataset
from utils.paths import *

import os
import itertools

import matplotlib.pyplot as plt

# clasificador
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score
import numpy as np
import pandas as pd

In [3]:
def get_dataset_pad(labeled, domains, dims):
    """Build the table of Proxy A-distance (PAD) values for one dataset.

    Two groups of rows are produced:

    * one row per ordered pair of domains: each unordered pair is measured
      once with ``get_pad`` and then mirrored by ``get_full_dataframe``;
    * one row per leave-one-out split, where the held-out domain is stored in
      ``'Dominio 1'`` and the tuple of remaining domains (whose samples are
      stacked together) is stored in ``'Dominio 2'``.

    Progress is written to standard output while the distances are computed.

    Parameters
    ----------
    labeled : mapping
        ``labeled[domain]['X_tr']`` must provide ``.todense()``.
        NOTE(review): presumably a scipy sparse matrix -- confirm against
        ``utils.DatasetStorage``.
    domains : sequence
        Domain names used as keys of ``labeled``.
    dims : int
        Number of leading feature columns to keep from every matrix.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Dominio 1'``, ``'Dominio 2'``, ``'PAD'``, sorted by
        ``'Dominio 1'`` and then ``'PAD'`` (ascending) with a fresh index.
        With fewer than two domains the table is empty.
    """
    # Local import: sys.stdout.write reproduces the original console output
    # on both Python 2 and Python 3 (the `print x,` form is Python 2 only).
    import sys

    columns = ['Dominio 1', 'Dominio 2', 'PAD']

    def _features(domain):
        # Dense training matrix of `domain`, cut to the first `dims` columns.
        return labeled[domain]['X_tr'].todense()[:, :dims]

    # Pairwise distances: one measurement per unordered pair.
    pair_rows = []
    for src, tgt in itertools.combinations(domains, 2):
        sys.stdout.write("%s - %s - " % (src, tgt))
        sys.stdout.flush()

        pad = get_pad(_features(src), _features(tgt))
        sys.stdout.write(" %s\n" % pad)

        pair_rows.append([src, tgt, pad])

    # Rows are collected in a list and turned into a frame once, instead of
    # growing the frame row by row.
    df_full = get_full_dataframe(pd.DataFrame(pair_rows, columns=columns))

    # Leave-one-out distances: all domains but one against the remaining one.
    multiple_rows = []
    if len(domains) > 1:
        for domains_group in itertools.combinations(domains, len(domains) - 1):
            tgt = list(set(domains).difference(domains_group))[0]
            sys.stdout.write("%s  - %s - " % (str(domains_group), tgt))
            sys.stdout.flush()

            # Stack the samples of every source domain in the group.
            x_src = np.concatenate([_features(domain) for domain in domains_group])
            pad = get_pad(x_src, _features(tgt))
            sys.stdout.write(" %s\n" % pad)

            multiple_rows.append([tgt, domains_group, pad])

    if multiple_rows:
        df_multiple = pd.DataFrame(multiple_rows, columns=columns)
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        df_full = pd.concat([df_full, df_multiple], ignore_index=True)

    df_full = df_full.sort_values(['Dominio 1', 'PAD'], ascending=[True, True]).reset_index(drop=True)

    return df_full

In [4]:
def get_pad(X1, X2):
    """Estimate the Proxy A-distance (PAD) between two sets of samples.

    A linear SVM is trained to tell rows of ``X1`` (label 0) from rows of
    ``X2`` (label 1). 80% of the pooled samples are used for a 5-fold grid
    search over ``C``; the remaining 20% are used for the final score.

    The search is configured with ``scoring='roc_auc'``, and per the
    scikit-learn documentation ``GridSearchCV.score`` uses that scorer, so the
    "error" plugged into the PAD formula is ``1 - ROC AUC`` rather than a
    misclassification rate.

    Parameters
    ----------
    X1, X2 : array-like
        Two-dimensional feature matrices with the same number of columns.

    Returns
    -------
    float
        ``2 * (1 - 2 * error)``.
    """
    # Domain labels: 0 for every row of X1, 1 for every row of X2.
    domain_labels = np.repeat([0.0, 1.0], [X1.shape[0], X2.shape[0]])
    samples = np.concatenate([X1, X2])

    # Fixed seed so the train/test partition is reproducible.
    X_tr, X_ts, y_tr, y_ts = train_test_split(
        samples, domain_labels, test_size=0.2, random_state=42)

    search_space = [{
        'kernel': ['linear'],
        'C': [1, 10, 100],
        'cache_size': [7000],
        'max_iter': [50000],
    }]
    search = GridSearchCV(estimator=SVC(), param_grid=search_space,
                          scoring='roc_auc', n_jobs=3, cv=5)
    search.fit(X_tr, y_tr)

    held_out_error = 1 - search.score(X_ts, y_ts)
    return 2 * (1 - 2*held_out_error)

In [5]:
def get_full_dataframe(df):
    """Return ``df`` plus a mirrored copy with the two domain columns swapped.

    Every row ``(a, b, pad)`` of ``df`` also appears as ``(b, a, pad)`` in the
    result, so each domain can be looked up in ``'Dominio 1'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns ``'Dominio 1'``, ``'Dominio 2'`` and ``'PAD'``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Original and mirrored rows, sorted by ``'Dominio 1'`` and then
        ``'PAD'`` (ascending), with a fresh integer index.
    """
    # Swap the domain columns: reorder them, then restore the original labels.
    mirrored = df.reindex(columns=['Dominio 2', 'Dominio 1', 'PAD'])
    mirrored.columns = ['Dominio 1', 'Dominio 2', 'PAD']

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    combined = pd.concat([df, mirrored], ignore_index=True)
    combined = combined.sort_values(['Dominio 1', 'PAD'], ascending=[True, True]).reset_index(drop=True)

    return combined

In [72]:
def get_pad_multiple_dimensions(dimensions, dataset_name, base_path=None):
    """Summarise, per dimensionality, the closest domain of every domain.

    For each value in ``dimensions`` the file
    ``<base_path>/<dataset_name>/pad_<dim>.csv`` is read and, for every value
    of ``'Dominio 1'``, only the row with the smallest ``'PAD'`` is kept.

    Parameters
    ----------
    dimensions : iterable of int
        Dimensionalities whose saved PAD tables are to be combined.
    dataset_name : str
        Sub-directory holding the CSV files of the dataset.
    base_path : str, optional
        Root directory of the score files. Defaults to the notebook-level
        ``scores_path``.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``'Dominio 1'``, ``'Dimensiones'``, ``'Dominio 2'``, ``'PAD'``,
        sorted by ``'Dominio 1'`` and ``'Dimensiones'`` with a fresh index.
        ``None`` when ``dimensions`` is empty.
    """
    if base_path is None:
        base_path = scores_path

    selected = []
    for dim in dimensions:
        pad_path = os.path.join(base_path, dataset_name, "pad_%d.csv" % (dim))
        print(pad_path)

        scores = pd.read_csv(pad_path, sep=',', header=0, index_col=0)
        # Keep, for every 'Dominio 1', the row with the minimum PAD.
        closest = scores.loc[scores.groupby('Dominio 1')['PAD'].idxmin()]
        # assign() returns a new frame instead of writing into a slice.
        selected.append(closest.assign(Dimensiones=dim))

    if not selected:
        return None

    # Concatenate once (DataFrame.append was removed in pandas 2.0) and do the
    # column selection and sorting a single time, outside the loop.
    df = pd.concat(selected, ignore_index=True)
    df = df[['Dominio 1', 'Dimensiones', 'Dominio 2', 'PAD']]
    df = df.sort_values(['Dominio 1', 'Dimensiones']).reset_index(drop=True)

    return df

# Amazon

## 3000 Dimensiones

In [5]:
# Settings that identify this run and where its results are saved.
# `datasets` is not defined in this notebook (presumably it comes from
# `from utils.paths import *`); the recorded output shows datasets[0] is "amazon".
dataset_name = datasets[0]
dims = 3000

In [6]:
# Echo the active configuration. The single-argument call form of print is
# valid on both Python 2 and Python 3 and produces the same output.
print(dataset_name)
print(dims)
print(data_path)

amazon
3000
data


In [7]:
# Load the Amazon dataset from <data_path>/<dataset_name>.pkl.
# NOTE(review): the ".pkl" extension suggests Dataset.load unpickles the file;
# only load files from a trusted source -- confirm in utils.DatasetStorage.
dataset_path = os.path.join(data_path, dataset_name+'.pkl')
dataset_object = Dataset().load(dataset_path)

# Request a split with 20% held out; the recorded output ("Dataset already
# splitted") shows the stored object had already been split.
dataset_object.split_dataset(test_size=0.2)

# Globals consumed by the cells that compute the PAD table.
labeled = dataset_object.labeled
domains = dataset_object.domains

Dataset already splitted


In [None]:
# Compute the PAD table for the loaded dataset using the first `dims` features.
df_amazon_3000 = get_dataset_pad(labeled, domains, dims)

In [None]:
# Display the PAD table computed for 3000 dimensions.
df_amazon_3000

In [32]:
# Save the PAD table as <scores_path>/<dataset_name>/pad_<dims>.csv.
pad_path = os.path.join(scores_path, dataset_name, "pad_%d.csv" % (dims))

print("Guardando en %s" % pad_path)
# to_csv writes every column by default; passing the frame itself as
# `columns` only worked because iterating a DataFrame yields its column names.
df_amazon_3000.to_csv(pad_path)
print("Resultados guardados.")

Guardando en scores/amazon/pad_3000.csv
Resultados guardados.


## 1000 Dimensiones

In [26]:
# Settings for the 1000-dimension run on the first dataset.
dims = 1000
dataset_name = datasets[0]

# Single-argument print calls are valid on both Python 2 and Python 3.
print(dataset_name)
print(dims)

amazon
1000


In [28]:
# Load the Amazon dataset from <data_path>/<dataset_name>.pkl.
# NOTE(review): the ".pkl" extension suggests Dataset.load unpickles the file;
# only load files from a trusted source -- confirm in utils.DatasetStorage.
dataset_path = os.path.join(data_path, dataset_name+'.pkl')
dataset_object = Dataset().load(dataset_path)

# Request a split with 20% held out; the recorded output shows the stored
# object had already been split.
dataset_object.split_dataset(test_size=0.2)

# Globals consumed by the cells that compute the PAD table.
labeled = dataset_object.labeled
domains = dataset_object.domains

 Dataset already splitted


In [31]:
# Compute the PAD table for the loaded dataset using the first `dims` features.
df_amazon_1000 = get_dataset_pad(labeled, domains, dims)

kitchen - dvd -  1.86390631289
kitchen - electronics - 



 1.49492986228
kitchen - books -  1.9608131853
dvd - electronics -  1.85770238831
dvd - books -  1.83041297327
electronics - books -  1.96772388609
('kitchen', 'dvd', 'electronics')  - books -  1.8421996333
('kitchen', 'dvd', 'books')  - electronics - 



 1.71258897757
('kitchen', 'electronics', 'books')  - dvd -  1.74437823555
('dvd', 'electronics', 'books')  - kitchen -  1.70060396894


In [32]:
# Display the PAD table computed for 1000 dimensions.
df_amazon_1000

Unnamed: 0,Dominio 1,Dominio 2,PAD
0,books,dvd,1.830413
1,books,"(kitchen, dvd, electronics)",1.8422
2,books,kitchen,1.960813
3,books,electronics,1.967724
4,dvd,"(kitchen, electronics, books)",1.744378
5,dvd,books,1.830413
6,dvd,electronics,1.857702
7,dvd,kitchen,1.863906
8,electronics,kitchen,1.49493
9,electronics,"(kitchen, dvd, books)",1.712589


In [33]:
# Save the PAD table as <scores_path>/<dataset_name>/pad_<dims>.csv.
pad_path = os.path.join(scores_path, dataset_name, "pad_%d.csv" % (dims))

print("Guardando en %s" % pad_path)
# to_csv writes every column by default; passing the frame itself as
# `columns` only worked because iterating a DataFrame yields its column names.
df_amazon_1000.to_csv(pad_path)
print("Resultados guardados.")

Guardando en scores/amazon/pad_1000.csv
Resultados guardados.


## Todas las dimensiones

In [75]:
# For every domain, collect its closest domain (minimum PAD) at each
# dimensionality from the CSV files saved by the previous sections.
dataset_name = datasets[0]
# Kept under its own name: rebinding the global `dimensions` to a list breaks
# the `dimensions[dataset_name]` lookup used in the Twitter section when the
# notebook is run top to bottom.
amazon_dimensions = [3000, 1000]
df_amazon_multiple = get_pad_multiple_dimensions(amazon_dimensions, dataset_name)

scores/amazon/pad_3000.csv
scores/amazon/pad_1000.csv


In [74]:
# Display the per-dimension summary of closest domains.
df_amazon_multiple

Unnamed: 0,Dominio 1,Dimensiones,Dominio 2,PAD
0,books,1000,dvd,1.830413
1,books,3000,"('kitchen', 'dvd', 'electronics')",1.937379
2,dvd,1000,"('kitchen', 'electronics', 'books')",1.744378
3,dvd,3000,"('kitchen', 'electronics', 'books')",1.924571
4,electronics,1000,kitchen,1.49493
5,electronics,3000,kitchen,1.712538
6,kitchen,1000,electronics,1.49493
7,kitchen,3000,electronics,1.712538


In [77]:
# Save the per-dimension summary as <scores_path>/<dataset_name>/pad_multiple.csv.
pad_path = os.path.join(scores_path, dataset_name, "pad_multiple.csv")

print("Guardando en %s" % pad_path)
# to_csv writes every column by default; passing the frame itself as
# `columns` only worked because iterating a DataFrame yields its column names.
df_amazon_multiple.to_csv(pad_path)
print("Resultados guardados.")

Guardando en scores/amazon/pad_multiple.csv
Resultados guardados.


# Twitter
## 2000 Dimensiones

In [6]:
# Settings that identify this run and where its results are saved.
# NOTE(review): `dimensions` is indexed by dataset name here, so it must still
# be the mapping that presumably comes from `from utils.paths import *` (the
# recorded output shows 2000 for "twitter"). This cell fails if `dimensions`
# has been rebound to a list earlier in the same kernel session.
dataset_name = datasets[1]
dims = dimensions[dataset_name]

In [7]:
# Echo the active configuration. The single-argument call form of print is
# valid on both Python 2 and Python 3 and produces the same output.
print(dataset_name)
print(dims)
print(data_path)

twitter
2000
data


In [8]:
# Load the Twitter dataset from <data_path>/<dataset_name>.pkl.
# NOTE(review): the ".pkl" extension suggests Dataset.load unpickles the file;
# only load files from a trusted source -- confirm in utils.DatasetStorage.
dataset_path = os.path.join(data_path, dataset_name+'.pkl')
dataset_object = Dataset().load(dataset_path)

# Request a split with 20% held out; the recorded output ("Dataset already
# splitted") shows the stored object had already been split.
dataset_object.split_dataset(test_size=0.2)

# Globals consumed by the cells that compute the PAD table.
labeled = dataset_object.labeled
domains = dataset_object.domains

Dataset already splitted


In [20]:
# Compute the PAD table for the loaded dataset using the first `dims` features.
df_twitter_2000 = get_dataset_pad(labeled, domains, dims)

thevoice - general -  1.60388480445
thevoice - rio2016 -  1.5818581212
general - rio2016 -  1.75551345207
('thevoice', 'general')  - rio2016 -  1.72589763178
('thevoice', 'rio2016')  - general - 



 1.76542905308
('general', 'rio2016')  - thevoice -  1.68177962108


In [23]:
# Save the PAD table as <scores_path>/<dataset_name>/pad_<dims>.csv.
pad_path = os.path.join(scores_path, dataset_name, "pad_%d.csv" % (dims))

print("Guardando en %s" % pad_path)
# to_csv writes every column by default; passing the frame itself as
# `columns` only worked because iterating a DataFrame yields its column names.
df_twitter_2000.to_csv(pad_path)
print("Resultados guardados.")

Guardando en scores/twitter/pad_2000.csv
Resultados guardados.


## 1000 Dimensiones

In [8]:
# Settings that identify this run and where its results are saved.
# `datasets` is not defined in this notebook (presumably it comes from
# `from utils.paths import *`); the recorded output shows datasets[1] is "twitter".
dataset_name = datasets[1]
dims = 1000

In [9]:
# Echo the active configuration. The single-argument call form of print is
# valid on both Python 2 and Python 3 and produces the same output.
print(dataset_name)
print(dims)
print(data_path)

twitter
1000
data


In [10]:
# Load the Twitter dataset from <data_path>/<dataset_name>.pkl.
# NOTE(review): the ".pkl" extension suggests Dataset.load unpickles the file;
# only load files from a trusted source -- confirm in utils.DatasetStorage.
dataset_path = os.path.join(data_path, dataset_name+'.pkl')
dataset_object = Dataset().load(dataset_path)

# Request a split with 20% held out; the recorded output ("Dataset already
# splitted") shows the stored object had already been split.
dataset_object.split_dataset(test_size=0.2)

# Globals consumed by the cells that compute the PAD table.
labeled = dataset_object.labeled
domains = dataset_object.domains

Dataset already splitted


In [11]:
# Compute the PAD table for the loaded dataset using the first `dims` features.
df_twitter_1000 = get_dataset_pad(labeled, domains, dims)

thevoice - general - 



 1.37758298169
thevoice - rio2016 -  1.58857733557
general - rio2016 - 



 1.87866616142
('thevoice', 'general')  - rio2016 - 



 1.74854087089
('thevoice', 'rio2016')  - general - 



 1.60445705358
('general', 'rio2016')  - thevoice - 



 1.48508341974


In [12]:
# Save the PAD table as <scores_path>/<dataset_name>/pad_<dims>.csv.
pad_path = os.path.join(scores_path, dataset_name, "pad_%d.csv" % (dims))

print("Guardando en %s" % pad_path)
# to_csv writes every column by default; passing the frame itself as
# `columns` only worked because iterating a DataFrame yields its column names.
df_twitter_1000.to_csv(pad_path)
print("Resultados guardados.")

Guardando en scores/twitter/pad_1000.csv
Resultados guardados.


## Todas las dimensiones

In [67]:
# For every domain, collect its closest domain (minimum PAD) at each
# dimensionality from the CSV files saved by the previous sections.
dataset_name = datasets[1]
# Kept under its own name: rebinding the global `dimensions` to a list breaks
# any later `dimensions[dataset_name]` lookup in the same kernel session.
twitter_dimensions = [2000, 1000]
df_twitter_multiple = get_pad_multiple_dimensions(twitter_dimensions, dataset_name)

scores/twitter/pad_2000.csv
scores/twitter/pad_1000.csv


In [68]:
# Display the per-dimension summary of closest domains.
df_twitter_multiple

Unnamed: 0,Dominio 1,Dimensiones,Dominio 2,PAD
0,general,1000,thevoice,1.377583
1,general,2000,thevoice,1.603885
2,rio2016,1000,thevoice,1.588577
3,rio2016,2000,thevoice,1.581858
4,thevoice,1000,general,1.377583
5,thevoice,2000,rio2016,1.581858


In [70]:
# Save the per-dimension summary as <scores_path>/<dataset_name>/pad_multiple.csv.
pad_path = os.path.join(scores_path, dataset_name, "pad_multiple.csv")

print("Guardando en %s" % pad_path)
# to_csv writes every column by default; passing the frame itself as
# `columns` only worked because iterating a DataFrame yields its column names.
df_twitter_multiple.to_csv(pad_path)
print("Resultados guardados.")

Guardando en scores/twitter/pad_multiple.csv
Resultados guardados.
