In [1]:
# Enable IPython's autoreload extension in mode 2 so that edited project
# modules are re-imported automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
from utils.DatasetStorage import Dataset
from utils.paths import *

import os
import itertools

import matplotlib.pyplot as plt

# clasificador
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score
import numpy as np
import pandas as pd

In [3]:
def _dense_features(labeled, domain, dims):
    """Return the dense training matrix of `domain` cut to its first `dims` columns."""
    # assumes labeled[domain]['X_tr'] is a sparse matrix exposing .todense() -- TODO confirm
    return labeled[domain]['X_tr'].todense()[:, :dims]


def get_dataset_pad(labeled, domains, dims):
    """Compute the Proxy A-distance (PAD) table for a set of domains.

    Two families of comparisons are evaluated with `get_pad`:

    * every unordered pair of domains; each pair is then listed in both
      directions through `get_full_dataframe`;
    * every domain against the union of all the remaining domains
      (leave-one-domain-out).

    Parameters
    ----------
    labeled : mapping
        ``labeled[domain]['X_tr']`` must provide the training features of
        each domain (an object with a ``todense()`` method).
    domains : sequence
        Domain names to compare.
    dims : int
        Number of leading feature columns kept from each matrix.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Dominio 1'``, ``'Dominio 2'``, ``'PAD'``, sorted by
        ``'Dominio 1'`` and then by ``'PAD'`` (both ascending). For the
        leave-one-out rows ``'Dominio 1'`` is the held-out domain and
        ``'Dominio 2'`` is the tuple with the remaining domains.
    """
    import sys  # local import: only needed for the flushed progress output

    columns = ['Dominio 1', 'Dominio 2', 'PAD']

    # --- pairwise comparisons -------------------------------------------
    pair_rows = []
    for src, tgt in itertools.combinations(domains, 2):
        # Announce the pair before the (slow) SVM fit, then complete the line
        # with the PAD value once it is available.
        sys.stdout.write("%s - %s -  " % (src, tgt))
        sys.stdout.flush()

        pad = get_pad(_dense_features(labeled, src, dims),
                      _dense_features(labeled, tgt, dims))
        print(pad)

        pair_rows.append([src, tgt, pad])

    # Build the frame once instead of growing it row by row.
    df_full = get_full_dataframe(pd.DataFrame(pair_rows, columns=columns))

    # --- one domain against all the others ------------------------------
    multiple_rows = []
    for domains_group in itertools.combinations(domains, len(domains) - 1):
        # The held-out domain is the only one missing from the group.
        tgt = list(set(domains).difference(domains_group))[0]
        sys.stdout.write("%s  - %s -  " % (str(domains_group), tgt))
        sys.stdout.flush()

        x_tgt = _dense_features(labeled, tgt, dims)
        # Stack the data of every domain in the group with a single concatenate.
        x_src = np.concatenate(
            [_dense_features(labeled, domain, dims) for domain in domains_group]
        )

        pad = get_pad(x_src, x_tgt)
        print(pad)

        multiple_rows.append([tgt, domains_group, pad])

    df_multiple = pd.DataFrame(multiple_rows, columns=columns)

    # DataFrame.append was removed in pandas 2.0; pd.concat works on every version.
    df_full = pd.concat([df_full, df_multiple], ignore_index=True)
    df_full = df_full.sort_values(['Dominio 1', 'PAD'], ascending=[True, True]).reset_index(drop=True)

    return df_full

In [4]:
def get_pad(X1, X2, test_size=0.2, random_state=42, cv=5, n_jobs=4, scoring='roc_auc'):
    """Compute the Proxy A-distance (PAD) between two domains X1 and X2.

    A linear SVM is trained to tell the rows of `X1` (label 0) apart from
    the rows of `X2` (label 1). Its score on a held-out split is turned into
    an error ``e = 1 - score`` and the result is ``2 * (1 - 2 * e)``.

    NOTE(review): PAD is usually defined from the classification error. With
    the default ``scoring='roc_auc'`` the quantity used here is ``1 - AUC``;
    pass ``scoring='accuracy'`` to use the plain error rate. Confirm which one
    is intended before comparing against published values.

    Parameters
    ----------
    X1, X2 : array-like, shape (n_samples_i, n_features)
        Feature matrices of the two domains; they must share the column count.
    test_size : float, optional
        Fraction of the pooled samples held out to measure the error.
    random_state : int, optional
        Seed of the train/test split.
    cv : int, optional
        Number of cross-validation folds used by the grid search.
    n_jobs : int, optional
        Parallel jobs used by the grid search.
    scoring : str, optional
        Metric used both for model selection and for the held-out score.

    Returns
    -------
    float
        The PAD estimate.
    """
    X = np.concatenate([X1, X2])

    # Domain labels: 0 for rows coming from X1, 1 for rows coming from X2.
    y = np.concatenate([np.zeros(X1.shape[0]), np.ones(X2.shape[0])])

    X_tr, X_ts, y_tr, y_ts = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Only C is actually searched; the single-valued entries just fix SVC
    # options (kernel cache size and iteration cap) for every candidate.
    parametros = [{
        'kernel': ['linear'],
        'C': [1, 10, 100],
        'cache_size': [7000],
        'max_iter': [50000],
    }]

    clf = GridSearchCV(SVC(), parametros, cv=cv, n_jobs=n_jobs, scoring=scoring)
    clf.fit(X_tr, y_tr)

    # GridSearchCV.score evaluates with the metric given in `scoring`.
    e = 1 - clf.score(X_ts, y_ts)
    pad = 2 * (1 - 2 * e)

    return pad

In [5]:
def get_full_dataframe(df):
    """Return `df` plus the same rows with the two domain columns swapped.

    Every pair (a, b, PAD) therefore also appears as (b, a, PAD), so the
    table can be looked up from either domain.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``'Dominio 1'``, ``'Dominio 2'`` and ``'PAD'``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Original and mirrored rows, sorted by ``'Dominio 1'`` and then
        ``'PAD'`` (ascending), with a fresh 0..n-1 index.
    """
    # Reorder the columns and then relabel them: the values of the two
    # domain columns end up exchanged.
    swapped = df.reindex(columns=['Dominio 2', 'Dominio 1', 'PAD'])
    swapped.columns = ['Dominio 1', 'Dominio 2', 'PAD']

    # DataFrame.append was removed in pandas 2.0; pd.concat works on every version.
    combined = pd.concat([df, swapped], ignore_index=True)
    combined = combined.sort_values(['Dominio 1', 'PAD'], ascending=[True, True]).reset_index(drop=True)

    return combined

In [6]:
def get_pad_multiple_dimensions(dimensions, dataset_name):
    """Merge the saved PAD tables of several dimensionalities.

    For every value in `dimensions` the file
    ``<scores_path>/<dataset_name>/pad_<dim>.csv`` is read and, for each
    ``'Dominio 1'``, only the row with the smallest PAD is kept.

    Parameters
    ----------
    dimensions : iterable of int
        Dimensionalities whose CSV files must already exist.
    dataset_name : str
        Sub-folder of `scores_path` that holds the CSV files.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``'Dominio 1'``, ``'Dimensiones'``, ``'Dominio 2'``, ``'PAD'``
        sorted by ``'Dominio 1'`` and ``'Dimensiones'``; ``None`` when
        `dimensions` is empty.
    """
    frames = []

    for dim in dimensions:
        pad_path = os.path.join(scores_path, dataset_name, "pad_%d.csv" % (dim))
        print(pad_path)

        df_dim = pd.read_csv(pad_path, sep=',', header=0, index_col=0)
        # Keep, for each 'Dominio 1', the single row with the lowest PAD.
        df_dim = df_dim.loc[df_dim.groupby('Dominio 1')['PAD'].idxmin()]

        # assign() returns a new frame, so no write happens on a slice.
        frames.append(df_dim.assign(Dimensiones=dim))

    if not frames:
        # Nothing to merge: keep the historical return value.
        return None

    # One concat instead of repeated DataFrame.append (removed in pandas 2.0);
    # the column selection and the sort only need to run once, at the end.
    df = pd.concat(frames, ignore_index=True)
    df = df[['Dominio 1', 'Dimensiones', 'Dominio 2', 'PAD']]
    df = df.sort_values(['Dominio 1', 'Dimensiones']).reset_index(drop=True)

    return df

# Amazon

## 3000 Dimensiones

In [7]:
# Settings used to compute and save the results: dataset name and number of
# feature dimensions.
# NOTE(review): `datasets` is presumably provided by `from utils.paths import *` -- confirm.
dataset_name = datasets[0]
dims = 3000

In [8]:
# Echo the active settings; print(...) with a single argument behaves the
# same on Python 2 and Python 3.
print(dataset_name)
print(dims)
print(data_path)

amazon
3000
data


In [9]:
# Load the Amazon dataset stored at <data_path>/<dataset_name>.pkl
dataset_path = os.path.join(data_path, '%s.pkl' % dataset_name)
dataset_object = Dataset().load(dataset_path)

# Request the 80/20 split before reading the labeled data.
dataset_object.split_dataset(test_size=0.2)

# Per-domain labeled data and the list of domain names used below.
labeled, domains = dataset_object.labeled, dataset_object.domains

Dataset already splitted


In [10]:
# Pairwise and leave-one-domain-out PAD for the Amazon domains, using the
# first `dims` feature columns.
df_amazon_3000 = get_dataset_pad(labeled, domains, dims)

electronics - dvd -  1.97322103445
electronics - kitchen -  1.64480568562
electronics - books -  1.98397973908
dvd - kitchen -  1.97243572754
dvd - books -  1.96627106831
kitchen - books -  1.99041925573
('electronics', 'dvd', 'kitchen')  - books -  1.94727405091
('electronics', 'dvd', 'books')  - kitchen -  1.80810504745
('electronics', 'kitchen', 'books')  - dvd -  1.93450711821
('dvd', 'kitchen', 'books')  - electronics -  1.81231125971


In [11]:
# Display the PAD table (sorted by 'Dominio 1', then by PAD).
df_amazon_3000

Unnamed: 0,Dominio 1,Dominio 2,PAD
0,books,"(electronics, dvd, kitchen)",1.947274
1,books,dvd,1.966271
2,books,electronics,1.98398
3,books,kitchen,1.990419
4,dvd,"(electronics, kitchen, books)",1.934507
5,dvd,books,1.966271
6,dvd,kitchen,1.972436
7,dvd,electronics,1.973221
8,electronics,kitchen,1.644806
9,electronics,"(dvd, kitchen, books)",1.812311


In [12]:
# Save the table as <scores_path>/<dataset_name>/pad_<dims>.csv
pad_path = os.path.join(scores_path,dataset_name, "pad_%d.csv" % (dims))

# Create the destination folder when it does not exist yet (fresh checkout).
pad_dir = os.path.dirname(pad_path)
if pad_dir and not os.path.isdir(pad_dir):
    os.makedirs(pad_dir)

print("Guardando en %s" % pad_path)
# to_csv writes every column by default; `columns` expects column labels,
# not the DataFrame itself, so the argument is dropped.
df_amazon_3000.to_csv(pad_path)
print("Resultados guardados.")

Guardando en scores/amazon/pad_3000.csv
Resultados guardados.


## 1000 Dimensiones

In [13]:
# Second Amazon run: keep only the first 1000 feature columns.
dims = 1000
dataset_name = datasets[0]

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(dataset_name)
print(dims)

amazon
1000


In [14]:
# Load the Amazon dataset stored at <data_path>/<dataset_name>.pkl
dataset_path = os.path.join(data_path, '%s.pkl' % dataset_name)
dataset_object = Dataset().load(dataset_path)

# Request the 80/20 split before reading the labeled data.
dataset_object.split_dataset(test_size=0.2)

# Per-domain labeled data and the list of domain names used below.
labeled, domains = dataset_object.labeled, dataset_object.domains

Dataset already splitted


In [15]:
# Pairwise and leave-one-domain-out PAD for the Amazon domains, using the
# first `dims` feature columns.
df_amazon_1000 = get_dataset_pad(labeled, domains, dims)

electronics - dvd -  1.86834329691
electronics - kitchen - 



 1.42005084862
electronics - books -  1.96772388609
dvd - kitchen -  1.88797596961
dvd - books -  1.83041297327
kitchen - books -  1.9608131853
('electronics', 'dvd', 'kitchen')  - books -  1.84743043572
('electronics', 'dvd', 'books')  - kitchen - 



 1.63993744607
('electronics', 'kitchen', 'books')  - dvd -  1.75575657895
('dvd', 'kitchen', 'books')  - electronics -  1.70706158326


In [16]:
# Display the PAD table (sorted by 'Dominio 1', then by PAD).
df_amazon_1000

Unnamed: 0,Dominio 1,Dominio 2,PAD
0,books,dvd,1.830413
1,books,"(electronics, dvd, kitchen)",1.84743
2,books,kitchen,1.960813
3,books,electronics,1.967724
4,dvd,"(electronics, kitchen, books)",1.755757
5,dvd,books,1.830413
6,dvd,electronics,1.868343
7,dvd,kitchen,1.887976
8,electronics,kitchen,1.420051
9,electronics,"(dvd, kitchen, books)",1.707062


In [17]:
# Save the table as <scores_path>/<dataset_name>/pad_<dims>.csv
pad_path = os.path.join(scores_path,dataset_name, "pad_%d.csv" % (dims))

# Create the destination folder when it does not exist yet (fresh checkout).
pad_dir = os.path.dirname(pad_path)
if pad_dir and not os.path.isdir(pad_dir):
    os.makedirs(pad_dir)

print("Guardando en %s" % pad_path)
# to_csv writes every column by default; `columns` expects column labels,
# not the DataFrame itself, so the argument is dropped.
df_amazon_1000.to_csv(pad_path)
print("Resultados guardados.")

Guardando en scores/amazon/pad_1000.csv
Resultados guardados.


## Todas las dimensiones

In [18]:
# Merge the per-dimension PAD files saved for Amazon, keeping for every
# domain the row with the lowest PAD. Requires pad_3000.csv and pad_1000.csv
# to exist already under <scores_path>/<dataset_name>/.
dataset_name = datasets[0]
dimensions = [3000, 1000]
df_amazon_multiple = get_pad_multiple_dimensions(dimensions, dataset_name)

scores/amazon/pad_3000.csv
scores/amazon/pad_1000.csv


In [19]:
# Display the merged table (sorted by 'Dominio 1', then by 'Dimensiones').
df_amazon_multiple

Unnamed: 0,Dominio 1,Dimensiones,Dominio 2,PAD
0,books,1000,dvd,1.830413
1,books,3000,"('electronics', 'dvd', 'kitchen')",1.947274
2,dvd,1000,"('electronics', 'kitchen', 'books')",1.755757
3,dvd,3000,"('electronics', 'kitchen', 'books')",1.934507
4,electronics,1000,kitchen,1.420051
5,electronics,3000,kitchen,1.644806
6,kitchen,1000,electronics,1.420051
7,kitchen,3000,electronics,1.644806


In [20]:
# Save the merged table as <scores_path>/<dataset_name>/pad_multiple.csv
pad_path = os.path.join(scores_path,dataset_name, "pad_multiple.csv")

# Create the destination folder when it does not exist yet (fresh checkout).
pad_dir = os.path.dirname(pad_path)
if pad_dir and not os.path.isdir(pad_dir):
    os.makedirs(pad_dir)

print("Guardando en %s" % pad_path)
# to_csv writes every column by default; `columns` expects column labels,
# not the DataFrame itself, so the argument is dropped.
df_amazon_multiple.to_csv(pad_path)
print("Resultados guardados.")

Guardando en scores/amazon/pad_multiple.csv
Resultados guardados.


# Twitter
## 2000 Dimensiones

In [22]:
# Settings used to compute and save the results: dataset name and number of
# feature dimensions.
# NOTE(review): `datasets` is presumably provided by `from utils.paths import *` -- confirm.
dataset_name = datasets[1]
dims = 2000

In [23]:
# Echo the active settings; print(...) with a single argument behaves the
# same on Python 2 and Python 3.
print(dataset_name)
print(dims)
print(data_path)

twitter
2000
data


In [24]:
# Load the Twitter dataset stored at <data_path>/<dataset_name>.pkl
dataset_path = os.path.join(data_path, '%s.pkl' % dataset_name)
dataset_object = Dataset().load(dataset_path)

# Request the 80/20 split before reading the labeled data.
dataset_object.split_dataset(test_size=0.2)

# Per-domain labeled data and the list of domain names used below.
labeled, domains = dataset_object.labeled, dataset_object.domains

Dataset already splitted


In [25]:
# Pairwise and leave-one-domain-out PAD for the Twitter domains, using the
# first `dims` feature columns.
df_twitter_2000 = get_dataset_pad(labeled, domains, dims)

rio2016 - thevoice -  1.71820541342
rio2016 - general -  1.71254286839
thevoice - general -  1.60388480445
('rio2016', 'thevoice')  - general -  1.74924276688
('rio2016', 'general')  - thevoice -  1.69884060703
('thevoice', 'general')  - rio2016 -  1.72589763178


In [26]:
# Save the table as <scores_path>/<dataset_name>/pad_<dims>.csv
pad_path = os.path.join(scores_path,dataset_name, "pad_%d.csv" % (dims))

# Create the destination folder when it does not exist yet (fresh checkout).
pad_dir = os.path.dirname(pad_path)
if pad_dir and not os.path.isdir(pad_dir):
    os.makedirs(pad_dir)

print("Guardando en %s" % pad_path)
# to_csv writes every column by default; `columns` expects column labels,
# not the DataFrame itself, so the argument is dropped.
df_twitter_2000.to_csv(pad_path)
print("Resultados guardados.")

Guardando en scores/twitter/pad_2000.csv
Resultados guardados.


## 1000 Dimensiones

In [27]:
# Settings used to compute and save the results: dataset name and number of
# feature dimensions.
# NOTE(review): `datasets` is presumably provided by `from utils.paths import *` -- confirm.
dataset_name = datasets[1]
dims = 1000

In [28]:
# Echo the active settings; print(...) with a single argument behaves the
# same on Python 2 and Python 3.
print(dataset_name)
print(dims)
print(data_path)

twitter
1000
data


In [29]:
# Load the Twitter dataset stored at <data_path>/<dataset_name>.pkl
dataset_path = os.path.join(data_path, '%s.pkl' % dataset_name)
dataset_object = Dataset().load(dataset_path)

# Request the 80/20 split before reading the labeled data.
dataset_object.split_dataset(test_size=0.2)

# Per-domain labeled data and the list of domain names used below.
labeled, domains = dataset_object.labeled, dataset_object.domains

Dataset already splitted


In [30]:
# Pairwise and leave-one-domain-out PAD for the Twitter domains, using the
# first `dims` feature columns.
df_twitter_1000 = get_dataset_pad(labeled, domains, dims)

rio2016 - thevoice -  1.67692497837
rio2016 - general -  1.56389497948
thevoice - general -  1.37758298169
('rio2016', 'thevoice')  - general -  1.58536518509
('rio2016', 'general')  - thevoice -  1.46757470073
('thevoice', 'general')  - rio2016 -  1.74854087089


In [31]:
# Save the table as <scores_path>/<dataset_name>/pad_<dims>.csv
pad_path = os.path.join(scores_path,dataset_name, "pad_%d.csv" % (dims))

# Create the destination folder when it does not exist yet (fresh checkout).
pad_dir = os.path.dirname(pad_path)
if pad_dir and not os.path.isdir(pad_dir):
    os.makedirs(pad_dir)

print("Guardando en %s" % pad_path)
# to_csv writes every column by default; `columns` expects column labels,
# not the DataFrame itself, so the argument is dropped.
df_twitter_1000.to_csv(pad_path)
print("Resultados guardados.")

Guardando en scores/twitter/pad_1000.csv
Resultados guardados.


## Todas las dimensiones

In [32]:
# Merge the per-dimension PAD files saved for Twitter, keeping for every
# domain the row with the lowest PAD. Requires pad_2000.csv and pad_1000.csv
# to exist already under <scores_path>/<dataset_name>/.
dataset_name = datasets[1]
dimensions = [2000, 1000]
df_twitter_multiple = get_pad_multiple_dimensions(dimensions, dataset_name)

scores/twitter/pad_2000.csv
scores/twitter/pad_1000.csv


In [33]:
# Display the merged table (sorted by 'Dominio 1', then by 'Dimensiones').
df_twitter_multiple

Unnamed: 0,Dominio 1,Dimensiones,Dominio 2,PAD
0,general,1000,thevoice,1.377583
1,general,2000,thevoice,1.603885
2,rio2016,1000,general,1.563895
3,rio2016,2000,general,1.712543
4,thevoice,1000,general,1.377583
5,thevoice,2000,general,1.603885


In [34]:
# Save the merged table as <scores_path>/<dataset_name>/pad_multiple.csv
pad_path = os.path.join(scores_path,dataset_name, "pad_multiple.csv")

# Create the destination folder when it does not exist yet (fresh checkout).
pad_dir = os.path.dirname(pad_path)
if pad_dir and not os.path.isdir(pad_dir):
    os.makedirs(pad_dir)

print("Guardando en %s" % pad_path)
# to_csv writes every column by default; `columns` expects column labels,
# not the DataFrame itself, so the argument is dropped.
df_twitter_multiple.to_csv(pad_path)
print("Resultados guardados.")

Guardando en scores/twitter/pad_multiple.csv
Resultados guardados.
