In [3]:
import io
import pickle5 as pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
import numpy as np

In [4]:
# Root directory of the Amazon review data. The trailing '/' matters: later
# cells build file paths by plain string concatenation onto this value.
dir_path = '/Volumes/GoogleDrive/내 드라이브/data/amazon/'

In [5]:
# Product categories of the review data; each one has its own sub-directory.
domains = [
    'books',
    'dvd',
    'electronics',
    'kitchen',
]
# Review files available per domain. The last entry is handled separately
# from the others when the splits are built.
kinds = [
    'negative',
    'positive',
    'unlabeled',
]
# One-hot label rows: code[0] is used for 'negative', code[1] for anything else.
code = np.eye(2)

In [8]:
def preprocess_words(words: list) -> str:
    """Expand ``token:count`` features into a space-separated document.

    Each entry of ``words`` is a string of the form ``"<token>:<count>"``.
    The token is repeated ``count`` times, every occurrence followed by a
    single space, so ``['a:2', 'b:1']`` becomes ``'a a b '``.

    Args:
        words: list of ``"<token>:<count>"`` strings.

    Returns:
        The expanded document; an empty string when ``words`` is empty or
        all counts are zero.

    Raises:
        ValueError: if an entry contains no ':' or its count is not an
            integer literal.
    """
    pieces = []
    for entry in words:
        # Split on the LAST colon only, so a token that itself contains ':'
        # keeps its full text and the real count is still parsed.
        token, count = entry.rsplit(':', 1)
        pieces.append((token + ' ') * int(count))
    # Single join instead of repeated string concatenation inside the loop.
    return ''.join(pieces)

In [9]:
def file_to_corpus(file: io.TextIOWrapper) -> (list, list):
    """Turn an open review file into parallel lists of documents and labels.

    Every non-blank line is split on whitespace. All fields but the last are
    ``token:count`` features, expanded into a document by
    ``preprocess_words``. The last field carries the label after its first
    ':'.

    Args:
        file: an open text file (or any iterable of lines), one review per line.

    Returns:
        ``(corpus, labels)``: ``corpus`` is a list of document strings and
        ``labels`` a list of one-hot rows taken from the module-level
        ``code`` matrix (``code[0]`` for 'negative', ``code[1]`` for any
        other label value).
    """
    corpus = []
    labels = []
    for review in file:
        contents = review.split()
        if not contents:
            # Blank line (e.g. a trailing newline at end of file): there is
            # no label field to read, so skip it instead of crashing.
            continue
        *words, label_field = contents
        label = label_field.split(':')[1]
        corpus.append(preprocess_words(words))
        labels.append(code[0] if label == 'negative' else code[1])
    return corpus, labels

In [10]:
# Parse the raw review files of every domain and cache the result as
# <domain>/<domain>.pkl holding
# [train corpus, train labels, test corpus, test labels].
for domain in domains:
    corpus_train, labels_train = [], []
    corpus_test, labels_test = [], []

    # Training split: every kind except the last one (the labeled files).
    for kind in kinds[:-1]:
        file_path = dir_path + '%s/%s.review' % (domain, kind)
        with open(file_path, 'r', encoding='UTF8') as r:
            corpus, labels = file_to_corpus(r)
        corpus_train += corpus
        labels_train += labels

    # Test split: the last kind (the 'unlabeled' file).
    file_path = dir_path + '%s/%s.review' % (domain, kinds[-1])
    with open(file_path, 'r', encoding='UTF8') as r:
        corpus, labels = file_to_corpus(r)
    corpus_test += corpus
    labels_test += labels

    save_path = dir_path + '%s/%s.pkl' % (domain, domain)
    with open(save_path, 'wb') as w:
        pickle.dump([corpus_train, labels_train, corpus_test, labels_test], w)

In [11]:
# Reload the cached per-domain splits. Index k of each list below belongs to
# domains[k].
corpus_trains, labels_trains = [], []
corpus_tests, labels_tests = [], []
for domain in domains:
    save_path = dir_path + '%s/%s.pkl' % (domain, domain)
    with open(save_path, 'rb') as r:
        loaded = pickle.load(r)
    # Same order in which the four lists were dumped.
    corpus_train, labels_train, corpus_test, labels_test = loaded
    corpus_trains += [corpus_train]
    corpus_tests += [corpus_test]
    labels_trains += [labels_train]
    labels_tests += [labels_test]

In [29]:
# Build one bag-of-words bundle per ordered (source, target) domain pair and
# save it as <source>_to_<target>.pkl.
for i, source in enumerate(domains):
    for j, target in enumerate(domains):
        if i == j:
            continue  # no bundle for a domain paired with itself
        # The vocabulary is shared: it is learned from the training reviews
        # of both the source and the target domain.
        corpus = []
        corpus.extend(corpus_trains[i])
        corpus.extend(corpus_trains[j])
        # NOTE: despite its name this is a raw term-count vectorizer, not
        # TF-IDF. It keeps at most 5000 features.
        tf_idf = CountVectorizer(max_features=5000)
        # Only the fitted vocabulary is needed at this point; the matrices
        # themselves come from the transform() calls below.
        tf_idf.fit(corpus)
        source_train = tf_idf.transform(corpus_trains[i])
        source_train_y = pd.DataFrame(labels_trains[i])
        target_train = tf_idf.transform(corpus_trains[j])
        target_test = tf_idf.transform(corpus_tests[j])
        target_test_y = pd.DataFrame(labels_tests[j])
        # dir_path already ends with '/', so this yields a '//' in the path.
        save_path = dir_path + '/%s_to_%s.pkl' % (source, target)
        print(save_path)
        # Layout: [(source X, source y), target train X, (target test X, target test y)]
        with open(save_path, 'wb') as w:
            pickle.dump([(source_train, source_train_y),
                         target_train,
                         (target_test, target_test_y)], w)

/Volumes/GoogleDrive/내 드라이브/data/amazon//books_to_dvd.pkl
/Volumes/GoogleDrive/내 드라이브/data/amazon//books_to_electronics.pkl
/Volumes/GoogleDrive/내 드라이브/data/amazon//books_to_kitchen.pkl
/Volumes/GoogleDrive/내 드라이브/data/amazon//dvd_to_books.pkl
/Volumes/GoogleDrive/내 드라이브/data/amazon//dvd_to_electronics.pkl
/Volumes/GoogleDrive/내 드라이브/data/amazon//dvd_to_kitchen.pkl
/Volumes/GoogleDrive/내 드라이브/data/amazon//electronics_to_books.pkl
/Volumes/GoogleDrive/내 드라이브/data/amazon//electronics_to_dvd.pkl
/Volumes/GoogleDrive/내 드라이브/data/amazon//electronics_to_kitchen.pkl
/Volumes/GoogleDrive/내 드라이브/data/amazon//kitchen_to_books.pkl
/Volumes/GoogleDrive/내 드라이브/data/amazon//kitchen_to_dvd.pkl
/Volumes/GoogleDrive/내 드라이브/data/amazon//kitchen_to_electronics.pkl


In [32]:
# Spot-check one saved bundle: load it back and show the shape of the
# source-domain label frame (element [0][1] of the pickled list).
tmp_path = dir_path + '/books_to_dvd.pkl'
with open(tmp_path, mode='rb') as r:
    data = pickle.load(r)
source_labels = data[0][1]
print(source_labels.shape)

(2000, 2)
