In [4]:
import argparse
import os

from nltk.corpus import stopwords
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

from utils.DatasetStorage import Dataset
from utils.paths import *

In [5]:
def read_amazon_file(path, labeled=True):
    """Parse one review file into plain-text comments and their labels.

    Every non-blank line is split on the ``#label#:`` marker. The text before
    the marker is treated as space-separated ``token:count`` pairs: only the
    part before the first ``:`` of each pair is kept, underscores in it are
    replaced by spaces, and the tokens are joined back into one string that
    ends with `` .``. The text after the marker (minus the line break) is
    the label.

    Args:
        path: Path of the review file to read.
        labeled: When true, return both comments and labels; otherwise
            return only the comments. Labels are parsed in both cases, so
            the marker must be present on every non-blank line.

    Returns:
        ``(comentarios, labels)`` -- two parallel lists of strings -- when
        ``labeled`` is true, otherwise just ``comentarios``.

    Raises:
        IndexError: If a non-blank line does not contain ``#label#:``.
    """
    comentarios = []
    labels = []

    # The context manager guarantees the handle is closed, even when a
    # malformed line raises halfway through the file.
    with open(path) as review_file:
        for line in review_file:
            if not line.strip():
                # Tolerate blank lines (for instance an extra newline at EOF).
                continue

            parts = line.split("#label#:")
            # Strip only the line break: slicing off the last character
            # would truncate the label when the final line has no newline.
            labels.append(parts[1].rstrip("\n"))

            palabras = [
                par.split(":")[0].replace("_", " ")
                for par in parts[0].split(" ")
            ]
            # Each token is followed by one space and the comment is closed
            # with a period, which is the same as joining and adding " .".
            comentarios.append(" ".join(palabras) + " .")

    if labeled:
        return comentarios, labels
    return comentarios

def read_all_amazon_domains(path):
    """Read the labeled and unlabeled reviews of every domain folder in ``path``.

    Each sub-directory of ``path`` is treated as one domain and must contain
    ``positive.review``, ``negative.review`` and ``unlabeled.review``. The
    name of every domain is printed as it is read.

    Args:
        path: Directory whose sub-directories are the domains.

    Returns:
        A tuple ``(labeled, unlabeled, domains)`` where

        * ``labeled`` maps a domain name to ``{'X': comments, 'y': labels}``
          built from the positive file followed by the negative file,
        * ``unlabeled`` maps a domain name to ``{'X': comments}`` built from
          the unlabeled file,
        * ``domains`` lists the domain names in the order they were read
          (the order returned by ``os.listdir``).
    """
    labeled_files = ('positive.review', 'negative.review')
    unlabeled_file = 'unlabeled.review'

    domains = []
    labeled = {}
    unlabeled = {}

    # Single-argument print calls produce the same output under Python 2
    # and are also valid Python 3.
    print('Leyendo dominio: ')
    for folder in os.listdir(path):
        folder_path = os.path.join(path, folder)
        if not os.path.isdir(folder_path):
            # Stray files next to the domain folders (README, .DS_Store, ...)
            # are not domains; skipping them avoids failing on open().
            continue
        print("- %s" % folder)

        instances = []
        labels = []
        for file_name in labeled_files:
            new_instances, new_labels = read_amazon_file(
                os.path.join(folder_path, file_name))
            instances += new_instances
            labels += new_labels

        labeled[folder] = {
            'X': instances,
            'y': labels,
        }

        # Unlabeled data.
        unlabeled[folder] = {
            'X': read_amazon_file(
                os.path.join(folder_path, unlabeled_file), labeled=False),
        }

        domains.append(folder)

    return labeled, unlabeled, domains

In [6]:
def preprocesar(labeled, unlabeled, dims, stop_words=None):
    """Vectorize the reviews and labels of all domains with shared vocabularies.

    One ``CountVectorizer`` is fitted on the review texts of every domain
    (labeled and, when given, unlabeled) and a second, default one on all
    label strings. Every ``'X'`` entry is then replaced by its transformed
    texts and every ``'y'`` entry by its transformed labels.

    The input dictionaries are modified in place and also returned.

    Args:
        labeled: Mapping ``domain -> {'X': list of texts, 'y': list of
            label strings}``.
        unlabeled: Mapping ``domain -> {'X': list of texts}``, or ``None``
            when there is no unlabeled data.
        dims: Passed as ``max_features`` to the text vectorizer.
        stop_words: Passed as ``stop_words`` to the text vectorizer.

    Returns:
        The tuple ``(labeled, unlabeled)`` with their contents vectorized;
        ``unlabeled`` is returned as ``None`` when ``None`` was passed.
    """
    corpus = []
    all_labels = []

    for domain_data in labeled.values():
        corpus += domain_data['X']
        all_labels += domain_data['y']

    if unlabeled is not None:
        for domain_data in unlabeled.values():
            corpus += domain_data['X']

    # Binary unigram + bigram features over the joint vocabulary of all domains.
    x_cv = CountVectorizer(max_features=dims, ngram_range=(1, 2), binary=True, stop_words=stop_words)
    x_cv.fit(corpus)

    # Labels go through their own vectorizer, so 'y' ends up as whatever
    # CountVectorizer.transform returns rather than a flat list of classes.
    y_cv = CountVectorizer()
    y_cv.fit(all_labels)

    for domain in labeled:
        labeled[domain]['X'] = x_cv.transform(labeled[domain]['X'])
        labeled[domain]['y'] = y_cv.transform(labeled[domain]['y'])

    if unlabeled is not None:
        for domain in unlabeled:
            # Transform the unlabeled texts themselves; reading from
            # `labeled` here would re-transform already vectorized data and
            # leave the unlabeled reviews unprocessed.
            unlabeled[domain]['X'] = x_cv.transform(unlabeled[domain]['X'])

    return labeled, unlabeled

In [7]:
# Locate the dataset root by joining raw_path with the 'amazon' entry of raw_folders.
# NOTE(review): raw_path and raw_folders are not defined in this notebook; presumably
# they come from the `from utils.paths import *` star-import -- confirm.
path = os.path.join(raw_path, raw_folders['amazon'])
# Bare expression so the notebook echoes the resolved path.
path

'raw_data/multi-domain/processed_acl'

In [8]:
# Read every domain found under `path`; the call prints each folder name as it is read.
labeled, unlabeled, domains = read_all_amazon_domains(path)

Leyendo dominio: 
- electronics
- dvd
- kitchen
- books


In [9]:
# Flatten the per-domain data into two notebook-level lists: every review text
# (labeled first, then unlabeled) and every label of the labeled reviews.
instances, labels = [], []
# NOTE(review): ant, labeled_lims and unlabeled_lims are initialised here but
# not used by any cell visible in this notebook.
ant = 0
labeled_lims, unlabeled_lims = {}, {}

for domain_data in labeled.values():
    instances.extend(domain_data['X'])
    labels.extend(domain_data['y'])

if unlabeled is not None:
    for domain_data in unlabeled.values():
        instances.extend(domain_data['X'])

In [10]:
# Binary unigram + bigram bag-of-words vectorizer capped at 200 features.
# CountVectorizer is already imported in the notebook's import cell, so it is
# not imported again here.
x_cv = CountVectorizer(max_features=200, ngram_range=(1, 2), binary=True, stop_words=None)


In [18]:
# Settings for the hashing-based alternative to the bag-of-words vectorizer.
# HashingVectorizer is imported with the other dependencies in the notebook's
# import cell rather than in the middle of the notebook.
dims = 200
stop_words = None
x2 = HashingVectorizer(stop_words=stop_words, n_features=dims, binary=True, ngram_range=(1,2))

In [20]:
# Vectorize every collected review text with the hashing vectorizer x2.
x3 = x2.fit_transform(instances)

In [21]:
# Bare expression so the notebook shows the summary of the resulting matrix.
x3

<27677x200 sparse matrix of type '<type 'numpy.float64'>'
	with 3824700 stored elements in Compressed Sparse Row format>

In [None]:
# Vectorize all domains, passing 20 as `dims` (the vectorizer's max_features).
# NOTE(review): the vectorized result is bound back onto `labeled`/`unlabeled`, and
# preprocesar also overwrites their 'X'/'y' entries, so this cell presumably cannot be
# re-run without first re-running the cell that loads the raw data -- confirm.
labeled, unlabeled = preprocesar(labeled, unlabeled, 20)