In [1]:
import os
import argparse

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import pandas as pd

from utils.DatasetStorage import Dataset
from utils.paths import *

In [2]:
def read_amazon_file(path, labeled=True):
    """read_amazon_file."""
    file = open(path)
    comentarios = []
    labels = []
    for line in file:
        line = line.split("#label#:")
        labels.append(line[1][:-1])

        pares = line[0].split(" ")
        comentario = ""
        for par in pares:
            palabra = par.split(":")[0]
            palabra = str.replace(palabra, "_", " ")
            comentario = comentario + palabra + " "
        comentario = comentario + "."
        comentarios.append(comentario)

    if labeled:
        return comentarios, labels
    else:
        return comentarios

def read_all_amazon_domains(path):
    """read_all_amazon_domains."""
    file_names = ['positive.review', 'negative.review', 'unlabeled.review']

    domains = []

    labeled = {}
    unlabeled = {}

    print 'Leyendo dominio: '
    for folder in os.listdir(path):
        print "- %s" % folder

        instances = []
        labels = []
        for file_name in file_names[0:2]:
            file_path = os.path.join(path, folder, file_name)
            new_instances, new_labels = read_amazon_file(file_path)
            instances += new_instances
            labels += new_labels

        labeled[folder] = {
            'X': instances,
            'y': labels,
        }

        # datos sin etiquetas
        file_path = os.path.join(path, folder, file_names[2])
        instances = read_amazon_file(file_path, labeled=False)

        unlabeled[folder] = {
            'X': instances,
        }

        domains.append(folder)

    return labeled, unlabeled, domains

In [46]:
def preprocesar(labeled, unlabeled, dims, stop_words=None):
    """preprocesar."""

    instances = []
    labels = []
    labeled_lims ={}
    unlabeled_lims = {}

    for v_l in labeled.values():
        instances += v_l['X']
        labels += v_l['y']

    if unlabeled is not None:
        for v_ul in unlabeled.values():
            instances += v_ul['X']



    x_cv = CountVectorizer(max_features=dims, ngram_range=(1, 2), binary=True, stop_words=stop_words)
    x_cv.fit(instances)

    y_cv = CountVectorizer()
    y_cv.fit(labels)


    for d_l in labeled:
        labeled[d_l]['X'] = x_cv.transform(labeled[d_l]['X'])
        labeled[d_l]['y'] = y_cv.transform(labeled[d_l]['y'])

    if unlabeled is not None:
        for d_ul in unlabeled:
            unlabeled[d_ul]['X'] = x_cv.transform(unlabeled[d_ul]['X'])

    return labeled, unlabeled

In [4]:
path = os.path.join(raw_path, raw_folders['amazon'])
path

'raw_data/multi-domain/processed_acl'

In [16]:
labeled, unlabeled, domains = read_all_amazon_domains(path)

Leyendo dominio: 
- electronics
- dvd
- kitchen
- books


In [47]:
dims = 2000
labeled, unlabeled = preprocesar(labeled, unlabeled, dims)

AttributeError: lower not found

In [26]:
instances = []
labels = []
ant = 0
labeled_lims ={}
unlabeled_lims = {}

for v_l in labeled.values():
    instances += v_l['X']
    labels += v_l['y']

if unlabeled is not None:
    for v_ul in unlabeled.values():
        instances += v_ul['X']

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

x_cv = CountVectorizer(max_features=200, ngram_range=(1, 2), binary=True, stop_words=None)
x_cv.fit(instances)

CountVectorizer(analyzer=u'word', binary=True, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=200, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [34]:
y_cv = CountVectorizer()
y_cv.fit(labels)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [36]:
for d_l in labeled:
    labeled[d_l]['X'] = x_cv.transform(labeled[d_l]['X'])
    labeled[d_l]['y'] = y_cv.transform(labeled[d_l]['y'])

In [44]:
if unlabeled is not None:
    for d_ul in unlabeled:
        unlabeled[d_ul]['X'] = x_cv.transform(unlabeled[d_ul]['X'])

In [None]:
for d_l in labeled:
        labeled[d_l]['X'] = x_cv.transform(labeled[d_l]['X'])
        labeled[d_l]['y'] = y_cv.transform(labeled[d_l]['y'])

if unlabeled is not None:
    for d_ul in unlabeled:
        unlabeled[d_ul]['X'] = x_cv.transform(unlabeled[d_ul]['X'])