In [1]:
import os
import pandas as pd
import numpy as np
import re
import unicodedata

from stop_words import get_stop_words

# Carga de datos

In [2]:
# Load the review corpus; the path is relative to the notebook's working directory.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which usually
# means the CSV was saved together with its index - confirm whether it should be read
# with index_col=0 instead.
df = pd.read_csv('../data/corpus.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,sentiment,category
0,A36BI9SPO3LTC8,B0023ZQDEC,Gerbera Daisy,"[0, 0]",Not a good fit for me. Much too big. I didn't ...,3,Not good for me,1360972800,"02 16, 2013",0,clothing
1,AL1WTXQJCKT8X,B00HWF7KPY,"Granny ""Joyous-soul""","[0, 0]",good fit- not to thin- love the less thicker ...,5,Surprised,1403308800,"06 21, 2014",0,clothing
2,A2QOT4L1QFYYIW,B003VFGW76,lu shenglan,"[0, 0]",Quality is very goodMy husband liked itThe nex...,5,Quality is very good,1387152000,"12 16, 2013",0,clothing
3,A2KQUKZ4X13BI0,B001CGW432,musica_al,"[2, 2]",I like wearing bungee style slip on shoes. Th...,5,Just what I wanted,1305331200,"05 14, 2011",0,clothing
4,ATJOVP7P6JRPU,B005OBWI3W,T.W. Connell,"[0, 0]",I thought the bracelet would be packaged in a ...,3,Nice product poor packaging,1355356800,"12 13, 2012",0,clothing


# Eliminamos valores nulos

In [4]:
# Keep only the rows whose 'reviewText' and 'sentiment' values are both present.
df = df.dropna(subset=['reviewText', 'sentiment'])

# Función de preprocesado de texto

Implementamos una función de preprocesado de texto que nos permitirá estandarizar el formato de las reviews a la entrada y reducir la cardinalidad del vocabulario. Se realizan los siguientes pasos:
- Eliminar tildes
- Eliminar símbolos
- Eliminar toda palabra que contenga algún carácter que no sea una letra
- Eliminar stopwords
- Lemmatization en base a diccionario de token-lemma

In [5]:
def sentence_normalization(sentence):
    """Lower-case a sentence, strip accents and keep only purely alphabetic tokens.

    The text is decomposed with NFKD, lower-cased and re-encoded as ASCII,
    which discards combining accent marks and every other non-ASCII character.
    The result is split on whitespace, and any token for which
    ``str.isalpha()`` is false (numbers, or words with punctuation attached
    such as ``"big."``) is dropped as a whole - not just its offending
    characters.

    Args:
        sentence: Text to normalise (``str``).

    Returns:
        The surviving tokens joined by single spaces, or ``''`` when no token
        survives.
    """
    decomposed = unicodedata.normalize('NFKD', sentence)
    ascii_bytes = decomposed.lower().encode('ascii', errors='ignore')
    ascii_text = ascii_bytes.decode('utf-8')
    alphabetic_tokens = [token for token in ascii_text.split() if token.isalpha()]
    return ' '.join(alphabetic_tokens)

In [6]:
def remove_stopwords(sentence, sw_list):
    """Remove the tokens of ``sentence`` that appear in ``sw_list``.

    The sentence is split on whitespace and each token is compared to the
    stop words exactly as written (the comparison is case-sensitive).

    Args:
        sentence: Text to filter (``str``).
        sw_list: Container of stop words; membership is tested with ``in``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept_tokens = []
    for token in sentence.split():
        if token in sw_list:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [7]:
def get_lemmas_dict(data_path, lemmas_dict_file):
    """Load a word-form -> lemma lookup table from a two-column text file.

    Every non-blank line must contain exactly two whitespace-separated
    fields: the lemma first and the inflected form second. The returned
    dictionary is keyed by the inflected form so that tokens can be looked up
    directly. When a form appears on several lines, the last line wins.

    Args:
        data_path: Directory that contains the file.
        lemmas_dict_file: Name of the file inside ``data_path``.

    Returns:
        ``dict`` mapping each inflected form to its lemma.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line does not have exactly two fields.
    """
    lemmas_dict = {}
    file_path = os.path.join(data_path, lemmas_dict_file)
    # An explicit encoding keeps the result independent of the platform
    # default. 'utf-8-sig' reads plain UTF-8 and additionally strips a leading
    # byte-order mark, which would otherwise end up glued to the first lemma.
    # NOTE(review): assumes the lemma file is UTF-8 encoded - confirm.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line_number, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            if len(fields) != 2:
                raise ValueError(
                    'Expected 2 fields on line {} of {}, found {}'.format(
                        line_number, file_path, len(fields)))
            lemma, form = fields
            lemmas_dict[form] = lemma
    return lemmas_dict

In [8]:
def lemmatize(sentence, lemmas_dict):
    """Replace each token of ``sentence`` by its lemma when one is known.

    Args:
        sentence: Text to lemmatise (``str``); it is split on whitespace.
        lemmas_dict: Mapping from token to lemma. Tokens missing from the
            mapping are kept unchanged.

    Returns:
        The lemmatised tokens joined by single spaces.
    """
    lemmatized_tokens = []
    for token in sentence.split():
        lemmatized_tokens.append(lemmas_dict.get(token, token))
    return ' '.join(lemmatized_tokens)

In [9]:
def process_reviews(reviews, sw_list, lemmas_dict):
    """Run the preprocessing pipeline over a collection of reviews.

    Each review goes through ``sentence_normalization``, ``remove_stopwords``
    and ``lemmatize``, in that order. The function works only on its
    arguments: it iterates over ``reviews`` and lemmatises with
    ``lemmas_dict`` rather than reading notebook-level variables, so it gives
    the same result regardless of which globals happen to be defined.

    Args:
        reviews: Iterable of review texts (for example a pandas Series).
        sw_list: Container of stop words passed to ``remove_stopwords``.
        lemmas_dict: Token -> lemma mapping passed to ``lemmatize``.

    Returns:
        A list with one processed string per input review, in input order.
        Missing reviews (``NaN``/``None``) produce the placeholder string
        ``'None'``.
    """
    processed_sentences = []
    for sent in reviews:
        if pd.isna(sent):
            # Keep one output per input so the result stays aligned with the
            # rows it came from.
            processed_sentences.append('None')
            continue
        sent = sentence_normalization(sent)
        sent = remove_stopwords(sent, sw_list)
        sent = lemmatize(sent, lemmas_dict)
        processed_sentences.append(sent)
    return processed_sentences

In [10]:
# Directory and file name of the lemma lookup file that get_lemmas_dict reads.
lemmas_path = '../data'
english_lemmas = 'lemmatization-en.txt'

In [11]:
# Build the token -> lemma lookup and fetch the stop-word list for 'en'.
english_lemmas_dict = get_lemmas_dict(lemmas_path, english_lemmas)
sw_list = get_stop_words('en')

In [12]:
# Run the preprocessing pipeline over every review text in the frame.
processed_reviews = process_reviews(df['reviewText'], sw_list, english_lemmas_dict)

Podemos ver el efecto del preprocesado:

In [13]:
# Show the first review before and after preprocessing.
print('Review original: {}'.format(df['reviewText'].values[0]))
print('Review procesada: {}'.format(processed_reviews[0]))

Review original: Not a good fit for me. Much too big. I didn't have enough bust to fit into it. Back it went. It might work for someone -- that someone is not me.
Review procesada: good fit much enough bust fit back may work someone someone


In [14]:
# Attach the processed text as a new column; this relies on processed_reviews
# holding exactly one entry per row of df, in row order.
df.loc[:, 'processedReview'] = processed_reviews

In [15]:
# Reviews that became an empty string after preprocessing are marked as missing
# and their rows are dropped.
df['processedReview'] = df['processedReview'].replace('', np.nan)
df = df.dropna(subset=['processedReview'])

In [16]:
# NOTE(review): this writes to the same path the corpus was read from at the top of
# the notebook, so the original file is overwritten and a re-run starts from the
# already-filtered data. Consider writing to a separate output file.
df.to_csv('../data/corpus.csv', index=False)