In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import confusion_matrix, classification_report

In [2]:
df = pd.read_csv('../../datasets/Womens_Clothing_E-Commerce_Reviews.csv', keep_default_na=False)
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


# Preprocesamiento básico

* Mezclas el texto de la reseña (title, text description) en un solo atributo
* Converit el sistema de ranking de 5 estrellas en un valor de recomendacióin binarion (0 y 1)

In [3]:
df['Review'] = (df['Title'].map(str) +' '+ df['Review Text']).apply(lambda row: row.strip())
df['Rating'] = [1 if rating > 3 else 0 for rating in df['Rating']]
df = df[['Review', 'Rating']]
df.head()

Unnamed: 0,Review,Rating
0,Absolutely wonderful - silky and sexy and comf...,1
1,Love this dress! it's sooo pretty. i happene...,1
2,Some major design flaws I had such high hopes ...,0
3,"My favorite buy! I love, love, love this jumps...",1
4,Flattering shirt This shirt is very flattering...,1


Eliminar todos los registros que no tienen reseñas

In [4]:
df = df[df['Review'] != '']
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22642 entries, 0 to 23485
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  22642 non-null  object
 1   Rating  22642 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 530.7+ KB


Existe un desbalance en los datos de acuerdo al rating

In [5]:
df['Rating'].value_counts()

Rating
1    17449
0     5193
Name: count, dtype: int64

Separación de los datos en train y test

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[['Review']], df['Rating'], test_size=.3, random_state=42)
X_train.shape, X_test.shape

((15849, 1), (6793, 1))

In [7]:
from collections import Counter
Counter(y_train), Counter(y_test)

(Counter({1: 12172, 0: 3677}), Counter({1: 5277, 0: 1516}))

# Experimento 1. NLP basado en características de conteo

Se pueden crear un número de características básicas basadas en texto. En algunas ocasiones pueden ayudar a mejorar los modelos de clasificación. Por ejemplo:

* Conteo de palabras: el total de palabras en un documento
* Conteo de caracteres: el número total de caracteres en el documento
* Densidad promedio de palabra: el promedio del tamaño de palabra utilizado en el documento
* Conteo de puntuación: el número total de signos de puntuación en los documentos
* Conteo de mayúsculas: el número total de palabras en mayúsculas en el documento
* Conteo de palabras de título: el número de palabras propias (títulos) en el documento.

In [8]:
import string

X_train['char_count'] = X_train['Review'].apply(len)
X_train['word_count'] = X_train['Review'].apply(lambda x: len(x.split()))
X_train['word_density'] = X_train['char_count'] / (X_train['word_count']+1)
X_train['punctuation_count'] = X_train['Review'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
X_train['title_word_count'] = X_train['Review'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
X_train['upper_case_word_count'] = X_train['Review'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))


X_test['char_count'] = X_test['Review'].apply(len)
X_test['word_count'] = X_test['Review'].apply(lambda x: len(x.split()))
X_test['word_density'] = X_test['char_count'] / (X_test['word_count']+1)
X_test['punctuation_count'] = X_test['Review'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
X_test['title_word_count'] = X_test['Review'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
X_test['upper_case_word_count'] = X_test['Review'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [9]:
X_train.head()

Unnamed: 0,Review,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count
4654,Sexy and flows I love this jumpsuit! i'm usual...,469,95,4.885417,17,2,1
5333,Wanted to love it The dresss is much shorter t...,144,26,5.333333,4,2,0
22502,So cute! though inside not soft I got the crea...,451,93,4.797872,14,2,1
392,Lovely fabric but tiny hips If your hips are b...,396,83,4.714286,6,2,0
372,"So cool This has great drape, length, the patt...",292,56,5.122807,10,2,0


In [10]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier( n_neighbors= 3)
knn.fit(X_train.drop(['Review'], axis=1), y_train)

In [11]:
knn.score(X_train.drop(['Review'], axis=1), y_train)

0.8237743706227523

In [12]:
knn.score(X_test.drop(['Review'], axis=1), y_test)

0.7007213307816871

In [13]:
predictions = knn.predict(X_test.drop(['Review'], axis=1))

In [14]:
print(classification_report(y_test, predictions))
pd.DataFrame(confusion_matrix(y_test, predictions))

              precision    recall  f1-score   support

           0       0.23      0.15      0.18      1516
           1       0.78      0.86      0.82      5277

    accuracy                           0.70      6793
   macro avg       0.50      0.50      0.50      6793
weighted avg       0.66      0.70      0.67      6793



Unnamed: 0,0,1
0,222,1294
1,739,4538


In [15]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=1, random_state=42, solver='liblinear')

In [16]:
lr.fit(X_train.drop(['Review'], axis=1), y_train)
predictions = lr.predict(X_test.drop(['Review'], axis=1))

print(classification_report(y_test, predictions))
pd.DataFrame(confusion_matrix(y_test, predictions))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1516
           1       0.78      1.00      0.87      5277

    accuracy                           0.78      6793
   macro avg       0.39      0.50      0.44      6793
weighted avg       0.60      0.78      0.68      6793



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,0,1
0,0,1516
1,0,5277


In [21]:
! pip install -U textblob
! python -m textblob.download_corpora

Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m636.8/636.8 kB[0m [31m114.0 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: textblob
Successfully installed textblob-0.17.1
[nltk_data] Downloading package brown to /Users/cmillan/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /Users/cmillan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/cmillan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/cmillan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     /Users/cmillan/nltk_data...
[nltk_data]   Packag

In [22]:
import textblob

textblob.TextBlob('This is an AMAZING pair of Jeans!').sentiment

Sentiment(polarity=0.7500000000000001, subjectivity=0.9)

In [23]:
textblob.TextBlob('I really hated this UGLY T-shirt!!').sentiment

Sentiment(polarity=-0.95, subjectivity=0.85)

# Segundo experimento: Analisis de Sentimientos



In [24]:
x_train_snt_obj = X_train['Review'].apply(lambda row: textblob.TextBlob(row).sentiment)
X_train['Polarity'] = [obj.polarity for obj in x_train_snt_obj.values]
X_train['Subjectivity'] = [obj.subjectivity for obj in x_train_snt_obj.values]

x_test_snt_obj = X_test['Review'].apply(lambda row: textblob.TextBlob(row).sentiment)
X_test['Polarity'] = [obj.polarity for obj in x_test_snt_obj.values]
X_test['Subjectivity'] = [obj.subjectivity for obj in x_test_snt_obj.values]

In [25]:
X_train.head()

Unnamed: 0,Review,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Polarity,Subjectivity
4654,Sexy and flows I love this jumpsuit! i'm usual...,469,95,4.885417,17,2,1,0.23738,0.641259
5333,Wanted to love it The dresss is much shorter t...,144,26,5.333333,4,2,0,0.26,0.4
22502,So cute! though inside not soft I got the crea...,451,93,4.797872,14,2,1,0.180093,0.562963
392,Lovely fabric but tiny hips If your hips are b...,396,83,4.714286,6,2,0,0.114286,0.61044
372,"So cool This has great drape, length, the patt...",292,56,5.122807,10,2,0,0.321667,0.721111


In [26]:
knn = KNeighborsClassifier( n_neighbors= 3)
knn.fit(X_train.drop(['Review'], axis=1), y_train)

In [27]:
predictions = knn.predict(X_test.drop(['Review'], axis=1))

print(classification_report(y_test, predictions))
pd.DataFrame(confusion_matrix(y_test, predictions))

              precision    recall  f1-score   support

           0       0.25      0.15      0.19      1516
           1       0.78      0.87      0.82      5277

    accuracy                           0.71      6793
   macro avg       0.51      0.51      0.51      6793
weighted avg       0.66      0.71      0.68      6793



Unnamed: 0,0,1
0,231,1285
1,697,4580


In [28]:
lr.fit(X_train.drop(['Review'], axis=1), y_train)
predictions = lr.predict(X_test.drop(['Review'], axis=1))

print(classification_report(y_test, predictions))
pd.DataFrame(confusion_matrix(y_test, predictions))

              precision    recall  f1-score   support

           0       0.67      0.27      0.39      1516
           1       0.82      0.96      0.89      5277

    accuracy                           0.81      6793
   macro avg       0.75      0.62      0.64      6793
weighted avg       0.79      0.81      0.77      6793



Unnamed: 0,0,1
0,411,1105
1,201,5076


In [33]:
import nltk
import contractions
import re

# remove some stopwords to capture negation in n-grams if possible
stop_words = nltk.corpus.stopwords.words('english')
stop_words.remove('no')
stop_words.remove('not')
stop_words.remove('but')

# load up a simple porter stemmer - nothing fancy
ps = nltk.porter.PorterStemmer()

def simple_text_preprocessor(document): 
    # lower case
    document = str(document).lower()
    
    # expand contractions
    document = contractions.fix(document)
    
    # remove unnecessary characters
    document = re.sub(r'[^a-zA-Z]',r' ', document)
    document = re.sub(r'nbsp', r'', document)
    document = re.sub(' +', ' ', document)
    
    # simple porter stemming
    document = ' '.join([ps.stem(word) for word in document.split()])
    
    # stopwords removal
    document = ' '.join([word for word in document.split() if word not in stop_words])
    
    return document
   
stp = np.vectorize(simple_text_preprocessor)

In [34]:
X_train['Clean Review'] = stp(X_train['Review'].values)
X_test['Clean Review'] = stp(X_test['Review'].values)

X_train.head()

Unnamed: 0,Review,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Polarity,Subjectivity,Clean Review
4654,Sexy and flows I love this jumpsuit! i'm usual...,469,95,4.885417,17,2,1,0.23738,0.641259,sexi flow love thi jumpsuit usual size small h...
5333,Wanted to love it The dresss is much shorter t...,144,26,5.333333,4,2,0,0.26,0.4,want love dresss much shorter describ veri see...
22502,So cute! though inside not soft I got the crea...,451,93,4.797872,14,2,1,0.180093,0.562963,cute though insid not soft got cream color abs...
392,Lovely fabric but tiny hips If your hips are b...,396,83,4.714286,6,2,0,0.114286,0.61044,love fabric but tini hip hip bigger size us ca...
372,"So cool This has great drape, length, the patt...",292,56,5.122807,10,2,0,0.321667,0.721111,cool thi ha great drape length pattern super v...


)