# Librerías auxiliares y archivos

## Librerías

In [8]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import tensorflow as tf

## Google drive mount

In [9]:
# Mount Google Drive inside the Colab runtime so its contents are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Project folder on the mounted Drive. The notebook changes into this directory, so the
# relative paths used afterwards resolve against it.
# NOTE(review): hard-coded absolute path — it only works for this Drive layout.
RUTA = "/content/drive/MyDrive/Entrega-NLP-Bartolomé_Flores_Vicaria"

In [11]:
%cd "$RUTA"

/content/drive/MyDrive/Entrega-NLP-Bartolomé_Flores_Vicaria


In [12]:
# Remove the "cache" folder before building the Bag of Words (presumably so the helper
# functions recompute their results instead of reusing cached files — confirm in helper_functions).
!rm -rf "cache"

In [13]:
#Funciones auxiliares para procesamiento
from helper_functions import *

# 1. Obtención de datos: descarga y EDA (Etapa 1)

In [16]:
# Load the raw reviews; the file is semicolon-separated and uses "." as the decimal mark.
path_csv = "reviews_large.csv"
df = pd.read_csv(path_csv, sep=';', decimal='.')

In [17]:
# Shuffle all rows (frac=1) with a fixed seed so the resulting order is reproducible.
df = df.sample(frac=1,random_state=42)

In [18]:
df.head()

Unnamed: 0,sentiment,review
33553,4,Solid material but no clip.
9427,1,It tastes terrible
199,1,"I've had this wedge for maybe 2 months now, an..."
12447,2,fells good but mildews quickly even though I p...
39489,4,came as promised even earlier works amazing lo...


In [19]:
# Word counter used to measure the length of each review
def contar_palabras(texto):
  """Return the number of whitespace-separated words in ``texto``.

  Missing values (anything ``pd.isna`` reports as null, e.g. ``None`` or NaN)
  count as zero words.
  """
  return 0 if pd.isna(texto) else len(texto.split())

# Add the 'word_count' column: number of words in each review, computed with contar_palabras.
df['word_count'] =  df['review'].apply(contar_palabras)

In [20]:
# Keep only the reviews whose length is between 25 and 100 words (both bounds inclusive).
df = df[ (df['word_count'] >=25) & (df['word_count']<= 100)]

In [21]:
# Keep only the label and text columns; the auxiliary 'word_count' column is dropped here.
df =df[['sentiment','review']]

In [22]:
# Persist the filtered dataset using the same separator/decimal settings as the input,
# without writing the DataFrame index.
df.to_csv('reviews_large_processed.csv', sep=';', decimal='.', index=False)

In [23]:
df.head()

Unnamed: 0,sentiment,review
39489,4,came as promised even earlier works amazing lo...
10822,2,Was disappointed in the size. It is much too b...
4144,1,"Do not, I say DO NOT order. I miscalculated a..."
36958,4,"I love this little powder puff! No, the lid do..."
38695,4,I don't pick books apart. I rate simply on enj...


# Datos entrenamiento / test

In [24]:
# First split, stratified by sentiment with a fixed seed: 10% of the rows go to training
# and another 10% to a hold-out pool. Because train_size + test_size = 0.2, the remaining
# rows are not used by either subset.
X_train, X_test, y_train, y_test = train_test_split(df['review'].to_numpy(),
                                                    df['sentiment'].to_numpy(),
                                                    train_size=0.1,
                                                    test_size=0.1,
                                                    shuffle=True,
                                                    stratify=df['sentiment'],
                                                    random_state=42)

# Second split: divide the hold-out pool in half (again stratified by label) into the
# validation and test sets. X_test / y_test are overwritten with the final test half.
X_valid, X_test, y_valid, y_test = train_test_split(X_test,
                                                    y_test,
                                                    test_size=0.5,
                                                    shuffle=True,
                                                    stratify=y_test,
                                                    random_state=42)

# Report the number of reviews in each split.
print(f'Dimensiones del dataset de training:   {X_train.shape}')
print(f'Dimensiones del dataset de validation: {X_valid.shape}')
print(f'Dimensiones del dataset de test:       {X_test.shape}')


Dimensiones del dataset de training:   (1978,)
Dimensiones del dataset de validation: (989,)
Dimensiones del dataset de test:       (990,)


# Preprocesado BOW

  Para el preprocesado se aplicará la siguiente función a cada review.
  Los pasos a realizar son:


*   Eliminar etiquetas HTML
*   Pasar a minúsculas
*   Quitar todo lo que no sea texto o números
*   Dividir por espacios
*   Eliminación de stopwords
*   Eliminar elementos no alfabéticos
*   Aplicación de stemming (raíz de la palabra)


```
def review_to_words(review):
    """Convert a raw review string into a sequence of words."""
    # Eliminamos las etiquetas HTML
    #text = re.sub( re.compile('<.*?>'), '', review)
    text = BeautifulSoup(review, "html5lib").get_text()
    # Convertimos a minúscula y quitamos todo lo que no sea texto o números
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    # Dividimos en tokens por espacios
    words = text.split()
    # Eliminamos stopwords
    words = [w for w in words if w not in stopwords.words("english")]
    # remove remaining tokens that are not alphabetic
    words = [w for w in words if w.isalpha()]
    # Aplicamos stemming
    words = [PorterStemmer().stem(w) for w in words]

    return words
```



In [25]:
# BeautifulSoup to easily remove HTML tags
from bs4 import BeautifulSoup

# RegEx for removing non-letter characters
import re

# NLTK library for the remaining steps
import nltk
nltk.download("stopwords")   # download list of stopwords (only once; need not run it again)
from nltk.corpus import stopwords # import stopwords

# NOTE(review): star import; PorterStemmer is the name used from this module in the cells shown here.
from nltk.stem.porter import *
# Stemmer instance created once at notebook level.
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
def review_to_words(review):
    """Convert a raw review string into a list of stemmed word tokens.

    Steps, in order:
      1. Strip HTML markup with BeautifulSoup (``html5lib`` parser).
      2. Lowercase the text and replace every character outside
         ``[a-zA-Z0-9]`` with a space.
      3. Split on whitespace.
      4. Drop NLTK English stopwords and tokens that are not purely alphabetic.
      5. Apply Porter stemming to the remaining tokens.

    Args:
        review: Raw review text (may contain HTML).

    Returns:
        list[str]: Stemmed tokens, in the order they appear in the review.
    """
    # Strip HTML tags.
    text = BeautifulSoup(review, "html5lib").get_text()
    # Lowercase, then blank out anything that is not a letter or a digit.
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    # Tokenize on whitespace.
    tokens = text.split()

    # Build the stopword collection once per call, as a set: evaluating
    # stopwords.words("english") inside the comprehension would fetch the list
    # again for every token and test membership with a linear scan.
    english_stopwords = set(stopwords.words("english"))
    # A single stemmer instance is enough for the whole review.
    porter = PorterStemmer()

    # Filter (stopwords, then non-alphabetic tokens) and stem in one pass.
    return [
        porter.stem(token)
        for token in tokens
        if token not in english_stopwords and token.isalpha()
    ]


In [27]:
# Make sure the cache/sentiment_analysis directory exists.
cache_dir = os.path.join("cache", "sentiment_analysis")
os.makedirs(cache_dir, exist_ok=True)

# preprocess_data comes from helper_functions (not shown here). Presumably it converts every
# review of each split into a token list and caches the result — TODO confirm.
# NOTE(review): cache_dir is not passed to the call, so the helper must locate the cache on its own.
words_train , words_valid, words_test, labels_train ,labels_valid , labels_test = preprocess_data(X_train, X_valid , X_test , y_train, y_valid, y_test)

  text = BeautifulSoup(review, "html5lib").get_text()


Wrote preprocessed data to cache file: preprocessed_data.pkl


Para la extracción de características, el único parámetro modificado ha sido `min_df`, de modo que el vocabulario sea más reducido.

In [32]:
# extract_BoW_features comes from helper_functions (not shown here). It returns one
# Bag-of-Words feature matrix per split plus the vocabulary, a dict from term to an integer
# (presumably the term's column index in the feature matrices — TODO confirm).
features_train, features_valid , features_test, vocabulary = extract_BoW_features(words_train,words_valid, words_test)

Wrote features to cache file: bow_features.pkl


# Vocabulario

In [33]:
vocabulary

{'even': 60,
 'last': 106,
 'week': 227,
 'hold': 92,
 'return': 176,
 'amazon': 6,
 'see': 183,
 'box': 24,
 'worth': 233,
 'free': 74,
 'good': 80,
 'paid': 146,
 'would': 234,
 'way': 225,
 'go': 79,
 'work': 232,
 'month': 127,
 'noth': 134,
 'differ': 51,
 'cut': 47,
 'sure': 206,
 'long': 112,
 'great': 82,
 'far': 66,
 'run': 179,
 'water': 224,
 'issu': 99,
 'sinc': 190,
 'found': 73,
 'hard': 88,
 'enough': 59,
 'might': 123,
 'soft': 195,
 'kind': 103,
 'big': 20,
 'look': 114,
 'tri': 217,
 'buy': 29,
 'product': 162,
 'say': 181,
 'tast': 208,
 'said': 180,
 'contain': 43,
 'lot': 115,
 'like': 110,
 'take': 207,
 'much': 128,
 'ok': 137,
 'reason': 170,
 'powder': 157,
 'fine': 69,
 'stick': 200,
 'hand': 86,
 'everi': 62,
 'time': 215,
 'open': 140,
 'make': 118,
 'seem': 184,
 'almost': 2,
 'someth': 196,
 'order': 141,
 'two': 219,
 'item': 100,
 'find': 68,
 'ship': 187,
 'realli': 169,
 'love': 116,
 'remov': 173,
 'read': 168,
 'brand': 25,
 'other': 142,
 'fit': 71,

# Ejemplos preprocesado

In [34]:
import random

# Size of the vocabulary.
print("Vocabulary: {} words".format(len(vocabulary)))

# Draw the sample from a dedicated, seeded generator so the displayed words are the same
# on every run. The sample size is capped so a vocabulary with fewer than 8 terms does
# not raise ValueError.
sample_rng = random.Random(42)
vocabulary_terms = list(vocabulary.keys())
print("Sample words: {}".format(sample_rng.sample(vocabulary_terms, min(8, len(vocabulary_terms)))))

# Show training example 5 at each stage: token list, Bag-of-Words vector and label.
print("\n--- Preprocessed words ---")
print(words_train[5])
print("\n--- Bag-of-Words features ---")
print(features_train[5])
print("\n--- Label ---")
print(labels_train[5])

Vocabulary: 236 words
Sample words: ['realli', 'materi', 'definit', 'last', 'receiv', 'ship', 'strong', 'bit']

--- Preprocessed words ---
['look', 'legitim', 'studi', 'tri', 'avoid', 'bias', 'websit', 'inform', 'buy', 'product', 'studi', 'claim', 'heal', 'properti', 'say', 'otherwis', 'dislik', 'tast', 'said', 'contain', 'lot', 'antioxid', 'believ', 'benefici', 'bodi', 'like', 'everyth', 'take', 'much', 'harm', 'bodi', 'drink', 'gallon', 'gallon', 'week']

--- Bag-of-Words features ---
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0]

--- Label ---
3


In [35]:
# Column-wise sum of the training feature matrix: total count of each vocabulary term
# over all training reviews.
np.asarray(features_train.sum(axis=0))

array([ 49,  76,  56,  39, 144,  43,  48,  71,  36,  60,  45,  68,  57,
        58, 150,  90,  49, 130,  71, 149,  73, 106, 108, 173,  61,  79,
        37,  48,  43, 194,  58,  49,  56,  47,  44,  52,  60,  65,  66,
        93,  59,  40,  44,  41,  40, 142,  41,  46, 188,  39,  62, 154,
        37, 112,  55,  99,  48,  58,  61,  90, 180,  42,  66,  97,  42,
        43,  69, 178,  91,  44, 145, 136,  68,  86,  39,  54, 374, 121,
        61, 145, 357, 157, 323,  45,  85,  40,  71,  42,  89,  50, 191,
        45,  73,  85,  38,  63, 110,  48,  45,  42, 110,  56, 137,  45,
       108,  43, 109,  42,  69, 104, 474, 166,  94,  45, 216,  83, 177,
       135, 204,  53,  42,  65,  65,  44,  38,  43,  97, 137, 212, 199,
        77,  58, 139,  42,  62,  63,  77,  51,  62, 496,  69, 149,  49,
        40,  42,  88,  36,  73,  60,  39,  52,  37,  62,  67,  52,  82,
        73,  44,  82, 161,  43, 100, 614,  37, 181, 138, 107,  63,  68,
       254,  49, 102, 155,  39,  68,  47, 112, 137,  65,  40,  4

In [36]:
# Inspect training example 20 at every stage of the pipeline. After each item its length
# is printed: characters of the raw review, number of tokens, and size of the feature vector.
# NOTE(review): the same cell is repeated for the validation and test splits; a small
# helper taking the split as arguments would remove the duplication.
Index = 20
print("\n--- Raw review ---")
print(X_train[Index])
print(len(X_train[Index]))
print("\n--- Preprocessed words ---")
print(words_train[Index])
print(len(words_train[Index]))
print("\n--- Bag-of-Words features ---")
print(features_train[Index])
print(len(features_train[Index]))
print("\n--- Label ---")
print(labels_train[Index])


--- Raw review ---
Product was ok. Taste was good.  It didn't really have the pow factor I was looking for though. That's why I gave it 3 stars
124

--- Preprocessed words ---
['product', 'ok', 'tast', 'good', 'realli', 'pow', 'factor', 'look', 'though', 'gave', 'star']
11

--- Bag-of-Words features ---
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
236

--- Label ---
3


In [37]:
# Inspect validation example 20 at every stage of the pipeline. After each item its length
# is printed: characters of the raw review, number of tokens, and size of the feature vector.
Index = 20
print("\n--- Raw review ---")
print(X_valid[Index])
print(len(X_valid[Index]))
print("\n--- Preprocessed words ---")
print(words_valid[Index])
print(len(words_valid[Index]))
print("\n--- Bag-of-Words features ---")
print(features_valid[Index])
print(len(features_valid[Index]))
print("\n--- Label ---")
print(labels_valid[Index])


--- Raw review ---
I first stumbled upon this in Hawai about 5 years ago in Walmart. After I moved to NY I discovered it on Amazon. I love it, it works. I would 5 stars if it was reasonably priced (and if it didn't make my poop turned scary green )
229

--- Preprocessed words ---
['first', 'stumbl', 'upon', 'hawai', 'year', 'ago', 'walmart', 'move', 'ny', 'discov', 'amazon', 'love', 'work', 'would', 'star', 'reason', 'price', 'make', 'poop', 'turn', 'scari', 'green']
22

--- Bag-of-Words features ---
[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 1 1]
236

--- Labe

In [38]:
# Inspect test example 20 at every stage of the pipeline. After each item its length
# is printed: characters of the raw review, number of tokens, and size of the feature vector.
Index = 20
print("\n--- Raw review ---")
print(X_test[Index])
print(len(X_test[Index]))
print("\n--- Preprocessed words ---")
print(words_test[Index])
print(len(words_test[Index]))
print("\n--- Bag-of-Words features ---")
print(features_test[Index])
print(len(features_test[Index]))
print("\n--- Label ---")
print(labels_test[Index])


--- Raw review ---
My hair is too thick for this but my daughter, who has thinner hair, loves this brush. She uses it in the shower and as her daily brush.
136

--- Preprocessed words ---
['hair', 'thick', 'daughter', 'thinner', 'hair', 'love', 'brush', 'use', 'shower', 'daili', 'brush']
11

--- Bag-of-Words features ---
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
236

--- Label ---
4
