<a href="https://colab.research.google.com/github/churroxd8/nlp-notebooks/blob/main/Rese%C3%B1as_de_Amazon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# =============================================================
# 1. Instalación y carga de librerías
# =============================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import Counter
from wordcloud import WordCloud

# Download the NLTK resources used by the tokenizer, POS tagger, stopword
# list and lemmatizer below.
# Both tagger names are requested on purpose: NLTK releases that ship the
# 'punkt_tab' tokenizer data load the POS model from
# 'averaged_perceptron_tagger_eng', while older releases look for
# 'averaged_perceptron_tagger'. Without the '_eng' resource, pos_tag raises
# LookupError on current NLTK versions.
for resource in (
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
    'wordnet',
    'omw-1.4',
):
  nltk.download(resource)

# Display configuration for plots and DataFrame output
plt.style.use('ggplot')
pd.set_option('display.max_colwidth', 200)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [9]:
# =========================================================
# 2. Carga y preparación de los datos
# =========================================================
def load_and_prepare_data(file_path):
  """Load a reviews CSV and return a cleaned DataFrame ready for analysis.

  The file is read with the default encoding first and re-read as Latin-1 if
  decoding fails. The text, title and rating columns are looked up by their
  expected names ('reviews.text', 'reviews.title', 'reviews.rating'); when
  one is absent, the first column whose lower-cased name contains 'text',
  'title' or 'rating' respectively is used instead. Title and rating are
  optional; the text column is required.

  Args:
    file_path: Path (or any other source accepted by ``pd.read_csv``) of the
      CSV file.

  Returns:
    A DataFrame restricted to rows that have a non-null review text, a
    numeric rating (only enforced when a rating column exists) and more than
    3 characters of cleaned text. Three columns are added:
      - 'combined_raw': "<title>. <text>", or just the text when there is no
        title column.
      - 'rating_score': the rating as a number (NaN for every row when there
        is no rating column).
      - 'clean_text': lower-cased 'combined_raw' with URLs, HTML tags and
        every character other than a-z, whitespace and '.' removed.

  Raises:
    KeyError: If no review-text column can be identified.
  """
  print("Cargando el dataset...")
  try:
    # Standard attempt
    df = pd.read_csv(file_path)
  except UnicodeDecodeError:
    # Encoding fallback: Latin-1 maps every byte, so decoding cannot fail.
    # Other errors (missing file, malformed CSV) propagate unchanged.
    df = pd.read_csv(file_path, encoding='latin1')

  # Columns of interest and their expected names
  expected_cols = {
      'text': 'reviews.text',
      'title': 'reviews.title',
      'rating': 'reviews.rating'
  }

  # Resolve each logical column to a real one; None marks "not available" so
  # the optional branches below are actually skipped for missing columns.
  cols_map = {}
  for key, val in expected_cols.items():
    if val in df.columns:
      cols_map[key] = val
      continue
    # Look for a similarly named column
    candidates = [c for c in df.columns if key in str(c).lower()]
    if candidates:
      cols_map[key] = candidates[0]
    else:
      cols_map[key] = None
      print(f"⚠️ Advertencia: No se encontraron columnas para '{key}'")

  print(f"Usando columnas: {cols_map}")

  text_col = cols_map['text']
  title_col = cols_map['title']
  rating_col = cols_map['rating']

  if text_col is None:
    raise KeyError("No se encontró una columna con el texto de las reseñas")

  # Drop rows without review text
  df = df.dropna(subset=[text_col]).copy()

  # Build the raw text to analyze (title + body)
  if title_col:
    # Missing titles become empty strings so the concatenation never yields "nan"
    df[title_col] = df[title_col].fillna('')
    df['combined_raw'] = df[title_col].astype(str) + ". " + df[text_col].astype(str)
  else:
    df['combined_raw'] = df[text_col].astype(str)

  # Rating as a numeric value
  if rating_col:
    # Force to numeric; unparseable values become NaN
    df['rating_score'] = pd.to_numeric(df[rating_col], errors='coerce')
    # Rows with a missing or non-numeric rating are dropped (not imputed)
    df = df.dropna(subset=['rating_score'])
  else:
    df['rating_score'] = np.nan

  # Text cleaning
  def clean_text(text):
    if not isinstance(text, str):
      return ""
    text = text.lower()
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    # Keep only letters, whitespace and periods (periods still separate sentences)
    text = re.sub(r'[^a-z\s\.]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace
    return text

  df['clean_text'] = df['combined_raw'].apply(clean_text)

  # Keep only reviews that still have content after cleaning
  df = df[df['clean_text'].str.len() > 3].copy()

  print(f"Dataset listo: {len(df)} reseñas. Rating promedio global: {df['rating_score'].mean():.2f}")
  return df

# Location of the reviews CSV to analyze.
# NOTE(review): the /content/drive prefix presumably requires Google Drive to
# be mounted in the Colab runtime before this path is read — confirm.
FILE_PATH = '/content/drive/MyDrive/amazon_reviews.csv'

In [10]:
# ========================================================
# 3. Pipeline de extracción (Sustantivos)
# ========================================================
def extract_nouns_pipeline(df):
  """Extract lemmatized nouns from the cleaned text of every review.

  Each value of ``df['clean_text']`` is tokenized and POS-tagged. Tokens
  whose tag starts with 'NN', that are longer than two characters and are
  not stopwords (NLTK English list plus the domain terms below) are
  lemmatized; lemmas that are themselves stopwords are discarded.

  Args:
    df: DataFrame with a 'clean_text' column of strings. It is modified in
      place: a 'nouns' column is added.

  Returns:
    A tuple ``(df, all_nouns)``: the same DataFrame with the new 'nouns'
    column (one list of lemmas per row), and a flat list with the nouns of
    all rows in row order, suitable for frequency counting.
  """
  # Local import so this block does not depend on a change to the file's
  # import section.
  from nltk.tag import pos_tag_sents

  stop_words = set(stopwords.words('english'))
  lemmatizer = WordNetLemmatizer()
  # Domain-specific stopwords that would otherwise dominate the counts
  domain_stopwords = {
      'amazon',
      'product',
      'one',
      'device',
      'tablet',
      'kindle',
      'use',
      'would',
      'get',
      'buy',
      'item',
      'review',
      'star',
      'purchase'
  }
  stop_words.update(domain_stopwords)

  def select_nouns(tagged_tokens):
    row_nouns = []
    for word, tag in tagged_tokens:
      # Skip short words, stopwords and anything that is not tagged as a noun
      if len(word) <= 2 or word in stop_words or not tag.startswith('NN'):
        continue
      lemma = lemmatizer.lemmatize(word)
      # Second check after lemmatization: a plural can reduce to a stopword
      # (e.g. "products" -> "product", "others" -> "other")
      if lemma not in stop_words:
        row_nouns.append(lemma)
    return row_nouns

  print("Ejecutando POS Tagging y extracción de sustantivos...")
  token_lists = [word_tokenize(text) for text in df['clean_text']]
  # pos_tag_sents tags every row with a single tagger instance; calling
  # pos_tag row by row builds a new tagger on each call.
  tagged_rows = pos_tag_sents(token_lists)
  # Explicit index keeps the per-row lists aligned with df's rows
  df['nouns'] = pd.Series(
      [select_nouns(tagged) for tagged in tagged_rows],
      index=df.index,
      dtype=object,
  )

  # Flat list of every noun, used for global counting
  all_nouns = [noun for row in df['nouns'] for noun in row]

  return df, all_nouns


In [None]:
# ==========================================================
# 4. Análisis de impacto de aspectos