# Preparación de datos
## Importar bibliotecas

In [157]:
import pandas as pd 
import numpy as np 
import scipy as sp
import math
import re
from tqdm import tqdm
import json
import gzip
from unidecode import unidecode
import matplotlib.pyplot as plt

import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, BooleanType
from pyspark.sql.functions import from_json, col
from pyspark.sql.functions import explode


# Importar funciones necesarias
from pyspark.sql.functions import col, to_date, weekofyear,year,trim, month, dayofmonth, sum
from pyspark.sql.functions import col,  count, coalesce, sum as spark_sum
from pyspark.sql.functions import regexp_replace, col, when, explode_outer,lit, to_timestamp,regexp_extract
from pyspark.sql.functions import format_number
from pyspark.sql.types import IntegerType,FloatType
# Puedes obtener estadísticas específicas para una columna
from pyspark.sql.functions import mean, min, max
from pyspark.sql.functions import approx_count_distinct
from pyspark.sql.window import Window
from pyspark.sql.functions import log1p
from pyspark.sql import functions as F

from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import *


## Inicio de Sessión Cluster

In [2]:

spark = SparkSession.builder.appName("MLops_Steam").getOrCreate()
#spark = SparkSession.builder.master("spark://localhost:7077").appName("MLops1").getOrCreate()


In [3]:
spark

## Importación de Datos `user_reviews.json.gz`

In [None]:


def clean_text(text):
    # Reemplazar caracteres no ASCII con un espacio en blanco
    cleaned_text = ''.join(char if ord(char) < 128 else ' ' for char in text)
    return cleaned_text

def read_and_clean_reviews(file_path):
    cleaned_reviews = []
    with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        for line in file:
            try:
                # Limpiar caracteres no ASCII
                cleaned_line = clean_text(line)
                
                # Convertir la línea limpiada a un diccionario
                review_dict = eval(cleaned_line)
                
                # Agregar el diccionario limpiado a la lista de revisiones
                cleaned_reviews.append(review_dict)
            except Exception as e:
                print(f"Error al procesar la línea: {line}")
                print(f"Error: {e}")
    
    return cleaned_reviews

# Ejemplo de uso
file_path = '../datasets/raw/user_reviews.json.gz'  # Reemplaza con la ruta real de tu archivo comprimido
reviews = read_and_clean_reviews(file_path)

# Crear una lista de tuplas con la información necesaria
data = []
for review in reviews:
    user_id = review['user_id']
    user_url = review['user_url']
    user_reviews = review.get('reviews', [])
    
    for user_review in user_reviews:
        item_id = user_review.get('item_id', '')
        recommend = user_review.get('recommend', False)
        review_text = user_review.get('review', '')
        funny = user_review.get('funny', '')
        posted = user_review.get('posted', '')
        last_edited = user_review.get('last_edited', '')
        helpful = user_review.get('helpful', '')

        data.append((user_id, user_url, item_id, recommend, review_text, funny, posted, last_edited, helpful))

# Crear el esquema del DataFrame
schema = StructType([
    StructField("user_id", StringType(), True),
    StructField("user_url", StringType(), True),
    StructField("item_id", StringType(), True),
    StructField("recommend", BooleanType(), True),
    StructField("review", StringType(), True),
    StructField("funny", StringType(), True),
    StructField("posted", StringType(), True),
    StructField("last_edited", StringType(), True),
    StructField("helpful", StringType(), True)
])

# Crear el DataFrame de PySpark
df = spark.createDataFrame(data, schema=schema)

# Mostrar el DataFrame
df.show(truncate=False)


In [None]:
# Selecciona la columna que quieres contar
columna = "user_id"  # Reemplaza con el nombre real de tu columna
# Toma una muestra del 10% de las filas para esa columna
sampled_count = df.select(columna).sample(fraction=0.1, seed=42).agg(count("*")).collect()[0][0]
print((sampled_count, len(df.columns)))

### Taxonomía de Datos Analíticos `user_reviews.json.gz`
- Se estructuro a 60'380 Registos y 9 Variables
- Taxonomía original:

#### user_reviews.gz.json

| Variable   | Descripción                              | Ejemplo                                                                                                                                            |
|-----------|------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------|
| user_id   | Identificador único de usuario            | [76561197970982479, evcentric, maplemage]                                                                                                          |
| user_url  | URL del perfil del usuario                | [http://steamcommunity.com/id/evcentric]                                                                                                          |
| reviews   | Review de usuario en formato JSON         | {'funny': '', 'posted': 'Posted September 8, 2013.', 'last_edited': '', 'item_id': '227300', 'helpful': '0 of 1 people (0%) found this review helpful', 'recommend': True, 'review': "For a simple (it's actually not all that simple but it can be!) truck driving Simulator, it is quite a fun and relaxing game. Playing on simple (or easy?) its just the basic WASD keys for driving but (if you want) the game can be much harder and realistic with having to manually change gears, much harder turning, etc. And reversing in this game is a ♥♥♥♥♥, as I imagine it would be with an actual truck. Luckily, you don't have to reverse park it but you get extra points if you do cause it is bloody hard. But this is surprisingly a nice truck driving game and I had a bit of fun with it."} |


## Importación de Datos `users_items.json.gz`

In [None]:

def clean_text(text):
    # Reemplazar caracteres no ASCII con un espacio en blanco
    cleaned_text = ''.join(char if ord(char) < 128 else ' ' for char in text)
    return cleaned_text

def read_and_clean_reviews(file_path):
    cleaned_reviews = []
    with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        for line in file:
            try:
                # Limpiar caracteres no ASCII
                cleaned_line = clean_text(line)
                
                # Convertir la línea limpiada a un diccionario
                review_dict = eval(cleaned_line)
                
                # Agregar el diccionario limpiado a la lista de revisiones
                cleaned_reviews.append(review_dict)
            except Exception as e:
                print(f"Error al procesar la línea: {line}")
                print(f"Error: {e}")
    
    return cleaned_reviews

# Ejemplo de uso
file_path = '../datasets/raw/users_items.json.gz'  # Reemplaza con la ruta real de tu archivo comprimido
items = read_and_clean_reviews(file_path)

# Crear una lista de tuplas con la información necesaria
data = []
for item in items:
    user_id = item.get('user_id', '')
    items_count = item.get('items_count', '')
    steam_id = item.get('steam_id', '')
    user_url = item.get('user_url', '')
    user_items = item.get('items', [])
    
    for user_item in user_items:
        item_id = user_item.get('item_id', '')
        item_name = user_item.get('item_name', '')
        playtime_forever = user_item.get('playtime_forever', '')
        playtime_2weeks = user_item.get('playtime_2weeks', '')

        data.append((user_id, items_count, steam_id, user_url, item_id, item_name, playtime_forever, playtime_2weeks))

# Crear el esquema del DataFrame
schema = StructType([
    StructField("user_id", StringType(), True),
    StructField("items_count", StringType(), True),
    StructField("steam_id", StringType(), True),
    StructField("user_url", StringType(), True),
    StructField("item_id", StringType(), True),
    StructField("item_name", StringType(), True),
    StructField("playtime_forever", StringType(), True),
    StructField("playtime_2weeks", StringType(), True)
])

# Crear el DataFrame de PySpark
df2 = spark.createDataFrame(data, schema=schema)

# Mostrar el DataFrame
#df2.show(5, truncate=False)


In [None]:
"""
review_dict = {k: None if isinstance(v, float) and math.isnan(v) else v for k, v in review_dict.items()}
 

     # Convertir listas a cadenas separadas por comas
    tags_str = ', '.join(map(str, tags)) if tags else ''
    specs_str = ', '.join(map(str, specs)) if specs else ''
    genres_str = ', '.join(map(str, genres)) if genres else ''

""" 

In [None]:
# Selecciona la columna que quieres contar
columna = "user_id"  # Reemplaza con el nombre real de tu columna

# Toma una muestra del 10% de las filas para esa columna
sampled_count = df.select(columna).sample(fraction=0.01, seed=42).agg(count("*")).collect()[0][0]

print((sampled_count, len(df.columns)))

### Taxonomía de Datos Analíticos `users_items.json.gz`
- Se estructuro a 63'200 Registos y 9 Variables
- Taxonomía original:


#### user_items.gz.json

| Columna | Descripción | Ejemplo |
|---------|-------------|---------|
| user_id | Identificador único de usuario | [76561197970982479, evcentric, maplemage] |
| user_url | URL del perfil del usuario | [http://steamcommunity.com/id/evcentric](http://steamcommunity.com/id/evcentric) |
| items | Items de usuario en formato Json | {'item_id': '273350', 'item_name': 'Evolve Stage 2', 'playtime_forever': 58, 'playtime_2weeks': 0} |


## Importación de Datos `steam_games.json.gz`

In [None]:
def clean_text(text):
    # Reemplazar caracteres no ASCII con un espacio en blanco
    cleaned_text = ''.join(char if ord(char) < 128 else ' ' for char in text)
    return cleaned_text

def read_and_clean_reviews(file_path):
    cleaned_reviews = []
    with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        for line in file:
            try:
                # Limpiar caracteres no ASCII
                cleaned_line = clean_text(line)
                
                # Convertir la línea limpiada a un diccionario
                review_dict = json.loads(cleaned_line)
                
                # Reemplazar NaN con cadena vacía
                review_dict = {k: '' if isinstance(v, float) and math.isnan(v) else v for k, v in review_dict.items()}
                
                # Agregar el diccionario limpiado a la lista de revisiones
                cleaned_reviews.append(review_dict)
            except Exception as e:
                print(f"Error al procesar la línea: {line}")
                print(f"Error: {e}")
    
    return cleaned_reviews


# Ruta al archivo comprimido
file_path = '../datasets/raw/steam_games.json.gz'  # Reemplaza con la ruta real de tu archivo comprimido
games = read_and_clean_reviews(file_path)

# Crear una lista de tuplas con la información necesaria
data = []
for game in games:
    publisher = game.get('publisher', '')
    app_name = game.get('app_name', '')
    title = game.get('title', '')
    url = game.get('url', '')
    release_date = game.get('release_date', '')
    tags = game.get('tags', [])
    reviews_url = game.get('reviews_url', '')
    specs = game.get('specs', [])
    price = game.get('price', '')
    early_access = game.get('early_access', '')
    game_id = game.get('id', '')
    developer = game.get('developer', '')
    genres = game.get('genres', [])
    
    # Convertir listas a cadenas separadas por comas
    tags_str = ', '.join(map(str, tags))
    specs_str = ', '.join(map(str, specs))
    genres_str = ', '.join(map(str, genres))

    data.append((publisher, app_name, title, url, release_date, tags_str, reviews_url, specs_str,
                 price, early_access, game_id, developer, genres_str))

# Crear el esquema del DataFrame
schema = StructType([
    StructField("publisher", StringType(), True),
    StructField("app_name", StringType(), True),
    StructField("title", StringType(), True),
    StructField("url", StringType(), True),
    StructField("release_date", StringType(), True),
    StructField("tags_str", StringType(), True),
    StructField("reviews_url", StringType(), True),
    StructField("specs_str", StringType(), True),
    StructField("price", StringType(), True),
    StructField("early_access", StringType(), True),
    StructField("game_id", StringType(), True),
    StructField("developer", StringType(), True),
    StructField("genres_str", StringType(), True),
])

# Crear el DataFrame de PySpark
df3 = spark.createDataFrame(data, schema=schema)

# Mostrar el DataFrame
df3.show(5, truncate=False)

# Detener la sesión de Spark
#spark.stop()

In [None]:
# Selecciona la columna que quieres contar
columna = "publisher"  # Reemplaza con el nombre real de tu columna

# Toma una muestra del 10% de las filas para esa columna
sampled_count = df3.select(columna).sample(fraction=0.01, seed=42).agg(count("*")).collect()[0][0]

print((sampled_count, len(df3.columns)))

In [None]:
df3.distinct().show(20, truncate=False)

### Taxonomía de Datos Analíticos `steam_games.json.gz`
- Se estructuro a 128'100 Registos y 13 Variables
- Se validó que la variable `metascore` no existe en el set de datos origen  
- Taxonomía original:

#### steam_games.gz.json

| Variable          | Descripción                                      | Ejemplo                                                                                                            |
|-------------------|--------------------------------------------------|--------------------------------------------------------------------------------------------------------------------|
| publisher         | Empresa publicadora del contenido               | [Ubisoft, Dovetail Games - Trains, Degica]                                                                         |
| genres            | Género del contenido                             | [Action, Adventure, Racing, Simulation, Strategy]                                                                   |
| app_name          | Nombre del contenido                             | [Warzone, Soundtrack, Puzzle Blocks]                                                                                |
| title             | Título del contenido                              | [The Dream Machine: Chapter 4, Fate/EXTELLA - Sweet Room Dream, Fate/EXTELLA - Charming Bunny]                       |
| url               | URL de publicación del contenido                | [http://store.steampowered.com/app/761140/Lost_Summoner_Kitty/]                                                    |
| release_date      | Fecha de lanzamiento                              | [2018-01-04]                                                                                                       |
| tags              | Etiquetas de contenido                           | [Simulation, Indie, Action, Adventure, Funny, Open World, First-Person, Sandbox, Free to Play]                      |
| discount_price    | Precio de descuento                               | [22.66, 0.49, 0.69]                                                                                                |
| reviews_url       | Reviews de contenido                              | [http://steamcommunity.com/app/681550/reviews/?browsefilter=mostrecent&p=1]                                        |
| specs             | Especificaciones                                 | [Multi-player, Co-op, Cross-Platform Multiplayer, Downloadable Content]                                            |
| price             | Precio del contenido                              | [4.99, 9.99, Free to Use, Free to Play]                                                                             |
| early_access      | Acceso temprano                                  | [False, True]                                                                                                      |
| id                | Identificador único de contenido                 | [761140, 643980, 670290]                                                                                           |
| developer         | Desarrollador                                    | [Kotoshiro, Secret Level SRL, Poolians.com]                                                                        |
| metascore         | Score por Metacritic                             | [80, 74, 77, 75]                                                                                                   |


## Exportación Modelo de Datos a Parquet

In [None]:
# Guardar el DataFrame en formato Parquet
# Ruta de destino para el archivo Parquet
ruta_destino_parquet_i = '../datasets/raw/user_reviews_parquet'
df.write.mode("overwrite").parquet(ruta_destino_parquet_i)

In [None]:
# Guardar el DataFrame en formato Parquet
# Ruta de destino para el archivo Parquet
ruta_destino_parquet_ii = '../datasets/raw/users_items_parquet'
df2.write.mode("overwrite").parquet(ruta_destino_parquet_ii)

In [None]:
# Guardar el DataFrame en formato Parquet
# Ruta de destino para el archivo Parquet
ruta_destino_parquet_iii = '../datasets/raw/steam_games_parquet'
df3.write.mode("overwrite").parquet(ruta_destino_parquet_iii)

## Exploratorio de la información EDA 

- **Importación de Modelo de datos sin cambios**

In [104]:
# Leer el archivo Parquet
steam_games = spark.read.parquet("../datasets/raw/steam_games_parquet")
user_reviews = spark.read.parquet("../datasets/raw/user_reviews_parquet")
users_items = spark.read.parquet("../datasets/raw/users_items_parquet")

- **Renombrar a las columnas**

In [105]:
# Definir la función para limpiar y formatear los nombres de las columnas
def limpiar_nombres_columnas(df):
    # Función para limpiar y formatear un solo nombre de columna
    def limpiar_nombre(nombre):
        # Eliminar caracteres especiales y espacios
        nombre = re.sub(r'\W+', '_', nombre)
        # Eliminar acentos
        nombre = unidecode(nombre)
        # Convertir a minúsculas
        nombre = nombre.lower()
        return nombre
    
    # Aplicar la función a cada nombre de columna
    nuevos_nombres = [limpiar_nombre(col) for col in df.columns]
    
    # Renombrar las columnas en el DataFrame
    for viejo, nuevo in zip(df.columns, nuevos_nombres):
        df = df.withColumnRenamed(viejo, nuevo)
    
    return df

# Aplicar la función a cada DataFrame
steam_games = limpiar_nombres_columnas(steam_games)
user_reviews = limpiar_nombres_columnas(user_reviews)
users_items = limpiar_nombres_columnas(users_items)

- **Propuesta de Modelo de datos sin cambios**

In [6]:
print("Modelo de datos de: steam_games ")
print("Cantidad de registros :", steam_games.count(), "Cantidad de las Columnas :", len(steam_games.columns))
print("\n")
steam_games.printSchema()
print("\n")
steam_games.show(2)

Modelo de datos de: steam_games 
Cantidad de registros : 120445 Cantidad de las Columnas : 13


root
 |-- publisher: string (nullable = true)
 |-- app_name: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- tags_str: string (nullable = true)
 |-- reviews_url: string (nullable = true)
 |-- specs_str: string (nullable = true)
 |-- price: string (nullable = true)
 |-- early_access: string (nullable = true)
 |-- game_id: string (nullable = true)
 |-- developer: string (nullable = true)
 |-- genres_str: string (nullable = true)



+-------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+-----+------------+-------+-------------------+--------------------+
|          publisher|            app_name|               title|                 url|release_date|            tags_str|         reviews_url|    

In [7]:
print("Modelo de datos de: user_reviews ")
print("Cantidad de registros:", user_reviews.count(), "Cantidad de las Columnas:", len(user_reviews.columns))
print("\n")
user_reviews.printSchema()
print("\n")
user_reviews.show(2, truncate=False)

Modelo de datos de: user_reviews 
Cantidad de registros: 59305 Cantidad de las Columnas: 9


root
 |-- user_id: string (nullable = true)
 |-- user_url: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- recommend: boolean (nullable = true)
 |-- review: string (nullable = true)
 |-- funny: string (nullable = true)
 |-- posted: string (nullable = true)
 |-- last_edited: string (nullable = true)
 |-- helpful: string (nullable = true)



+-------------------+------------------------------------------------+-------+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [8]:
print("Modelo de datos de: users_items ")
print("Cantidad de registros", users_items.count(), "Cantidad de las Columnas", len(users_items.columns))
print("\n")
users_items.printSchema()
print("\n")
users_items.show(2, truncate=False)

Modelo de datos de: users_items 
Cantidad de registros 5153209 Cantidad de las Columnas 8


root
 |-- user_id: string (nullable = true)
 |-- items_count: string (nullable = true)
 |-- steam_id: string (nullable = true)
 |-- user_url: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- item_name: string (nullable = true)
 |-- playtime_forever: string (nullable = true)
 |-- playtime_2weeks: string (nullable = true)



+-----------------+-----------+-----------------+----------------------------------------------------+-------+---------------------+----------------+---------------+
|user_id          |items_count|steam_id         |user_url                                            |item_id|item_name            |playtime_forever|playtime_2weeks|
+-----------------+-----------+-----------------+----------------------------------------------------+-------+---------------------+----------------+---------------+
|76561197971591953|11         |76561197971591953|http://steamcomm

In [93]:
# Aplicar la función a cada DataFrame
steam_games = limpiar_nombres_columnas(steam_games)
user_reviews = limpiar_nombres_columnas(user_reviews)
users_items = limpiar_nombres_columnas(users_items)

## Preprocesamiento de Datos
- Nulos, vacios, carcteres en blanco y/o caracteres especiales.

In [94]:
# Define the function to count null or empty values for each column
def count_null_values(df):
    return df.select(
        *[sum(coalesce(col(c) == "", col(c).isNull()).cast("int")).alias(c) for c in df.columns]
    )

# Clean and count null values for each DataFrame
steam_games = limpiar_nombres_columnas(steam_games)
valores_vacios_por_columna_steam = count_null_values(steam_games)
valores_vacios_por_columna_steam.show()

user_reviews = limpiar_nombres_columnas(user_reviews)
valores_vacios_por_columna_reviews = count_null_values(user_reviews)
valores_vacios_por_columna_reviews.show()

users_items = limpiar_nombres_columnas(users_items)
valores_vacios_por_columna_items = count_null_values(users_items)
valores_vacios_por_columna_items.show()

+---------+--------+-----+-----+------------+--------+-----------+---------+-----+------------+-------+---------+----------+
|publisher|app_name|title|  url|release_date|tags_str|reviews_url|specs_str|price|early_access|game_id|developer|genres_str|
+---------+--------+-----+-----+------------+--------+-----------+---------+-----+------------+-------+---------+----------+
|    96362|   88312|90360|88310|       90377|   88473|      88312|    88980|89687|       88310|  88312|    91609|     91593|
+---------+--------+-----+-----+------------+--------+-----------+---------+-----+------------+-------+---------+----------+

+-------+--------+-------+---------+------+-----+------+-----------+-------+
|user_id|user_url|item_id|recommend|review|funny|posted|last_edited|helpful|
+-------+--------+-------+---------+------+-----+------+-----------+-------+
|      0|       0|      0|        0|    30|51154|     0|      53165|      0|
+-------+--------+-------+---------+------+-----+------+----------

- **Valores únicos Price `steam_games`**

In [106]:
# Get unique values in the "price" column as a list
unique_prices = steam_games.select("price").distinct().orderBy("price").collect()

# Display the unique values as an array
price_array = [row.price for row in unique_prices]
print(price_array)

['', '0.49', '0.5', '0.89', '0.95', '0.98', '0.99', '1.0', '1.25', '1.29', '1.39', '1.49', '1.5', '1.59', '1.87', '1.95', '1.99', '10.0', '10.49', '10.93', '10.96', '10.99', '109.99', '11.15', '11.99', '119.99', '12.0', '12.89', '12.99', '124.99', '129.99', '13.37', '13.98', '13.99', '131.4', '139.92', '14.95', '14.99', '149.99', '15.0', '15.99', '16.06', '16.99', '160.91', '17.99', '172.24', '179.0', '18.9', '18.99', '189.0', '189.96', '19.29', '19.95', '19.98', '19.99', '199.0', '199.99', '2.0', '2.3', '2.49', '2.66', '2.89', '2.97', '2.99', '20.0', '20.99', '202.76', '21.99', '22.99', '23.96', '23.99', '234.99', '24.99', '249.99', '26.99', '27.49', '27.99', '289.99', '29.96', '29.99', '299.99', '3.0', '3.33', '3.39', '3.49', '3.99', '30.0', '31.99', '32.99', '320.0', '34.99', '36.99', '38.85', '39.99', '399.0', '399.99', '4.0', '4.29', '4.49', '4.68', '4.99', '40.0', '41.99', '42.99', '44.98', '44.99', '49.0', '49.99', '499.99', '5.0', '5.49', '5.65', '5.99', '54.99', '59.95', '59.9

- **Estrategia de tratamiento de nulos, vacios, caracteres especiales Price `steam_games`**

In [122]:
# Tratar los valores en la columna 'price'
steam_games = steam_games.withColumn(
    "price",
    when(steam_games["price"].isin("", "Free", "Free Demo", "Free HITMAN™ Holiday Pack", "Free Mod", "Free Movie", "Free To Play", "Free to Play", "Free to Try", "Free to Use", "Install Now", "Install Theme", "Play Now", "Play WARMACHINE: Tactics Demo", "Play for Free!", "Play the Demo", "Third-party"), -1.0)
    .otherwise(regexp_extract(steam_games["price"], r'\d+(\.\d+)?', 0).cast("float"))
)

# Mostrar el DataFrame resultante
steam_games.show(truncate=False)
unique_prices = steam_games.select("price").distinct().orderBy("price").collect()
# Convertir a array
price_array = [row.price for row in unique_prices]
print(price_array)

+-----------------------+-----------------------------------------------------------------------+-----------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+------------+-------+-----------------------------+--------------------------------------------------------------------+
|publisher              |app_name                                             

- **Validación tratamiento para Price `steam_games`**

In [124]:
# Clean and count null values for each DataFrame
steam_games_transformed = limpiar_nombres_columnas(steam_games)
valores_vacios_por_columna_steam = count_null_values(steam_games_transformed)
valores_vacios_por_columna_steam.show()

+---------+--------+-----+-----+------------+--------+-----------+---------+-----+------------+-------+---------+----------+
|publisher|app_name|title|  url|release_date|tags_str|reviews_url|specs_str|price|early_access|game_id|developer|genres_str|
+---------+--------+-----+-----+------------+--------+-----------+---------+-----+------------+-------+---------+----------+
|    96362|   88312|90360|88310|       90377|   88473|      88312|    88980|    0|       88310|  88312|    91609|     91593|
+---------+--------+-----+-----+------------+--------+-----------+---------+-----+------------+-------+---------+----------+



- **tratamiento para release_date creación de year,early_access `steam_games`**

In [114]:
# Get unique values in the "release_date" column as a list
unique_dates = steam_games.select("release_date").distinct().orderBy("release_date")

# Convert to list
date_list = [row.release_date for row in unique_dates.collect()]

# Display the unique values as an array
print(date_list)


['', '"""Soon"""', '0̵1̴0̵0̶1̷0̶0̵0̴ ̴0̶0̶1̶1̶0̷0̶1̵1̴ ̸0̶0̶1̶1̵0̶1̷0̴0̵ ̴0̶1̷0̸1̵0̷0̴1̶0̴ ̴0̷0̴1̷1̶0̶1̵1̷1̵ ̵', '14 July', '15.01.2018', '1970-07-15', '1970-12-16', '1975-12-31', '1980-01-01', '1981-01-01', '1981-04-22', '1982-01-01', '1982-05-21', '1982-12-31', '1983-01-01', '1983-06-19', '1983-11-01', '1984-01-01', '1984-03-02', '1984-04-29', '1984-11-01', '1985-01-01', '1985-07-10', '1985-10-16', '1986-05-01', '1987-01-01', '1987-04-17', '1987-08-21', '1987-09-07', '1987-09-30', '1987-10-05', '1987-11-06', '1988-01-01', '1988-04-16', '1988-05-01', '1988-06-01', '1988-09-23', '1988-12-23', '1989-01-01', '1989-04-05', '1989-07-01', '1989-07-14', '1989-10-01', '1990-01-01', '1990-09-07', '1990-12-14', '1990-12-19', '1991-01-01', '1991-01-02', '1991-02-01', '1991-06-16', '1991-07-01', '1991-10-01', '1991-10-09', '1991-10-23', '1991-12-09', '1992-01-01', '1992-02-01', '1992-03-01', '1992-05-01', '1992-06-01', '1992-10-01', '1992-10-23', '1992-11-01', '1992-11-17', '1993-01-01', '1993-01

In [137]:


# Tratar los valores en la columna 'release_date'
steam_games_transformed = steam_games.withColumn(
    "release_date",
    when(
        (steam_games["release_date"] == "") | 
        (steam_games["release_date"] == '"""Soon"""') | 
        (steam_games["release_date"].rlike('[^\d-]')), 
        '1970-01-01'
    )
    .otherwise(steam_games["release_date"])
)

# Extraer los años de la columna 'release_date' y crear la columna 'year'
steam_games_transformed = steam_games_transformed.withColumn(
    "year",
    when(
        (steam_games_transformed["release_date"] != "") & (steam_games_transformed["release_date"].rlike(r"(\d{4})")),
        regexp_extract(steam_games_transformed["release_date"], r"(\d{4})", 1).cast(IntegerType())
    )
    .otherwise(
        when(
            (steam_games_transformed["release_date"].like("%soon%") | steam_games_transformed["release_date"].like("%Soon%")),
            "TBD"
        )
        .otherwise("Unknown")
    )
)

# Convertir la columna 'year' a tipo entero
steam_games_transformed = steam_games_transformed.withColumn("year", steam_games_transformed["year"].cast(IntegerType()))

# Mostrar el DataFrame resultante
steam_games_transformed.show(truncate=False)

# Obtener valores únicos de la columna 'year' como lista
unique_years = steam_games_transformed.select("year").distinct().orderBy("year")
# Convertir a lista
year_list = [row.year for row in unique_years.collect()]
# Mostrar los valores únicos como un array
print(year_list)



+-----------------------+-----------------------------------------------------------------------+-----------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+------------+-------+-----------------------------+--------------------------------------------------------------------+----+
|publisher              |app_name                                        

In [134]:
# Clean and count null values for each DataFrame
steam_games_transformed = limpiar_nombres_columnas(steam_games_transformed)
valores_vacios_por_columna_steam = count_null_values(steam_games_transformed)
valores_vacios_por_columna_steam.show()

+---------+--------+-----+-----+------------+--------+-----------+---------+-----+------------+-------+---------+----------+----+
|publisher|app_name|title|  url|release_date|tags_str|reviews_url|specs_str|price|early_access|game_id|developer|genres_str|year|
+---------+--------+-----+-----+------------+--------+-----------+---------+-----+------------+-------+---------+----------+----+
|    96362|   88312|90360|88310|           0|   88473|      88312|    88980|    0|       88310|  88312|    91609|     91593|   0|
+---------+--------+-----+-----+------------+--------+-----------+---------+-----+------------+-------+---------+----------+----+



In [139]:
# Get unique values in the "release_date" column as a list
unique_dates = steam_games.select("early_access").distinct().orderBy("early_access")

# Convert to list
date_list = [row.early_access for row in unique_dates.collect()]

# Display the unique values as an array
print(date_list)


['', 'false', 'true']


In [140]:
# Tratar los valores en la columna 'early_access'
steam_games_transformed = steam_games_transformed.withColumn(
    "early_access",
    when(steam_games_transformed["early_access"] == "", -1)
    .when(steam_games_transformed["early_access"] == "false", 0)
    .when(steam_games_transformed["early_access"] == "true", 1)
    .otherwise("Unknown")
)

# Mostrar el DataFrame resultante
steam_games_transformed.show(truncate=False)

# Obtener valores únicos de la columna 'early_access' como lista
unique_access_values = steam_games_transformed.select("early_access").distinct().orderBy("early_access")
# Convertir a lista
access_list = [row.early_access for row in unique_access_values.collect()]
# Mostrar los valores únicos como un array
print(access_list)


# Paso 1: Reemplazar cadenas vacías por 'Unknown'
steam_games_transformed = steam_games_transformed.withColumn("developer",
                                           when(col("developer") == "", "Unknown")
                                           .otherwise(col("developer")))

# Paso 3: Eliminar caracteres especiales y limpiar acentos
steam_games_transformed = steam_games_transformed.withColumn("developer",
                                           lower(regexp_replace(trim(col("developer")),
                                                               "[^a-zA-Z0-9\s]", "")))

+-----------------------+-----------------------------------------------------------------------+-----------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+------------+-------+-----------------------------+--------------------------------------------------------------------+----+
|publisher              |app_name                                        

In [211]:
steam_games_transformed.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- user_url: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- recommend: integer (nullable = true)
 |-- review: string (nullable = true)
 |-- funny: string (nullable = true)
 |-- posted: string (nullable = true)
 |-- last_edited: string (nullable = true)
 |-- helpful: string (nullable = true)
 |-- posted_date: date (nullable = true)
 |-- clean_posted: string (nullable = true)
 |-- clean_date: string (nullable = true)



In [None]:
genres_str

In [209]:
# Obtener valores únicos de la columna 'genres_str' como lista
unique_genres_values = steam_games_transformed.select("genres_str").distinct().orderBy("genres_str")

# Convertir a lista
genres_list = [row.genres_str for row in unique_genres_values.collect()]

# Mostrar los valores únicos como un array
print(genres_list)


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `genres_str` cannot be resolved. Did you mean one of the following? [`item_id`, `review`, `user_id`, `posted`, `user_url`].;
'Project ['genres_str]
+- Project [user_id#20685, user_url#20698, item_id#20711, recommend#20724, review#20737, funny#20750, posted#20763, last_edited#20776, helpful#20789, posted_date#20802, clean_posted#20815, clean_date#20596 AS clean_date#20828]
   +- Project [user_id#20685, user_url#20698, item_id#20711, recommend#20724, review#20737, funny#20750, posted#20763, last_edited#20776, helpful#20789, posted_date#20802, clean_posted#20584 AS clean_posted#20815, clean_date#20596]
      +- Project [user_id#20685, user_url#20698, item_id#20711, recommend#20724, review#20737, funny#20750, posted#20763, last_edited#20776, helpful#20789, posted_date#20610 AS posted_date#20802, clean_posted#20584, clean_date#20596]
         +- Project [user_id#20685, user_url#20698, item_id#20711, recommend#20724, review#20737, funny#20750, posted#20763, last_edited#20776, helpful#13062 AS helpful#20789, posted_date#20610, clean_posted#20584, clean_date#20596]
            +- Project [user_id#20685, user_url#20698, item_id#20711, recommend#20724, review#20737, funny#20750, posted#20763, last_edited#13052 AS last_edited#20776, helpful#13062, posted_date#20610, clean_posted#20584, clean_date#20596]
               +- Project [user_id#20685, user_url#20698, item_id#20711, recommend#20724, review#20737, funny#20750, posted#13042 AS posted#20763, last_edited#13052, helpful#13062, posted_date#20610, clean_posted#20584, clean_date#20596]
                  +- Project [user_id#20685, user_url#20698, item_id#20711, recommend#20724, review#20737, funny#13032 AS funny#20750, posted#13042, last_edited#13052, helpful#13062, posted_date#20610, clean_posted#20584, clean_date#20596]
                     +- Project [user_id#20685, user_url#20698, item_id#20711, recommend#20724, review#20623 AS review#20737, funny#13032, posted#13042, last_edited#13052, helpful#13062, posted_date#20610, clean_posted#20584, clean_date#20596]
                        +- Project [user_id#20685, user_url#20698, item_id#20711, recommend#16389 AS recommend#20724, review#20623, funny#13032, posted#13042, last_edited#13052, helpful#13062, posted_date#20610, clean_posted#20584, clean_date#20596]
                           +- Project [user_id#20685, user_url#20698, item_id#13002 AS item_id#20711, recommend#16389, review#20623, funny#13032, posted#13042, last_edited#13052, helpful#13062, posted_date#20610, clean_posted#20584, clean_date#20596]
                              +- Project [user_id#20685, user_url#12992 AS user_url#20698, item_id#13002, recommend#16389, review#20623, funny#13032, posted#13042, last_edited#13052, helpful#13062, posted_date#20610, clean_posted#20584, clean_date#20596]
                                 +- Project [user_id#12981 AS user_id#20685, user_url#12992, item_id#13002, recommend#16389, review#20623, funny#13032, posted#13042, last_edited#13052, helpful#13062, posted_date#20610, clean_posted#20584, clean_date#20596]
                                    +- Project [user_id#12981, user_url#12992, item_id#13002, recommend#16389, CASE WHEN (isnull(review#13022) OR (review#13022 = )) THEN Unknown ELSE review#13022 END AS review#20623, funny#13032, posted#13042, last_edited#13052, helpful#13062, posted_date#20610, clean_posted#20584, clean_date#20596]
                                       +- Project [user_id#12981, user_url#12992, item_id#13002, recommend#16389, review#13022, funny#13032, posted#13042, last_edited#13052, helpful#13062, cast(to_date(clean_date#20596, Some(MMM d, yyyy.), Some(America/Bogota), false) as date) AS posted_date#20610, clean_posted#20584, clean_date#20596]
                                          +- Project [user_id#12981, user_url#12992, item_id#13002, recommend#16389, review#13022, funny#13032, posted#13042, last_edited#13052, helpful#13062, posted_date#16637, clean_posted#20584, trim(concat(regexp_replace(clean_posted#20584, \.,, ,, 1), .), None) AS clean_date#20596]
                                             +- Project [user_id#12981, user_url#12992, item_id#13002, recommend#16389, review#13022, funny#13032, posted#13042, last_edited#13052, helpful#13062, posted_date#16637, CASE WHEN Contains(clean_posted#20572, ,) THEN clean_posted#20572 ELSE concat(clean_posted#20572, , 2024) END AS clean_posted#20584]
                                                +- Project [user_id#12981, user_url#12992, item_id#13002, recommend#16389, review#13022, funny#13032, posted#13042, last_edited#13052, helpful#13062, posted_date#16637, trim(clean_posted#20559, None) AS clean_posted#20572]
                                                   +- Project [user_id#12981, user_url#12992, item_id#13002, recommend#16389, review#13022, funny#13032, posted#13042, last_edited#13052, helpful#13062, posted_date#16637, regexp_replace(posted#13042, Posted\s*, , 1) AS clean_posted#20559]
                                                      +- Project [user_id#12981, user_url#12992, item_id#13002, recommend#16389, review#13022, funny#13032, posted#13042, last_edited#13052, helpful#13062, regexp_extract(posted#13042, (Posted )?(\w+ \d+, \d{4})\.?, 2) AS posted_date#16637]
                                                         +- Project [user_id#12981, user_url#12992, item_id#13002, cast(recommend#16379 as int) AS recommend#16389, review#13022, funny#13032, posted#13042, last_edited#13052, helpful#13062]
                                                            +- Project [user_id#12981, user_url#12992, item_id#13002, CASE WHEN (recommend#13012 = true) THEN 1 ELSE CASE WHEN (recommend#13012 = false) THEN 0 END END AS recommend#16379, review#13022, funny#13032, posted#13042, last_edited#13052, helpful#13062]
                                                               +- Project [user_id#12981, user_url#12992, item_id#13002, recommend#13012, review#13022, funny#13032, posted#13042, last_edited#13052, helpful#12772 AS helpful#13062]
                                                                  +- Project [user_id#12981, user_url#12992, item_id#13002, recommend#13012, review#13022, funny#13032, posted#13042, last_edited#12771 AS last_edited#13052, helpful#12772]
                                                                     +- Project [user_id#12981, user_url#12992, item_id#13002, recommend#13012, review#13022, funny#13032, posted#12770 AS posted#13042, last_edited#12771, helpful#12772]
                                                                        +- Project [user_id#12981, user_url#12992, item_id#13002, recommend#13012, review#13022, funny#12769 AS funny#13032, posted#12770, last_edited#12771, helpful#12772]
                                                                           +- Project [user_id#12981, user_url#12992, item_id#13002, recommend#13012, review#12768 AS review#13022, funny#12769, posted#12770, last_edited#12771, helpful#12772]
                                                                              +- Project [user_id#12981, user_url#12992, item_id#13002, recommend#12767 AS recommend#13012, review#12768, funny#12769, posted#12770, last_edited#12771, helpful#12772]
                                                                                 +- Project [user_id#12981, user_url#12992, item_id#12766 AS item_id#13002, recommend#12767, review#12768, funny#12769, posted#12770, last_edited#12771, helpful#12772]
                                                                                    +- Project [user_id#12981, user_url#12765 AS user_url#12992, item_id#12766, recommend#12767, review#12768, funny#12769, posted#12770, last_edited#12771, helpful#12772]
                                                                                       +- Project [user_id#12764 AS user_id#12981, user_url#12765, item_id#12766, recommend#12767, review#12768, funny#12769, posted#12770, last_edited#12771, helpful#12772]
                                                                                          +- Relation [user_id#12764,user_url#12765,item_id#12766,recommend#12767,review#12768,funny#12769,posted#12770,last_edited#12771,helpful#12772] parquet


#### Tratamiento `user_reviews` 

In [142]:
print("Modelo de datos de: user_reviews ")
print("Cantidad de registros:", user_reviews.count(), "Cantidad de las Columnas:", len(user_reviews.columns))
print("\n")
user_reviews.printSchema()
print("\n")
user_reviews.show(2, truncate=False)

Modelo de datos de: user_reviews 
Cantidad de registros: 59305 Cantidad de las Columnas: 9


root
 |-- user_id: string (nullable = true)
 |-- user_url: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- recommend: boolean (nullable = true)
 |-- review: string (nullable = true)
 |-- funny: string (nullable = true)
 |-- posted: string (nullable = true)
 |-- last_edited: string (nullable = true)
 |-- helpful: string (nullable = true)



+-------------------+------------------------------------------------+-------+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [143]:
# Obtener valores únicos de la columna 'early_access' como lista
unique_access_values = user_reviews.select("recommend").distinct().orderBy("recommend")
# Convertir a lista
access_list = [row.recommend for row in unique_access_values.collect()]
# Mostrar los valores únicos como un array
print(access_list)

[False, True]


In [144]:
# Función para tratar la columna 'recommend'
def transform_recommend_column(df):
    df_transformed = df.withColumn(
        "recommend",
        when(df["recommend"] == True, 1)
        .otherwise(when(df["recommend"] == False, 0))
    ).withColumn("recommend", col("recommend").cast(IntegerType()))  # Convertir a tipo entero
    return df_transformed

In [145]:
# Aplicar la función al DataFrame de User Reviews
user_reviews_transformed = transform_recommend_column(user_reviews)

# Mostrar el DataFrame resultante
user_reviews_transformed.show(truncate=False)

+-------------------+----------------------------------------------------+-------+---------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [146]:
user_reviews_transformed.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- user_url: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- recommend: integer (nullable = true)
 |-- review: string (nullable = true)
 |-- funny: string (nullable = true)
 |-- posted: string (nullable = true)
 |-- last_edited: string (nullable = true)
 |-- helpful: string (nullable = true)



In [148]:
# Obtener valores únicos de la columna 'posted'
unique_posted_values = user_reviews_transformed.select("posted").distinct().orderBy("posted")
# Convertir a lista
posted_list = [row.posted for row in unique_posted_values.collect()]
# Mostrar los valores únicos como un array
print(posted_list)


['Posted April 1, 2012.', 'Posted April 1, 2013.', 'Posted April 1, 2014.', 'Posted April 1, 2015.', 'Posted April 1.', 'Posted April 10, 2012.', 'Posted April 10, 2013.', 'Posted April 10, 2014.', 'Posted April 10, 2015.', 'Posted April 10.', 'Posted April 11, 2012.', 'Posted April 11, 2013.', 'Posted April 11, 2014.', 'Posted April 11, 2015.', 'Posted April 11.', 'Posted April 12, 2012.', 'Posted April 12, 2013.', 'Posted April 12, 2014.', 'Posted April 12, 2015.', 'Posted April 12.', 'Posted April 13, 2012.', 'Posted April 13, 2013.', 'Posted April 13, 2014.', 'Posted April 13, 2015.', 'Posted April 13.', 'Posted April 14, 2012.', 'Posted April 14, 2013.', 'Posted April 14, 2014.', 'Posted April 14, 2015.', 'Posted April 14.', 'Posted April 15, 2011.', 'Posted April 15, 2012.', 'Posted April 15, 2013.', 'Posted April 15, 2014.', 'Posted April 15, 2015.', 'Posted April 15.', 'Posted April 16, 2012.', 'Posted April 16, 2013.', 'Posted April 16, 2014.', 'Posted April 16, 2015.', 'Poste

In [170]:
# Configura la propiedad para el analizador de fecha
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

In [204]:
from pyspark.sql.functions import regexp_replace, to_date, trim, when, expr, concat, lit, col
from pyspark.sql.types import StringType, DateType
from datetime import datetime

# Obtén el año actual
current_year = datetime.now().year

# Define el formato de fecha que parece tener el campo 'posted'
date_format = 'MMM d, yyyy.'

# Limpia y estandariza el campo 'posted' utilizando expresiones regulares
user_reviews_transformed = user_reviews_transformed.withColumn(
    "clean_posted",
    regexp_replace("posted", r'Posted\s*', '')  # Elimina 'Posted'
)

# Elimina espacios en blanco adicionales
user_reviews_transformed = user_reviews_transformed.withColumn(
    "clean_posted",
    trim(user_reviews_transformed["clean_posted"]).alias("clean_posted")
)

# Agrega el año actual a las fechas que no tienen año
user_reviews_transformed = user_reviews_transformed.withColumn(
    "clean_posted",
    when(user_reviews_transformed["clean_posted"].contains(","), user_reviews_transformed["clean_posted"]).otherwise(expr('concat(clean_posted, ", {}")'.format(current_year)))
)

# Elimina el punto intermedio y agrega un punto al final
user_reviews_transformed = user_reviews_transformed.withColumn(
    "clean_date",
    trim(concat(regexp_replace("clean_posted", r'\.,', ','), lit('.')))
)

# Convierte el campo 'clean_date' a formato de fecha utilizando to_date
user_reviews_transformed = user_reviews_transformed.withColumn(
    "posted_date",
    to_date("clean_date", date_format).cast(DateType()).alias("posted_date")
)

# Reemplaza las cadenas nulas o vacías en la columna 'review' con 'Unknown'
user_reviews_transformed = user_reviews_transformed.withColumn("review", when((col("review").isNull()) | (col("review") == ""), "Unknown").otherwise(col("review")))

# Muestra el DataFrame resultante
user_reviews_transformed.show(truncate=False)

+-------------------+----------------------------------------------------+-------+---------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [205]:
user_reviews_transformed.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- user_url: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- recommend: integer (nullable = true)
 |-- review: string (nullable = true)
 |-- funny: string (nullable = true)
 |-- posted: string (nullable = true)
 |-- last_edited: string (nullable = true)
 |-- helpful: string (nullable = true)
 |-- posted_date: date (nullable = true)
 |-- clean_posted: string (nullable = true)
 |-- clean_date: string (nullable = true)



In [206]:
# Clean and count null values for each DataFrame
steam_games_transformed = limpiar_nombres_columnas(user_reviews_transformed)
valores_vacios_por_columna_steam = count_null_values(steam_games_transformed)
valores_vacios_por_columna_steam.show()

+-------+--------+-------+---------+------+-----+------+-----------+-------+-----------+------------+----------+
|user_id|user_url|item_id|recommend|review|funny|posted|last_edited|helpful|posted_date|clean_posted|clean_date|
+-------+--------+-------+---------+------+-----+------+-----------+-------+-----------+------------+----------+
|      0|       0|      0|        0|     0|51154|     0|      53165|      0|          0|           0|         0|
+-------+--------+-------+---------+------+-----+------+-----------+-------+-----------+------------+----------+



In [None]:
# Tratar los valores en la columna 'early_access'
steam_games_transformed = steam_games_transformed.withColumn(
    "early_access",
    when(steam_games_transformed["early_access"] == "", -1)
    .when(steam_games_transformed["early_access"] == "false", 0)
    .when(steam_games_transformed["early_access"] == "true", 1)
    .otherwise("Unknown")
)

In [None]:
class SentimentAnalyzer(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column
        self.sentiment_column = None  # Nuevo atributo para almacenar la columna de sentimientos

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        analyzer = SentimentIntensityAnalyzer()
        X['sentiment_score'] = X[self.column].astype(str).apply(lambda x: analyzer.polarity_scores(x)['compound'])
        self.sentiment_column = X[['sentiment_score']].copy()  # Almacenar la columna de sentimientos
        return X

class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        for col in self.columns:
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col])
        return X

# Obtener las columnas numéricas y categóricas
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
ordinal_var = ['recommend']
categorical_columns = ["genres_str", "app_name", "developer", "item_name"]
# Crear una nueva variable excluyendo las columnas en ordinal_var
categorical_col_excluded_ordinal = [col for col in categorical_columns if col not in ordinal_var]

# Convertir las columnas categóricas a tipo str
df[categorical_col_excluded_ordinal] = df[categorical_col_excluded_ordinal].astype(str)
# Crear una nueva variable excluyendo la columna 'review'

# Definir las transformaciones para las columnas numéricas
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Definir las transformaciones para las columnas ordinales
ordinal_transformer = MultiColumnLabelEncoder(columns=ordinal_var)

# Definir las transformaciones para las columnas categóricas restantes
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Crear la columna transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('ord', ordinal_transformer, ordinal_var),
        ('cat', categorical_transformer, categorical_col_excluded_ordinal)
    ])

# Añadir el transformador a la pipeline
pipeline = Pipeline(steps=[
    ('sentiment_analyzer', SentimentAnalyzer(column='review')),
    ('preprocessor', preprocessor)
])

# Aplicar la pipeline al DataFrame
transformed_df = pipeline.fit_transform(df)
# Obtener la columna de sentimientos directamente de SentimentAnalyzer
sentiment_transformed = pipeline.named_steps['sentiment_analyzer'].sentiment_column


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.sentiment import SentimentIntensityAnalyzer

class SentimentAnalyzer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.sentiment_column = None  # Nuevo atributo para almacenar la columna de sentimientos

    def fit(self, X, y=None):
        return self

    def transform(self, X, column='review'):
        analyzer = SentimentIntensityAnalyzer()
        X['sentiment_score'] = X[column].astype(str).apply(lambda x: analyzer.polarity_scores(x)['compound'])
        self.sentiment_column = X[['sentiment_score']].copy()  # Almacenar la columna de sentimientos
        return X  # Devolver el DataFrame completo con la nueva columna

    def get_feature_names_out(self, input_features=None):
        return input_features
    
class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
        self.label_encoders = {}

    def fit(self, X, y=None):
        for col in self.columns:
            le = LabelEncoder()
            le.fit(X[col])
            self.label_encoders[col] = le
        return self

    def transform(self, X):
        for col in self.columns:
            le = self.label_encoders[col]
            X[col] = le.transform(X[col])
        return X

    def get_feature_names_out(self, input_features=None):
        return input_features


# Obtener las columnas numéricas y categóricas
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
ordinal_var = ['recommend']
categorical_columns = ["genres_str", "app_name", "developer", "item_name"]
# Crear una nueva variable excluyendo las columnas en ordinal_var
categorical_col_excluded_ordinal = [col for col in categorical_columns if col not in ordinal_var]

# Convertir las columnas categóricas a tipo str
df[categorical_col_excluded_ordinal] = df[categorical_col_excluded_ordinal].astype(str)
# Crear una nueva variable excluyendo la columna 'review'

# Definir las transformaciones para las columnas numéricas
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Definir las transformaciones para las columnas ordinales
ordinal_transformer = MultiColumnLabelEncoder(columns=ordinal_var)

# Definir las transformaciones para las columnas categóricas restantes
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Crear la columna transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('ord', ordinal_transformer, ordinal_var),
        ('cat', categorical_transformer, categorical_col_excluded_ordinal)
    ])

# Añadir el transformador a la pipeline
pipeline = Pipeline(steps=[
    ('sentiment_analyzer', SentimentAnalyzer()),
    ('preprocessor', preprocessor)
])

# Aplicar la pipeline al DataFrame
transformed_df = pipeline.fit_transform(df)

# Convertir la matriz dispersa a un DataFrame de pandas
transformed_df_dense = pd.DataFrame(transformed_df.toarray(), columns=pipeline.named_steps['preprocessor'].get_feature_names_out())

# Mostrar las primeras filas del DataFrame
transformed_df_dense.head(5)