<a href="https://colab.research.google.com/github/fransindi/recomendacion_amazon/blob/master/eda_toys_metatoys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=56bd04ff201d2ea96f2a0e51be3a36807760353845e9da219a1370759606e6d1
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
from google.colab import drive

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Mount Google Drive at /content/drive so the files under it can be read
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Path to the gzipped JSON reviews file inside the shared Drive folder
json_file_path = '/content/drive/My Drive/Version 2018/Toys_and_Games.json.gz'

# Create (or reuse) the Spark session used by the rest of the notebook
spark = (
    SparkSession.builder
    .appName("Lectura_de_Archivo")
    .getOrCreate()
)


In [6]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, BooleanType

In [7]:
# Explicit schema for the reviews file: (column name, Spark type) pairs.
# Every field is declared nullable.
campos_review = [
    ("asin", StringType()),
    ("image", StringType()),
    ("overall", FloatType()),
    ("reviewText", StringType()),
    ("reviewTime", StringType()),
    ("reviewerID", StringType()),
    ("reviewerName", StringType()),
    ("style", StringType()),
    ("summary", StringType()),
    ("unixReviewTime", IntegerType()),
    ("verified", BooleanType()),
    ("vote", StringType()),
]
custom_schema = StructType(
    [StructField(nombre, tipo, nullable=True) for nombre, tipo in campos_review]
)

# Read the JSON file with the schema above instead of letting Spark infer one
df = spark.read.schema(custom_schema).json(json_file_path)

In [8]:
# Print a preview of the raw reviews DataFrame
df.show()

+----------+--------------------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------------+--------------+--------+----+
|      asin|               image|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|               style|             summary|unixReviewTime|verified|vote|
+----------+--------------------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------------+--------------+--------+----+
|0020232233|                NULL|    2.0|When it comes to ...|09 22, 2016|A1IDMI31WEANAF|      Mackenzie Kent|                NULL|The fact that 50%...|    1474502400|   false|  12|
|0020232233|                NULL|    1.0|An Open Letter to...|09 18, 2016| A4BCEVVZ4Y3V3|  Jonathan Christian|                NULL|Another worthless...|    1474156800|   false|  21|
|0020232233|                NULL|    3.0|Nice art, nice pr...|09 12, 2016|A2EZ9PY1IHHBX0| 

In [9]:
# Total number of review rows loaded
df.count()

8201231

In [10]:
from pyspark.sql.functions import from_unixtime, year

# Convertir el tiempo Unix (unixReviewTime) en fecha y extraer el año para realizar el filtrado

In [11]:
# Turn the epoch-seconds column into a 'yyyy-MM-dd HH:mm:ss' timestamp string.
# The column is referenced with the exact casing declared in the schema
# ("unixReviewTime"), so the lookup does not depend on Spark's default
# case-insensitive column resolution.
# NOTE: from_unixtime renders the value in the Spark session time zone.
df = df.withColumn("review_date", from_unixtime(df["unixReviewTime"]))

In [12]:
# Extract the calendar year of each review; used below for the year filter
df = df.withColumn("review_year", year(df.review_date))

In [13]:
# The original time columns are redundant once review_date / review_year exist.
# Names use the schema's exact casing: DataFrame.drop silently ignores names it
# cannot resolve, so a casing mismatch would leave the column in place under
# case-sensitive resolution.
df = df.drop("unixReviewTime", "reviewTime")

In [14]:
# Preview the DataFrame with the new review_date / review_year columns
df.show()

+----------+--------------------+-------+--------------------+--------------+--------------------+--------------------+--------------------+--------+----+-------------------+-----------+
|      asin|               image|overall|          reviewText|    reviewerID|        reviewerName|               style|             summary|verified|vote|        review_date|review_year|
+----------+--------------------+-------+--------------------+--------------+--------------------+--------------------+--------------------+--------+----+-------------------+-----------+
|0020232233|                NULL|    2.0|When it comes to ...|A1IDMI31WEANAF|      Mackenzie Kent|                NULL|The fact that 50%...|   false|  12|2016-09-22 00:00:00|       2016|
|0020232233|                NULL|    1.0|An Open Letter to...| A4BCEVVZ4Y3V3|  Jonathan Christian|                NULL|Another worthless...|   false|  21|2016-09-18 00:00:00|       2016|
|0020232233|                NULL|    3.0|Nice art, nice pr...|A2E

# Filtrado del df entre 2004 y 2014

In [15]:
# Keep only reviews written from 2004 through 2014 (both bounds inclusive)
anio_en_rango = df["review_year"].between(2004, 2014)
df_filtrado = df.where(anio_en_rango)

# Al filtrar entre 2004 y 2014 pasamos de ~8,2M de registros a ~2,4M de registros.

In [16]:
# Number of reviews remaining after the year filter
df_filtrado.count()

2393317

In [17]:
# Preview the year-filtered reviews
df_filtrado.show()

+----------+-----+-------+--------------------+--------------+--------------------+--------------------+--------------------+--------+----+-------------------+-----------+
|      asin|image|overall|          reviewText|    reviewerID|        reviewerName|               style|             summary|verified|vote|        review_date|review_year|
+----------+-----+-------+--------------------+--------------+--------------------+--------------------+--------------------+--------+----+-------------------+-----------+
|0486277577| NULL|    4.0|Pretty good book ...| AGCAAWP1AVVZR|       M. P. Schiesl|{"Format:":" Pape...|           Good book|   false|  13|2004-03-16 00:00:00|       2004|
|0486402029| NULL|    1.0|I don't know how ...|A1X9QQFMPGDW70|          Justabuyer|{"Format:":" Pape...|        Don't bother|   false|   2|2007-09-21 00:00:00|       2007|
|0486402029| NULL|    1.0|These are cute ta...|A1NTAPB1XPB6KK|            Mandible|{"Format:":" Pape...|The worst value I...|   false|   2|2

In [18]:
# Columns removed from the filtered reviews
columnas_descartadas = ['image', 'style', 'vote']
df_filtrado = df_filtrado.drop(*columnas_descartadas)

In [19]:
# Preview the filtered reviews after dropping image, style and vote
df_filtrado.show()

+----------+-------+--------------------+--------------+--------------------+--------------------+--------+-------------------+-----------+
|      asin|overall|          reviewText|    reviewerID|        reviewerName|             summary|verified|        review_date|review_year|
+----------+-------+--------------------+--------------+--------------------+--------------------+--------+-------------------+-----------+
|0486277577|    4.0|Pretty good book ...| AGCAAWP1AVVZR|       M. P. Schiesl|           Good book|   false|2004-03-16 00:00:00|       2004|
|0486402029|    1.0|I don't know how ...|A1X9QQFMPGDW70|          Justabuyer|        Don't bother|   false|2007-09-21 00:00:00|       2007|
|0486402029|    1.0|These are cute ta...|A1NTAPB1XPB6KK|            Mandible|The worst value I...|   false|2007-01-16 00:00:00|       2007|
|0486402029|    2.0|While my 3 year o...|A34S86H4OJGGVR|An Aerospace Engi...|  Not enough product|    true|2007-01-12 00:00:00|       2007|
|0486402029|    4.0|

In [20]:
from pyspark.sql.functions import col, sum

In [21]:
# Build one aggregate per column: NULL flags cast to 0/1 and summed, so a
# single pass yields the NULL count of every column.
totales_nulos = [
    sum(col(nombre).isNull().cast("int")).alias(nombre)
    for nombre in df_filtrado.columns
]
null_counts = df_filtrado.select(totales_nulos)
null_counts.show()

+----+-------+----------+----------+------------+-------+--------+-----------+-----------+
|asin|overall|reviewText|reviewerID|reviewerName|summary|verified|review_date|review_year|
+----+-------+----------+----------+------------+-------+--------+-----------+-----------+
|   0|      0|       278|         0|         118|    194|       0|          0|          0|
+----+-------+----------+----------+------------+-------+--------+-----------+-----------+



In [22]:
# Drop every row that has a NULL in any column
df_sin_nulos = df_filtrado.na.drop()

In [24]:
# Re-check NULLs per column after the drop: flags cast to 0/1 and summed
totales_nulos = [
    sum(col(nombre).isNull().cast("int")).alias(nombre)
    for nombre in df_sin_nulos.columns
]
null_counts = df_sin_nulos.select(totales_nulos)
null_counts.show()

+----+-------+----------+----------+------------+-------+--------+-----------+-----------+
|asin|overall|reviewText|reviewerID|reviewerName|summary|verified|review_date|review_year|
+----+-------+----------+----------+------------+-------+--------+-----------+-----------+
|   0|      0|         0|         0|           0|      0|       0|          0|          0|
+----+-------+----------+----------+------------+-------+--------+-----------+-----------+



In [26]:
# Preview the reviews after removing rows with NULLs
df_sin_nulos.show()

+----------+-------+--------------------+--------------+--------------------+--------------------+--------+-------------------+-----------+
|      asin|overall|          reviewText|    reviewerID|        reviewerName|             summary|verified|        review_date|review_year|
+----------+-------+--------------------+--------------+--------------------+--------------------+--------+-------------------+-----------+
|0486277577|    4.0|Pretty good book ...| AGCAAWP1AVVZR|       M. P. Schiesl|           Good book|   false|2004-03-16 00:00:00|       2004|
|0486402029|    1.0|I don't know how ...|A1X9QQFMPGDW70|          Justabuyer|        Don't bother|   false|2007-09-21 00:00:00|       2007|
|0486402029|    1.0|These are cute ta...|A1NTAPB1XPB6KK|            Mandible|The worst value I...|   false|2007-01-16 00:00:00|       2007|
|0486402029|    2.0|While my 3 year o...|A34S86H4OJGGVR|An Aerospace Engi...|  Not enough product|    true|2007-01-12 00:00:00|       2007|
|0486402029|    4.0|

# No encontramos outliers en la columna overall: solo toma valores de 1.0 a 5.0

In [25]:
# List the distinct rating values present in 'overall'
valores_overall = df_sin_nulos.select('overall').distinct()
valores_overall.show()

+-------+
|overall|
+-------+
|    5.0|
|    2.0|
|    3.0|
|    1.0|
|    4.0|
+-------+



# Completados el análisis y la limpieza de Toys Reviews, pasamos a la metadata

In [27]:
# Product metadata file; no schema is supplied, so Spark infers it from the JSON
meta_json_path = '/content/drive/My Drive/Version 2018/meta_Toys_and_Games.json.gz'
df_meta = spark.read.json(meta_json_path)

In [28]:
# Preview the raw metadata DataFrame
df_meta.show()

+--------------------+--------------------+----------+--------------------+--------------------+----+--------------------+-------+--------------------+---+--------------------+--------------------+------------+--------------------+--------------------+--------------------+-----+-----+--------------------+
|            also_buy|           also_view|      asin|               brand|            category|date|         description|details|             feature|fit|            imageURL|     imageURLHighRes|    main_cat|               price|                rank|        similar_item|tech1|tech2|               title|
+--------------------+--------------------+----------+--------------------+--------------------+----+--------------------+-------+--------------------+---+--------------------+--------------------+------------+--------------------+--------------------+--------------------+-----+-----+--------------------+
|                  []|                  []|0000191639|           Dr. Seuss|[Toy

In [29]:
# NULL count per metadata column: flags cast to 0/1 and summed in one pass
totales_nulos = [
    sum(col(nombre).isNull().cast("int")).alias(nombre)
    for nombre in df_meta.columns
]
null_counts = df_meta.select(totales_nulos)
null_counts.show()

+--------+---------+----+-----+--------+----+-----------+-------+-------+---+--------+---------------+--------+-----+----+------------+-----+-----+-----+
|also_buy|also_view|asin|brand|category|date|description|details|feature|fit|imageURL|imageURLHighRes|main_cat|price|rank|similar_item|tech1|tech2|title|
+--------+---------+----+-----+--------+----+-----------+-------+-------+---+--------+---------------+--------+-----+----+------------+-----+-----+-----+
|       0|        0|   0|    0|       0|   0|          0|   1462|      0|  0|       0|              0|       0|    0|   0|           0|    0|    0|    0|
+--------+---------+----+-----+--------+----+-----------+-------+-------+---+--------+---------------+--------+-----+----+------------+-----+-----+-----+



In [30]:
# Total number of metadata rows
df_meta.count()

633883

In [31]:
# Remove metadata rows that contain a NULL in any column
df_meta = df_meta.na.drop()

In [32]:
# Preview the metadata after removing rows with NULLs
df_meta.show()

+--------------------+--------------------+----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+---+--------------------+--------------------+-----------------+------+--------------------+--------------------+-----+-----+--------------------+
|            also_buy|           also_view|      asin|              brand|            category|date|         description|             details|             feature|fit|            imageURL|     imageURLHighRes|         main_cat| price|                rank|        similar_item|tech1|tech2|               title|
+--------------------+--------------------+----------+-------------------+--------------------+----+--------------------+--------------------+--------------------+---+--------------------+--------------------+-----------------+------+--------------------+--------------------+-----+-----+--------------------+
|                  []|                  []|6306203230|     The Bookleg

In [33]:
# Metadata columns removed at this step
columnas_meta_descartadas = [
    'date', 'details', 'fit', 'imageURL', 'imageURLHighRes', 'tech1', 'tech2',
]
df_meta = df_meta.drop(*columnas_meta_descartadas)

In [34]:
# Preview the metadata with the reduced column set
df_meta.show()

+--------------------+--------------------+----------+-------------------+--------------------+--------------------+--------------------+-----------------+------+--------------------+--------------------+--------------------+
|            also_buy|           also_view|      asin|              brand|            category|         description|             feature|         main_cat| price|                rank|        similar_item|               title|
+--------------------+--------------------+----------+-------------------+--------------------+--------------------+--------------------+-----------------+------+--------------------+--------------------+--------------------+
|                  []|                  []|6306203230|     The Booklegger|[Toys & Games, Ga...|[Clem Darracott w...|[Made by The Book...|Sports & Outdoors|$28.45|1,890,827 in Spor...|                    |Ben Hogan: In Pur...|
|                  []|                  []|6798648312|              CHXWW|                  []| 

In [35]:
from pyspark.sql.functions import when, size

In [40]:
from pyspark.sql.functions import udf

In [41]:
# UDF that flags zero-length values so they can be summed per column
@udf(IntegerType())
def contar_listas_vacias(lista):
    """Return 1 when ``lista`` is empty and 0 otherwise.

    Args:
        lista: Value of a single cell; any object supporting ``len()``, or
            ``None`` when the cell is NULL.

    Returns:
        int: 1 if ``len(lista) == 0``; 0 for non-empty values and for ``None``.
    """
    # A NULL cell reaches a Python UDF as None; len(None) would raise a
    # TypeError and abort the whole Spark job, so NULL is reported as "not empty".
    if lista is None:
        return 0
    return int(len(lista) == 0)



In [42]:
# Count the rows whose 'also_buy' list is empty.
# The 0/1 flag lives in a throwaway projection, so df_meta is not overwritten
# with a scratch 'listas_vacias' column just to compute one number.
cantidad_listas_vacias = (
    df_meta
    .select(contar_listas_vacias(df_meta["also_buy"]).alias("listas_vacias"))
    .selectExpr("sum(listas_vacias)")
    .collect()[0][0]
)

# Also_buy tiene la gran mayoría de sus listas vacías.

In [43]:
# Empty-list count for also_buy
cantidad_listas_vacias

485840

# Also_view tiene menos listas vacías que also_buy, pero siguen siendo la gran mayoría

In [44]:
# Count the rows whose 'also_view' list is empty.
# The 0/1 flag lives in a throwaway projection, so df_meta is not overwritten
# with a scratch 'listas_vacias' column just to compute one number.
cantidad_listas_vacias = (
    df_meta
    .select(contar_listas_vacias(df_meta["also_view"]).alias("listas_vacias"))
    .selectExpr("sum(listas_vacias)")
    .collect()[0][0]
)

In [45]:
# Empty-list count for also_view
cantidad_listas_vacias

443460

# Category tiene pocas listas vacías en comparación con also_view

In [47]:
# Count the rows whose 'category' list is empty.
# The 0/1 flag lives in a throwaway projection, so df_meta is not overwritten
# with a scratch 'listas_vacias' column just to compute one number.
cantidad_listas_vacias = (
    df_meta
    .select(contar_listas_vacias(df_meta["category"]).alias("listas_vacias"))
    .selectExpr("sum(listas_vacias)")
    .collect()[0][0]
)

In [49]:
# Empty-list count for category
cantidad_listas_vacias

62056

In [51]:
# Count the rows whose 'description' list is empty.
# The 0/1 flag lives in a throwaway projection, so df_meta is not overwritten
# with a scratch 'listas_vacias' column just to compute one number.
cantidad_listas_vacias = (
    df_meta
    .select(contar_listas_vacias(df_meta["description"]).alias("listas_vacias"))
    .selectExpr("sum(listas_vacias)")
    .collect()[0][0]
)

In [52]:
# Display the count computed in the previous cell
cantidad_listas_vacias

81925

In [53]:
# Count the rows whose 'feature' list is empty.
# The 0/1 flag lives in a throwaway projection, so df_meta is not overwritten
# with a scratch 'listas_vacias' column just to compute one number.
cantidad_listas_vacias = (
    df_meta
    .select(contar_listas_vacias(df_meta["feature"]).alias("listas_vacias"))
    .selectExpr("sum(listas_vacias)")
    .collect()[0][0]
)

In [54]:
# Display the count computed in the previous cell
cantidad_listas_vacias

123336

In [56]:
# Count the rows whose 'rank' value has length zero.
# The 0/1 flag lives in a throwaway projection, so df_meta is not overwritten
# with a scratch 'listas_vacias' column just to compute one number.
cantidad_listas_vacias = (
    df_meta
    .select(contar_listas_vacias(df_meta["rank"]).alias("listas_vacias"))
    .selectExpr("sum(listas_vacias)")
    .collect()[0][0]
)

In [57]:
# Display the count computed in the previous cell
cantidad_listas_vacias

0

# Borro las columnas also_buy y also_view por la gran cantidad de datos faltantes, y listas_vacias porque ya no la necesito

In [60]:
# Remove the mostly-empty also_buy / also_view columns and the helper flag column
columnas_sobrantes = ['also_buy', 'also_view', 'listas_vacias']
df_meta = df_meta.drop(*columnas_sobrantes)

In [68]:
# Preview the metadata after dropping also_buy, also_view and listas_vacias
df_meta.show()

+----------+-------------------+--------------------+--------------------+--------------------+-----------------+------+--------------------+--------------------+--------------------+
|      asin|              brand|            category|         description|             feature|         main_cat| price|                rank|        similar_item|               title|
+----------+-------------------+--------------------+--------------------+--------------------+-----------------+------+--------------------+--------------------+--------------------+
|6306203230|     The Booklegger|[Toys & Games, Ga...|[Clem Darracott w...|[Made by The Book...|Sports & Outdoors|$28.45|1,890,827 in Spor...|                    |Ben Hogan: In Pur...|
|6798648312|              CHXWW|                  []|                  []|                  []|     Toys & Games|      |[">#2,100,223 in ...|                    |CHXWW Women's Sim...|
|687567453X|           New toys|                  []|                  []|      

In [69]:
# Preview the metadata again (same call as the previous cell)
df_meta.show()

+----------+-------------------+--------------------+--------------------+--------------------+-----------------+------+--------------------+--------------------+--------------------+
|      asin|              brand|            category|         description|             feature|         main_cat| price|                rank|        similar_item|               title|
+----------+-------------------+--------------------+--------------------+--------------------+-----------------+------+--------------------+--------------------+--------------------+
|6306203230|     The Booklegger|[Toys & Games, Ga...|[Clem Darracott w...|[Made by The Book...|Sports & Outdoors|$28.45|1,890,827 in Spor...|                    |Ben Hogan: In Pur...|
|6798648312|              CHXWW|                  []|                  []|                  []|     Toys & Games|      |[">#2,100,223 in ...|                    |CHXWW Women's Sim...|
|687567453X|           New toys|                  []|                  []|      

In [71]:
from pyspark.sql import functions as F

# Columns in which empty strings ("") are counted
columnas_a_verificar = ["brand", "main_cat", "price", "similar_item", "title"]

# A single aggregation produces every count at once, instead of launching a
# separate filter + count job (a full scan of df_meta) for each column.
# Rows where the comparison is NULL fall into otherwise(0) and are not counted,
# matching the behaviour of filter(col == "").
fila_conteos = df_meta.agg(
    *[
        F.sum(F.when(F.col(columna) == "", 1).otherwise(0)).alias(columna)
        for columna in columnas_a_verificar
    ]
).first()

# Map column name -> number of empty strings. sum() yields NULL when df_meta
# has no rows, so fall back to 0 in that case.
strings_vacios_por_columna = {
    columna: fila_conteos[columna] or 0 for columna in columnas_a_verificar
}



In [72]:
# Report the empty-string count of each checked column, one line per column
for nombre_columna in strings_vacios_por_columna:
    total_vacios = strings_vacios_por_columna[nombre_columna]
    print(f"Cantidad de strings vacíos en '{nombre_columna}': {total_vacios}")

Cantidad de strings vacíos en 'brand': 6839
Cantidad de strings vacíos en 'main_cat': 4694
Cantidad de strings vacíos en 'price': 312408
Cantidad de strings vacíos en 'similar_item': 430211
Cantidad de strings vacíos en 'title': 21


# Trabajo con una copia de df_meta para no afectar al df original

In [73]:
# Working handle on df_meta under the alias "df_copia"; the filters below are
# assigned to df_copia only, so the name df_meta keeps pointing at the unfiltered data
df_copia = df_meta.alias("df_copia")

In [75]:
# Columns that must hold a non-NULL, non-empty string for a row to be kept
# (extend this list to clean more columns)
columnas_a_limpiar = ["brand", "main_cat", "price", "similar_item", "title"]

# Apply one filter per column; a row survives only if it passes all of them
for nombre in columnas_a_limpiar:
    tiene_valor = col(nombre).isNotNull()
    no_es_vacio = col(nombre) != ""
    df_copia = df_copia.where(tiene_valor & no_es_vacio)

In [77]:
# Rows left once every listed column is non-NULL and non-empty
df_copia.count()

155539

In [87]:
# Preview the fully populated rows
df_copia.show()

+----------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+
|      asin|               brand|            category|         description|             feature|    main_cat|               price|                rank|        similar_item|               title|
+----------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+
|7115294968|        Kai Tai Inc.|[Toys & Games, Ga...|[30 Days Uncondit...|[30 Days Uncondit...|Toys & Games|              $79.99|[">#332,692 in To...| class="a-bordere...|American Mahjong ...|
|7305081566|             KT Inc.|[Toys & Games, Ga...|[30 Days Uncondit...|[30 Days Uncondit...|Toys & Games|              $65.99|[">#154,220 in To...| class="a-bordere...|4 Pushers + Brand...|
|7842955824|   Gifts by Lulee/

In [82]:
# Fetch just the first row of the cleaned copy
primera_fila = df_copia.take(1)[0]

# Pull out the cell to inspect in full (here: the 'similar_item' column)
valor_celda = primera_fila["similar_item"]

In [84]:
# Display the full, untruncated content of the selected cell
valor_celda

' class="a-bordered a-horizontal-stripes  a-spacing-extra-large a-size-base comparison_table">\n\n\n\n            \n            \n            \n            \n            \n            <tr class="comparison_table_image_row">\n                <td class="comparison_table_first_col"></td>\n\n\n                <th class="comparison_image_title_cell" role="columnheader">\n                    <div class="a-row a-spacing-top-micro">\n                        <center>\n                             <img alt="American Mahjong Set with 4 PUSHERS / RACKS COMBO in ALUMINUM CASE (WESTERN Mah jongg 166 Tiles)" src="https://images-na.ssl-images-amazon.com/images/I/41Dv6SqayYL._SL500_AC_SS350_.jpg" id="comparison_image">\n                        </center>\n                    </div>\n                    <div class="a-row a-spacing-top-small">\n                        <div id="comparison_title" class="a-section a-spacing-none">\n                            <span aria-hidden="true" class="a-size-base a-col

# Después de ver el contenido de similar_item (código HTML), lo elimino por su formato y por la gran cantidad de valores faltantes

In [85]:
# Remove the similar_item column from the metadata
df_meta = df_meta.drop('similar_item')

In [86]:
# Preview the metadata without similar_item
df_meta.show()

+----------+-------------------+--------------------+--------------------+--------------------+-----------------+------+--------------------+--------------------+
|      asin|              brand|            category|         description|             feature|         main_cat| price|                rank|               title|
+----------+-------------------+--------------------+--------------------+--------------------+-----------------+------+--------------------+--------------------+
|6306203230|     The Booklegger|[Toys & Games, Ga...|[Clem Darracott w...|[Made by The Book...|Sports & Outdoors|$28.45|1,890,827 in Spor...|Ben Hogan: In Pur...|
|6798648312|              CHXWW|                  []|                  []|                  []|     Toys & Games|      |[">#2,100,223 in ...|CHXWW Women's Sim...|
|687567453X|           New toys|                  []|                  []|                  []|     Toys & Games|      |[">#925,302 in To...|Jurassic World Di...|
|7040202328|          

# Pendiente: revisar la columna price por la enorme cantidad de valores faltantes

# Guardamos como parquet para ver el tamaño

In [89]:
ruta_del_archivo = "/content/drive/My Drive/archivos_pf/df_toys_limpio.parquet"

# Save the cleaned reviews as Parquet. mode("overwrite") replaces the output of
# a previous run; with the default save mode the write raises an error when the
# path already exists, so the cell could not be re-executed.
df_sin_nulos.write.mode("overwrite").parquet(ruta_del_archivo)

# Read the file back to confirm it was written correctly
df_sin_nulos_parquet = spark.read.parquet(ruta_del_archivo)

# Preview the DataFrame loaded from Parquet
df_sin_nulos_parquet.show()

+----------+-------+--------------------+--------------+--------------------+--------------------+--------+-------------------+-----------+
|      asin|overall|          reviewText|    reviewerID|        reviewerName|             summary|verified|        review_date|review_year|
+----------+-------+--------------------+--------------+--------------------+--------------------+--------+-------------------+-----------+
|0486277577|    4.0|Pretty good book ...| AGCAAWP1AVVZR|       M. P. Schiesl|           Good book|   false|2004-03-16 00:00:00|       2004|
|0486402029|    1.0|I don't know how ...|A1X9QQFMPGDW70|          Justabuyer|        Don't bother|   false|2007-09-21 00:00:00|       2007|
|0486402029|    1.0|These are cute ta...|A1NTAPB1XPB6KK|            Mandible|The worst value I...|   false|2007-01-16 00:00:00|       2007|
|0486402029|    2.0|While my 3 year o...|A34S86H4OJGGVR|An Aerospace Engi...|  Not enough product|    true|2007-01-12 00:00:00|       2007|
|0486402029|    4.0|

In [88]:
ruta_del_archivo = "/content/drive/My Drive/archivos_pf/df_meta.parquet"

# Save the cleaned metadata as Parquet. mode("overwrite") replaces the output
# of a previous run; with the default save mode the write raises an error when
# the path already exists, so the cell could not be re-executed.
df_meta.write.mode("overwrite").parquet(ruta_del_archivo)

# Read the file back to confirm it was written correctly
df_meta_parquet = spark.read.parquet(ruta_del_archivo)

# Preview the DataFrame loaded from Parquet
df_meta_parquet.show()

+----------+-------------------+--------------------+--------------------+--------------------+-----------------+------+--------------------+--------------------+
|      asin|              brand|            category|         description|             feature|         main_cat| price|                rank|               title|
+----------+-------------------+--------------------+--------------------+--------------------+-----------------+------+--------------------+--------------------+
|6306203230|     The Booklegger|[Toys & Games, Ga...|[Clem Darracott w...|[Made by The Book...|Sports & Outdoors|$28.45|1,890,827 in Spor...|Ben Hogan: In Pur...|
|6798648312|              CHXWW|                  []|                  []|                  []|     Toys & Games|      |[">#2,100,223 in ...|CHXWW Women's Sim...|
|687567453X|           New toys|                  []|                  []|                  []|     Toys & Games|      |[">#925,302 in To...|Jurassic World Di...|
|7040202328|          