In [7]:
from faker import Faker
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import count, avg, sum, min, max, round

In [8]:
# Fixed seed so the synthetic members are the same on every Restart & Run All.
SEED = 42

# Faker generator configured with the es_MX locale; used by the cells below
# to produce names and random field values.
fake = Faker(locale="es_MX")
# Give this generator its own seeded random source so the generated data is
# reproducible and independent of any other Faker instance in the kernel.
fake.seed_instance(SEED)

# Explicit schema for the members DataFrame. Every column is nullable;
# `salario` is a double, so salary values must be supplied as floats.
schema = StructType([
    StructField("nombre", StringType(), True),
    StructField("apellido", StringType(), True),
    StructField("genero", StringType(), True),
    StructField("edad", IntegerType(), True),
    StructField("distrito", StringType(), True),
    StructField("salario", DoubleType(), True),
    StructField("categoria", StringType(), True)
])

# Reuse an already-running session if the kernel has one, otherwise start it.
spark = SparkSession.builder.appName("createDataFrameOverview4").getOrCreate()

Setting Spark log level to "ERROR".


In [9]:
def crear_socios(numero_socios, genero_socio):
    """Generate a list of fake member rows for a single gender.

    Each element is a ``Row`` with the fields ``nombre``, ``apellido``,
    ``genero``, ``edad``, ``distrito``, ``salario`` and ``categoria``, filled
    with values drawn from the notebook-level ``fake`` generator.

    Args:
        numero_socios: Number of rows to generate. Zero or a negative value
            yields an empty list.
        genero_socio: Either ``"Masculino"`` or ``"Femenino"``. Selects which
            first-name generator is used and is stored verbatim in ``genero``.

    Returns:
        A list of ``numero_socios`` ``Row`` objects.

    Raises:
        ValueError: If ``genero_socio`` is not one of the supported values.
            Previously an unsupported value silently produced an empty list,
            which hid typos such as ``"masculino"``.
    """
    # Name of the Faker method that produces a first name for each gender.
    metodo_nombre_por_genero = {
        "Masculino": "first_name_male",
        "Femenino": "first_name_female",
    }
    if genero_socio not in metodo_nombre_por_genero:
        raise ValueError(
            "genero_socio must be one of {}, got {!r}".format(
                sorted(metodo_nombre_por_genero), genero_socio
            )
        )

    metodo_nombre = metodo_nombre_por_genero[genero_socio]
    distritos = ("Moyobamba", "Calzada", "Habana", "Jepelacio", "Soritor", "Yantalo")
    categorias = ("Rojo", "Azul", "Amarillo", "Verde")

    # Keyword order matches the original so the random draws happen in the
    # same sequence (nombre, apellido, edad, distrito, salario, categoria).
    return [
        Row(
            nombre=getattr(fake, metodo_nombre)(),
            apellido=fake.last_name(),
            genero=genero_socio,
            edad=fake.random_int(min=18, max=99, step=1),
            distrito=fake.random_element(elements=distritos),
            # The schema declares salario as a double, so convert the int draw.
            salario=float(fake.random_int(min=600, max=9000, step=1)),
            categoria=fake.random_element(elements=categorias),
        )
        for _ in range(numero_socios)
    ]

In [10]:
# Build one DataFrame per gender (250 male members first, then 100 female),
# each one created from freshly generated rows and the explicit schema.
df_socios_hombres, df_socios_mujeres = (
    spark.createDataFrame(crear_socios(cantidad, genero), schema)
    for cantidad, genero in ((250, "Masculino"), (100, "Femenino"))
)

# Stack both frames into a single members DataFrame; the male rows come first.
df_socios = df_socios_hombres.union(df_socios_mujeres)

# Preview the combined data.
df_socios.show()

+--------+---------+---------+----+---------+-------+---------+
|  nombre| apellido|   genero|edad| distrito|salario|categoria|
+--------+---------+---------+----+---------+-------+---------+
|   Paola|Feliciano|Masculino|  35|Jepelacio| 6517.0|    Verde|
|   Juana|  Casares|Masculino|  48|Moyobamba| 8268.0|     Azul|
|   Tomás|  Borrego|Masculino|  34|  Soritor| 8003.0| Amarillo|
|Fernando|    Marín|Masculino|  19|Moyobamba| 3263.0|     Azul|
|  Samuel|   Valles|Masculino|  49|  Calzada| 6771.0|     Rojo|
| Vicente|  Apodaca|Masculino|  56|  Yantalo| 6986.0|    Verde|
|  Felipe|   Tirado|Masculino|  40|Jepelacio|  654.0| Amarillo|
| Lorenzo|   Orozco|Masculino|  62|  Calzada| 2266.0| Amarillo|
|  Genaro|    Tamez|Masculino|  37|  Yantalo| 2673.0|     Azul|
| Alberto|  Alvarez|Masculino|  61|Moyobamba| 1119.0|     Azul|
|   Felix|  Saucedo|Masculino|  97|Moyobamba| 6455.0|     Azul|
|    José|    Matos|Masculino|  36|Jepelacio| 1817.0|     Rojo|
|  Olivia| Esquivel|Masculino|  42|Moyob

                                                                                

In [11]:
# Per-district summary: member count, average age, and salary total / average /
# minimum / maximum. Averages and the total are rounded to two decimals.
# NOTE: count, avg, sum, min, max and round here are the pyspark.sql.functions
# imported at the top of the notebook, not the Python builtins.
resumen_por_distrito = df_socios.groupBy("distrito").agg(
    count("distrito").alias("cantidad"),
    round(avg("edad"), 2).alias("promedio_edad"),
    round(sum("salario"), 2).alias("total_salario"),
    round(avg("salario"), 2).alias("promedio_salario"),
    min("salario").alias("min_salario"),
    max("salario").alias("max_salario"),
)
resumen_por_distrito.show()

+---------+--------+-------------+-------------+----------------+-----------+-----------+
| distrito|cantidad|promedio_edad|total_salario|promedio_salario|min_salario|max_salario|
+---------+--------+-------------+-------------+----------------+-----------+-----------+
|  Calzada|      61|        60.16|     304435.0|         4990.74|      619.0|     8771.0|
|   Habana|      55|        56.91|     239480.0|         4354.18|      724.0|     8584.0|
|  Yantalo|      62|         53.9|     305924.0|         4934.26|      752.0|     8926.0|
|  Soritor|      61|        62.13|     261255.0|         4282.87|      619.0|     8956.0|
|Jepelacio|      48|         56.9|     241158.0|         5024.13|      654.0|     8986.0|
|Moyobamba|      63|        55.22|     305208.0|         4844.57|      930.0|     8990.0|
+---------+--------+-------------+-------------+----------------+-----------+-----------+



In [12]:
# Stop the SparkSession now that the analysis is finished; cells that use
# `spark` or the DataFrames built from it cannot be re-run after this point
# without first re-creating the session.
spark.stop()