# DataFrame

In [12]:
from pyspark.sql import SparkSession, Row
from faker import Faker

In [13]:
# getOrCreate() hands back the already-active session when there is one,
# otherwise it starts a new session registered under this application name.
spark = (
    SparkSession.builder
    .appName("createDataFrameOverview3")
    .getOrCreate()
)

In [14]:
# Shared generator for every fake value in this notebook, using the es_MX locale.
fake = Faker(locale="es_MX")
# Seed the generator so that Restart Kernel -> Run All reproduces the same rows;
# without a seed each fresh run yields different data.
Faker.seed(42)

def create_employees(num_employees):
    """Build a Spark DataFrame holding ``num_employees`` fake employee records.

    Every record is a dict with the keys ``first_name``, ``last_name``,
    ``job``, ``department``, ``role``, ``salary`` and ``email``, filled in by
    the module-level ``fake`` generator. The list of dicts is handed to
    ``spark.createDataFrame`` without an explicit schema.

    Args:
        num_employees: Number of records (rows) to generate.

    Returns:
        Whatever ``spark.createDataFrame`` returns for the generated records.
    """
    departments = ("IT", "HR", "Marketing", "Finance")
    roles = ("Manager", "Developer", "Analyst", "Associate")

    # range(num_employees), not range(1, num_employees): starting at 1 yields
    # one record fewer than requested.
    employee_list = [
        {
            "first_name": fake.first_name(),
            "last_name": fake.last_name(),
            "job": fake.job(),
            "department": fake.random_element(elements=departments),
            "role": fake.random_element(elements=roles),
            "salary": fake.random_int(min=30000, max=150000, step=1000),
            "email": fake.email(),
        }
        for _ in range(num_employees)
    ]

    # NOTE(review): no schema is passed, so Spark has to infer it from the
    # records; an empty list (num_employees <= 0) presumably cannot be
    # inferred -- confirm against the Spark version in use.
    return spark.createDataFrame(employee_list)

In [15]:
# Build the sample employee DataFrame that the following cells inspect.
df = create_employees(50)

In [16]:
# truncate=False prints every cell in full instead of cutting long strings short.
df.show(truncate=False)

+----------+---------------------------------+-----------+-------------------------------------------------------------------+----------+---------+------+
|department|email                            |first_name |job                                                                |last_name |role     |salary|
+----------+---------------------------------+-----------+-------------------------------------------------------------------+----------+---------+------+
|HR        |barbaraporras@example.com        |Ángela     |Bibliotecarios                                                     |Niño      |Manager  |114000|
|IT        |ksepulveda@example.com           |Guillermo  |Cazador                                                            |Reyna     |Associate|36000 |
|IT        |roybalrenato@example.org         |Inés       |Director de publicidad y relaciones públicas                       |Colón     |Developer|120000|
|IT        |francisco-javier76@example.net   |Mayte      |Controlador 

In [17]:
# Bare expression: the notebook echoes the DataFrame's schema as a StructType.
df.schema

StructType([StructField('department', StringType(), True), StructField('email', StringType(), True), StructField('first_name', StringType(), True), StructField('job', StringType(), True), StructField('last_name', StringType(), True), StructField('role', StringType(), True), StructField('salary', LongType(), True)])

In [18]:
# Print the schema as a tree: one line per column with its type and nullability.
df.printSchema()

root
 |-- department: string (nullable = true)
 |-- email: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- job: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- role: string (nullable = true)
 |-- salary: long (nullable = true)



In [19]:
def create_employees_1(num_employees):
    """Generate ``num_employees`` fake employee records as a list of Rows.

    A ``Row`` factory is declared once with the field names ``first_name``,
    ``last_name``, ``job``, ``department``, ``role``, ``salary`` and
    ``email``; each record is produced by calling that factory with values
    drawn from the module-level ``fake`` generator, in that same order.

    Args:
        num_employees: Number of records to generate.

    Returns:
        A plain Python list with one Row-factory result per requested record.
    """
    departments = ("TI", "HHRR", "Marketing", "Finanzas")
    roles = ("Administrador", "Programador", "Analista", "Asociado")

    # Positional values below must line up with these field names.
    make_employee = Row("first_name", "last_name", "job", "department", "role", "salary", "email")

    return [
        make_employee(
            fake.first_name(),
            fake.last_name(),
            fake.job(),
            fake.random_element(elements=departments),
            fake.random_element(elements=roles),
            fake.random_int(min=30000, max=150000, step=1000),
            fake.email(),
        )
        for _ in range(num_employees)
    ]

# No schema is passed: the DataFrame is built directly from the generated Rows.
df_employees_1 = spark.createDataFrame(
    create_employees_1(30)
)

# truncate=False keeps long job titles and e-mail addresses fully visible.
df_employees_1.show(truncate=False)

+--------------+-----------+--------------------------------------------------------+----------+-------------+------+------------------------------+
|first_name    |last_name  |job                                                     |department|role         |salary|email                         |
+--------------+-----------+--------------------------------------------------------+----------+-------------+------+------------------------------+
|María         |Rodrígez   |Reponedor de estanterías                                |Marketing |Analista     |96000 |daviles@example.net           |
|María Cristina|Marrero    |Técnico en redes y sistemas de computadores             |Marketing |Administrador|125000|zcavazos@example.org          |
|Joaquín       |de la Garza|Empleado encargado de las nóminas                       |TI        |Programador  |40000 |jaquelineescobedo@example.net |
|Rosa          |Casares    |Técnico en ingeniería mecánica                          |Marketing |Programado

In [20]:
# Bare expression: echoes the schema as a StructType. The columns appear in the
# order the Row fields were declared in create_employees_1.
df_employees_1.schema

StructType([StructField('first_name', StringType(), True), StructField('last_name', StringType(), True), StructField('job', StringType(), True), StructField('department', StringType(), True), StructField('role', StringType(), True), StructField('salary', LongType(), True), StructField('email', StringType(), True)])

In [21]:
# Print the schema as a tree: one line per column with its type and nullability.
df_employees_1.printSchema()

root
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- job: string (nullable = true)
 |-- department: string (nullable = true)
 |-- role: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- email: string (nullable = true)

