In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [19]:
import findspark

findspark.init()

## Criando Data Frame

In [20]:
# Get the active SparkSession, or create one with default settings if none exists yet.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [7]:
# Both columns are declared as STRING in the DDL schema, so `id` holds text
# ('4', '5') rather than numbers.
df = spark.createDataFrame(
    [('Pedro', '4'), ('João', '5')],
    schema='nome STRING, id STRING',
)

In [11]:
# Print the DataFrame's rows as a text table.
df.show()

+-----+---+
| nome| id|
+-----+---+
|Pedro|  4|
| João|  5|
+-----+---+



## Convertendo Colunas

In [21]:
# Import the type classes by name instead of with a wildcard, so it is clear
# where IntegerType, StringType, StructField and StructType come from.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

In [15]:
# Cast `id` to an integer by passing a DataType instance. The result is a new
# DataFrame; it is not assigned, so `df` keeps its string `id` column.
df.select(
    'nome',
    col('id').cast(IntegerType()),
)

DataFrame[nome: string, id: int]

In [16]:
# Cast `id` to an integer again, this time naming the target type with a
# string instead of a DataType instance.
df.select(
    'nome',
    col('id').cast('Int'),
)

DataFrame[nome: string, id: int]

## Schema e Criação de Data Frame

In [23]:
# The schema is only a list of column names here, so Spark infers each
# column's type from the data (both columns hold Python strings).
df = spark.createDataFrame(
    [('Pedro', '4'), ('João', '5'), ('Lucia', '6'), ('Juliana', '7')],
    schema=['nome', 'id'],
)

In [26]:
# List each column's name together with its data type.
df.dtypes

[('nome', 'string'), ('id', 'string')]

## Criando schemas

In [28]:
# Build the schema programmatically, adding one field per column in order.
schema = (
    StructType()
    .add('nome', StringType())
    .add('id', IntegerType())
)

In [30]:
# The `id` values are Python ints here, matching the IntegerType field in `schema`.
df = spark.createDataFrame(
    [('Pedro', 4), ('João', 5), ('Lucia', 6), ('Juliana', 7)],
    schema=schema,
)

In [31]:
# Print the DataFrame's rows as a text table.
df.show()

+-------+---+
|   nome| id|
+-------+---+
|  Pedro|  4|
|   João|  5|
|  Lucia|  6|
|Juliana|  7|
+-------+---+



## Criando Schema com DDL

In [35]:
# The schema is a DDL-formatted string: comma-separated "<column> <type>" pairs.
schema = 'nome STRING, id INT'

df = spark.createDataFrame(
    [('Pedro', 4), ('João', 5), ('Lucia', 6), ('Juliana', 7)],
    schema=schema,
)

In [34]:
# List each column's name together with its data type.
df.dtypes

[('nome', 'string'), ('id', 'int')]

## Criando Data Frame a partir de uma variável de dados

In [36]:
# Rows as (nome, id) tuples, kept in their own variable apart from the
# createDataFrame call.
data = [
    ('Pedro', 4),
    ('João', 5),
    ('Lucia', 6),
    ('Juliana', 7),
]

In [37]:
# Build the DataFrame from the `data` rows and the `schema` defined in earlier cells.
df = spark.createDataFrame(data, schema=schema)

In [39]:
# Print the schema as a tree: column name, type and nullability.
df.printSchema()

root
 |-- nome: string (nullable = true)
 |-- id: integer (nullable = true)



In [38]:
# Print the DataFrame's rows as a text table.
df.show()

+-------+---+
|   nome| id|
+-------+---+
|  Pedro|  4|
|   João|  5|
|  Lucia|  6|
|Juliana|  7|
+-------+---+



In [40]:
# Stop the Spark session now that the notebook is finished.
spark.stop()