In [0]:
from pyspark.sql.types import *

In [0]:
# `data` is a required argument of createDataFrame, so calling it with no
# arguments raises TypeError. Catch and print the error so this cell serves as
# a demonstration without stopping a top-to-bottom run of the notebook.
try:
    spark.createDataFrame()
except TypeError as err:
    print(f"createDataFrame requires a `data` argument: {err}")

In [0]:
# Print the signature and docstring of createDataFrame (accepted `data` types,
# the optional `schema`, `samplingRatio` and `verifySchema` parameters).
help(spark.createDataFrame)

Help on method createDataFrame in module pyspark.sql.session:

createDataFrame(data: Union[pyspark.rdd.RDD[Any], Iterable[Any], ForwardRef('PandasDataFrameLike'), ForwardRef('ArrayLike')], schema: Union[pyspark.sql.types.AtomicType, pyspark.sql.types.StructType, str, NoneType] = None, samplingRatio: Optional[float] = None, verifySchema: bool = True) -> pyspark.sql.dataframe.DataFrame method of pyspark.sql.session.SparkSession instance
    Creates a :class:`DataFrame` from an :class:`RDD`, a list, a :class:`pandas.DataFrame`
    or a :class:`numpy.ndarray`.
    
    When ``schema`` is a list of column names, the type of each column
    will be inferred from ``data``.
    
    When ``schema`` is ``None``, it will try to infer the schema (column names and types)
    from ``data``, which should be an RDD of either :class:`Row`,
    :class:`namedtuple`, or :class:`dict`.
    
    When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match
    the real data, or

In [0]:
# Each tuple is one row. No column names are supplied, so the resulting
# DataFrame shows the default column names _1 and _2.
dados = [
    ('Fred', 39),
    ('Romário', 57),
    ('Roberto Carlos', 50),
    ('Ronaldo Fenômeno', 47),
]

df = spark.createDataFrame(data=dados)
df.show()

+----------------+---+
|              _1| _2|
+----------------+---+
|            Fred| 39|
|         Romário| 57|
|  Roberto Carlos| 50|
|Ronaldo Fenômeno| 47|
+----------------+---+



In [0]:
# A plain list of column names is passed as the schema; the type of each
# column is still inferred from the data.
schema = ['nome', 'idade']

dados = [
    ('Fred', 39),
    ('Romário', 57),
    ('Roberto Carlos', 50),
    ('Ronaldo Fenômeno', 47),
]

# Keyword form of the call.
df = spark.createDataFrame(schema=schema, data=dados)
df.show()

+----------------+-----+
|            nome|idade|
+----------------+-----+
|            Fred|   39|
|         Romário|   57|
|  Roberto Carlos|   50|
|Ronaldo Fenômeno|   47|
+----------------+-----+



In [0]:
# Rows as (name, age) tuples.
dados = [
    ('Fred', 39),
    ('Romário', 57),
    ('Roberto Carlos', 50),
    ('Ronaldo Fenômeno', 47),
]

# Column names only; the column types are inferred from the data.
schema = ['nome', 'idade']

# Positional form of the call: data first, schema second. The `schema`
# variable is passed here instead of repeating the list literal, so the
# column names are defined in exactly one place.
df = spark.createDataFrame(dados, schema)

df.show()

+----------------+-----+
|            nome|idade|
+----------------+-----+
|            Fred|   39|
|         Romário|   57|
|  Roberto Carlos|   50|
|Ronaldo Fenômeno|   47|
+----------------+-----+



In [0]:
# Rows given as dictionaries: no schema argument is passed, and the dictionary
# keys appear as the column names in the resulting DataFrame.
dados = [
    {'Jogador': 'Fred', 'idade': 39},
    {'Jogador': 'Romário', 'idade': 57},
    {'Jogador': 'Roberto Carlos', 'idade': 50},
    {'Jogador': 'Ronaldo Fenômeno', 'idade': 47},
]

df2 = spark.createDataFrame(data=dados)
df2.show()


+----------------+-----+
|         Jogador|idade|
+----------------+-----+
|            Fred|   39|
|         Romário|   57|
|  Roberto Carlos|   50|
|Ronaldo Fenômeno|   47|
+----------------+-----+



In [0]:
# Print the class documentation for StructType, the type used to declare an
# explicit schema as a list of StructField entries.
help(StructType)
# NOTE(review): original note was just "StructField" — presumably a reminder to
# also inspect help(StructField); confirm with the author.

Help on class StructType in module pyspark.sql.types:

class StructType(DataType)
 |  StructType(fields: Optional[List[pyspark.sql.types.StructField]] = None)
 |  
 |  Struct type, consisting of a list of :class:`StructField`.
 |  
 |  This is the data type representing a :class:`Row`.
 |  
 |  Iterating a :class:`StructType` will iterate over its :class:`StructField`\s.
 |  A contained :class:`StructField` can be accessed by its name or position.
 |  
 |  Examples
 |  --------
 |  >>> struct1 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct1["f1"]
 |  StructField('f1', StringType(), True)
 |  >>> struct1[0]
 |  StructField('f1', StringType(), True)
 |  
 |  >>> struct1 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct2 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct1 == struct2
 |  True
 |  >>> struct1 = StructType([StructField("f1", CharType(10), True)])
 |  >>> struct2 = StructType([StructField("f1", CharType(10), True)])
 

In [0]:
# Explicit schema: each StructField declares the column name, its Spark type
# and whether the column is nullable.
schema = StructType([
    StructField('jogador', StringType(), True),
    StructField('idade', IntegerType(), True),
])

dados = [
    ('Fred', 39),
    ('Romário', 57),
    ('Roberto Carlos', 50),
    ('Ronaldo Fenômeno', 47),
]

df = spark.createDataFrame(schema=schema, data=dados)
df.show()


+----------------+-----+
|         jogador|idade|
+----------------+-----+
|            Fred|   39|
|         Romário|   57|
|  Roberto Carlos|   50|
|Ronaldo Fenômeno|   47|
+----------------+-----+



In [0]:
# Rows as (name, age) tuples; the ages are Python ints.
dados = [
    ('Fred', 39),
    ('Romário', 57),
    ('Roberto Carlos', 50),
    ('Ronaldo Fenômeno', 47),
]

# Schema given as a datatype string ("column:type" pairs separated by commas).
# `idade` is declared as int so it matches the integer values in `dados` and
# the IntegerType used in the StructType version of this example; declaring it
# as string would store the ages as text.
df = spark.createDataFrame(dados, "nome:string,idade:int")

df.show()

+----------------+-----+
|            nome|idade|
+----------------+-----+
|            Fred|   39|
|         Romário|   57|
|  Roberto Carlos|   50|
|Ronaldo Fenômeno|   47|
+----------------+-----+

