Creating a DataFrame

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook's Spark session, running in local mode
# with the `local[1]` master setting.
spark = (
    SparkSession.builder
    .master('local[1]')
    .appName('sparkByExamples.com')
    .getOrCreate()
)

In [3]:
# Check what kind of object the builder returned (a SparkSession).
type(spark)

pyspark.sql.session.SparkSession

In [4]:
# List every attribute on the session object, including private and dunder names.
dir(spark)

['Builder',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_convert_from_pandas',
 '_createFromLocal',
 '_createFromRDD',
 '_create_from_pandas_with_arrow',
 '_get_numpy_record_dtype',
 '_inferSchema',
 '_inferSchemaFromList',
 '_instantiatedSession',
 '_jsc',
 '_jsparkSession',
 '_jvm',
 '_jwrapped',
 '_repr_html_',
 '_sc',
 '_wrapped',
 'builder',
 'catalog',
 'conf',
 'createDataFrame',
 'newSession',
 'range',
 'read',
 'readStream',
 'sparkContext',
 'sql',
 'stop',
 'streams',
 'table',
 'udf',
 'version']

In [5]:
# Print the built-in documentation for createDataFrame (signature and schema rules).
help(spark.createDataFrame)

Help on method createDataFrame in module pyspark.sql.session:

createDataFrame(data, schema=None, samplingRatio=None, verifySchema=True) method of pyspark.sql.session.SparkSession instance
    Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.
    
    When ``schema`` is a list of column names, the type of each column
    will be inferred from ``data``.
    
    When ``schema`` is ``None``, it will try to infer the schema (column names and types)
    from ``data``, which should be an RDD of :class:`Row`,
    or :class:`namedtuple`, or :class:`dict`.
    
    When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match
    the real data, or an exception will be thrown at runtime. If the given schema is not
    :class:`pyspark.sql.types.StructType`, it will be wrapped into a
    :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value",
    each record will also be wrapped into a tuple,

In [6]:
# Two (id, name) records. No schema is given, so Spark falls back to the
# positional column names _1 and _2.
data = [
    (1, 'Gagan'),
    (2, 'Khandelwal'),
]

rdd1 = spark.createDataFrame(data)
rdd1.show()

+---+----------+
| _1|        _2|
+---+----------+
|  1|     Gagan|
|  2|Khandelwal|
+---+----------+



In [7]:
# Supplying a list of column names as the schema: the names are taken from
# the list, while the column types are inferred from `data`.
rdd1 = spark.createDataFrame(data, ['id', 'name'])
rdd1.show()
rdd1.printSchema()

+---+----------+
| id|      name|
+---+----------+
|  1|     Gagan|
|  2|Khandelwal|
+---+----------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)



In [8]:
# Column names supplied as a list; the column types are still inferred from
# `data` (see the printed schema: id is long, name is string).
# The stray `StructType({})` expression was removed: its result was discarded,
# and it was the only reason this cell needed the wildcard types import.
rdd1 = spark.createDataFrame(data=data, schema=['id', 'name'])
rdd1.show()
rdd1.printSchema()

+---+----------+
| id|      name|
+---+----------+
|  1|     Gagan|
|  2|Khandelwal|
+---+----------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)



In [9]:
# Import only the type classes this cell uses instead of a wildcard import,
# so it is clear where each name comes from.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

data = [(1, 'Gagan'), (2, 'Khandelwal')]

# Explicit schema: fixes both the column names and the column types rather
# than inferring them (id is integer here, not the inferred long).
schema = StructType([
    StructField(name='id', dataType=IntegerType()),
    StructField(name='Name', dataType=StringType()),
])

df = spark.createDataFrame(data=data, schema=schema)
df.show()
df.printSchema()

+---+----------+
| id|      Name|
+---+----------+
|  1|     Gagan|
|  2|Khandelwal|
+---+----------+

root
 |-- id: integer (nullable = true)
 |-- Name: string (nullable = true)



In [13]:
from pyspark.sql.types import *
# Each record is a dict; the dict keys supply the column names and the
# column types are inferred from the values.
data = [
    {'id': 1, 'name': 'Gagan'},
    {'id': 2, 'name': 'Khandelwal'},
]

df = spark.createDataFrame(data)
df.show()
df.printSchema()

+---+----------+
| id|      name|
+---+----------+
|  1|     Gagan|
|  2|Khandelwal|
+---+----------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)

