**1) Create a DataFrame from a List of Tuples**

In [0]:
# Print the built-in documentation for createDataFrame (signature, parameters, accepted inputs).
help(spark.createDataFrame)
# Call shape used throughout this notebook: spark.createDataFrame(data, schema)

Help on method createDataFrame in module pyspark.sql.connect.session:

createDataFrame(data: Union[ForwardRef('pd.DataFrame'), ForwardRef('np.ndarray'), ForwardRef('pa.Table'), Iterable[Any]], schema: Union[pyspark.sql.types.AtomicType, pyspark.sql.types.StructType, str, List[str], Tuple[str, ...], NoneType] = None, samplingRatio: Optional[float] = None, verifySchema: Optional[bool] = None) -> 'ParentDataFrame' method of pyspark.sql.connect.session.SparkSession instance
    Creates a :class:`DataFrame` from an :class:`RDD`, a list, a :class:`pandas.DataFrame`,
    a :class:`numpy.ndarray`, or a :class:`pyarrow.Table`.

    .. versionadded:: 2.0.0

    .. versionchanged:: 3.4.0
        Supports Spark Connect.

    .. versionchanged:: 4.0.0
        Supports :class:`pyarrow.Table`.

    Parameters
    ----------
    data : :class:`RDD` or iterable
        an RDD of any kind of SQL data representation (:class:`Row`,
        :class:`tuple`, ``int``, ``boolean``, ``dict``, etc.), or :class:`

In [0]:
# Each tuple is one row; values are positional and line up with the names in `schema`.
data = [
    (1, "Naresh", 10.5, 9980133778, "2025-01-03"),
    (2, "Harish", 12.5, 9980133778, "2024-01-03"),
    (3, "Harish", 12.5, 9980133778, "2024-01-03"),
]

# Only column names are supplied, so the column types are inferred from the values.
schema = ["id", "name", "salary", "phone", "dob"]

df = spark.createDataFrame(data=data, schema=schema)

df.show()         # plain-text table
display(df)       # notebook table rendering
df.printSchema()  # inferred column types and nullability

+---+------+------+----------+----------+
| id|  name|salary|     phone|       dob|
+---+------+------+----------+----------+
|  1|Naresh|  10.5|9980133778|2025-01-03|
|  2|Harish|  12.5|9980133778|2024-01-03|
|  3|Harish|  12.5|9980133778|2024-01-03|
+---+------+------+----------+----------+



id,name,salary,phone,dob
1,Naresh,10.5,9980133778,2025-01-03
2,Harish,12.5,9980133778,2024-01-03
3,Harish,12.5,9980133778,2024-01-03


root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- phone: long (nullable = true)
 |-- dob: string (nullable = true)



- When the schema is only a list of column names, Spark automatically **infers data types** from the data.

In [0]:
# Same rows as the previous example, plus one row whose salary is the string "data".
data = [
    (1, "Naresh", 10.5, 9980133778, "2025-01-03"),
    (2, "Harish", 12.5, 9980133778, "2024-01-03"),
    (3, "Harish", 12.5, 9980133778, "2024-01-03"),
    (3, "Harish", "data", 9980133778, "2024-01-03"),
]

schema = ["id", "name", "salary", "phone", "dob"]

# With numbers and a string mixed in one column, the schema recorded for this run
# reports `salary` as string instead of double.
df1 = spark.createDataFrame(data=data, schema=schema)

df1.show()
display(df1)
df1.printSchema()

+---+------+------+----------+----------+
| id|  name|salary|     phone|       dob|
+---+------+------+----------+----------+
|  1|Naresh|  10.5|9980133778|2025-01-03|
|  2|Harish|  12.5|9980133778|2024-01-03|
|  3|Harish|  12.5|9980133778|2024-01-03|
|  3|Harish|  data|9980133778|2024-01-03|
+---+------+------+----------+----------+



id,name,salary,phone,dob
1,Naresh,10.5,9980133778,2025-01-03
2,Harish,12.5,9980133778,2024-01-03
3,Harish,12.5,9980133778,2024-01-03
3,Harish,data,9980133778,2024-01-03


root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- phone: long (nullable = true)
 |-- dob: string (nullable = true)



In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType, LongType

# Rows as tuples, in the same field order as the schema below.
data = [
    (1, "Naresh", 10.5, 9980133778, "2025-01-03"),
    (2, "Harish", 12.5, 9980133778, "2024-01-03"),
    (3, "Harish", 12.5, 9980133778, "2024-01-03"),
]

# Explicit schema: name, type and nullability are declared per column instead of inferred.
# `nullable` defaults to True, so it is spelled out only for the non-nullable `id`.
schema = StructType(
    [
        StructField("id", IntegerType(), nullable=False),
        StructField("name", StringType()),
        StructField("salary", DoubleType()),
        StructField("phone", LongType()),
        StructField("dob", StringType()),
    ]
)

df = spark.createDataFrame(data=data, schema=schema)

df.show()
display(df)
df.printSchema()

+---+------+------+----------+----------+
| id|  name|salary|     phone|       dob|
+---+------+------+----------+----------+
|  1|Naresh|  10.5|9980133778|2025-01-03|
|  2|Harish|  12.5|9980133778|2024-01-03|
|  3|Harish|  12.5|9980133778|2024-01-03|
+---+------+------+----------+----------+



id,name,salary,phone,dob
1,Naresh,10.5,9980133778,2025-01-03
2,Harish,12.5,9980133778,2024-01-03
3,Harish,12.5,9980133778,2024-01-03


root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- phone: long (nullable = true)
 |-- dob: string (nullable = true)



**2) Create a DataFrame from a List of Lists**

In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType, LongType

# Rows given as inner lists rather than tuples; values are still positional.
data = [
    [1, "Naresh", 10.5, 9980133778, "2025-01-03"],
    [2, "Harish", 12.5, 9980133778, "2024-01-03"],
    [3, "Harish", 12.5, 9980133778, "2024-01-03"],
]

# Schema built field by field; the third argument of add() is the nullable flag.
schema = (
    StructType()
    .add("id", IntegerType(), False)
    .add("name", StringType(), True)
    .add("salary", DoubleType(), True)
    .add("phone", LongType(), True)
    .add("dob", StringType(), True)
)

df_list_list = spark.createDataFrame(data=data, schema=schema)

df_list_list.show()
display(df_list_list)
df_list_list.printSchema()

+---+------+------+----------+----------+
| id|  name|salary|     phone|       dob|
+---+------+------+----------+----------+
|  1|Naresh|  10.5|9980133778|2025-01-03|
|  2|Harish|  12.5|9980133778|2024-01-03|
|  3|Harish|  12.5|9980133778|2024-01-03|
+---+------+------+----------+----------+



id,name,salary,phone,dob
1,Naresh,10.5,9980133778,2025-01-03
2,Harish,12.5,9980133778,2024-01-03
3,Harish,12.5,9980133778,2024-01-03


root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- phone: long (nullable = true)
 |-- dob: string (nullable = true)



**3) Create a DataFrame from a List of Dictionaries**
- Useful when data comes as **JSON-like objects**.
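- **Column order** does **not follow the key order** of the dictionaries; in the output below the columns come back sorted alphabetically by name.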

- Syntax: 
  - spark.createDataFrame(data)
  - **Schema** is **inferred automatically**.

In [0]:
# One dict per row: the keys become the column names, and no schema is passed.
data = [
    dict(id=1, name='John', salary=10.5, phone=9980133778, dob='2025-01-03'),
    dict(id=2, name='Jane', salary=12.5, phone=9980133778, dob='2024-01-03'),
    dict(id=3, name='Jane', salary=12.5, phone=9980133778, dob='2024-01-03'),
]

df_dict = spark.createDataFrame(data=data)
display(df_dict)

dob,id,name,phone,salary
2025-01-03,1,John,9980133778,10.5
2024-01-03,2,Jane,9980133778,12.5
2024-01-03,3,Jane,9980133778,12.5


**4) Create a DataFrame with an Explicit Schema**

In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType, LongType

# Sample rows as lists; each position corresponds to one schema field.
data = [
    [1, "Naresh", 10.5, 9980133778, "2025-01-03"],
    [2, "Harish", 12.5, 9980133778, "2024-01-03"],
    [3, "Harish", 12.5, 9980133778, "2024-01-03"],
]

# Every field states its name, Spark type and whether NULLs are allowed.
schema = StructType([
    StructField(name="id", dataType=IntegerType(), nullable=False),
    StructField(name="name", dataType=StringType(), nullable=True),
    StructField(name="salary", dataType=DoubleType(), nullable=True),
    StructField(name="phone", dataType=LongType(), nullable=True),
    StructField(name="dob", dataType=StringType(), nullable=True),
])

# Rebinds df_list_list (the same name is used by the list-of-lists example).
df_list_list = spark.createDataFrame(data=data, schema=schema)

df_list_list.show()
display(df_list_list)
df_list_list.printSchema()

+---+------+------+----------+----------+
| id|  name|salary|     phone|       dob|
+---+------+------+----------+----------+
|  1|Naresh|  10.5|9980133778|2025-01-03|
|  2|Harish|  12.5|9980133778|2024-01-03|
|  3|Harish|  12.5|9980133778|2024-01-03|
+---+------+------+----------+----------+



id,name,salary,phone,dob
1,Naresh,10.5,9980133778,2025-01-03
2,Harish,12.5,9980133778,2024-01-03
3,Harish,12.5,9980133778,2024-01-03


root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- phone: long (nullable = true)
 |-- dob: string (nullable = true)



**5) How to create a DataFrame with NULL values**

In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType, LongType

# Python None marks a missing value; it appears only in columns declared nullable below.
data = [
    [1, "Naresh", 10.5, 9980133778, "2025-01-03"],
    [2, "Harish", None, 9980133778, "2024-01-03"],  # salary missing
    [3, "Harish", 12.5, None, "2024-01-03"],        # phone missing
    [3, "Harish", 12.5, 9980133778, "2024-01-03"],
    [3, "Harish", 12.5, 9980133778, None],          # dob missing
    [3, None, 12.5, 9980133778, "2024-01-03"],      # name missing
    [3, "Harish", 12.5, 9980133778, "2024-01-03"],
]

# `id` is the only non-nullable column, and no row leaves it empty.
schema = StructType(
    [
        StructField("id", IntegerType(), nullable=False),
        StructField("name", StringType(), nullable=True),
        StructField("salary", DoubleType(), nullable=True),
        StructField("phone", LongType(), nullable=True),
        StructField("dob", StringType(), nullable=True),
    ]
)

df_null = spark.createDataFrame(data=data, schema=schema)

# NOTE(review): the display() export recorded for this cell shows phone as 9980133778.0
# while printSchema() reports long -- presumably a rendering artifact of the NULL in
# that column; confirm.
df_null.show()
display(df_null)
df_null.printSchema()

+---+------+------+----------+----------+
| id|  name|salary|     phone|       dob|
+---+------+------+----------+----------+
|  1|Naresh|  10.5|9980133778|2025-01-03|
|  2|Harish|  NULL|9980133778|2024-01-03|
|  3|Harish|  12.5|      NULL|2024-01-03|
|  3|Harish|  12.5|9980133778|2024-01-03|
|  3|Harish|  12.5|9980133778|      NULL|
|  3|  NULL|  12.5|9980133778|2024-01-03|
|  3|Harish|  12.5|9980133778|2024-01-03|
+---+------+------+----------+----------+



id,name,salary,phone,dob
1,Naresh,10.5,9980133778.0,2025-01-03
2,Harish,,9980133778.0,2024-01-03
3,Harish,12.5,,2024-01-03
3,Harish,12.5,9980133778.0,2024-01-03
3,Harish,12.5,9980133778.0,
3,,12.5,9980133778.0,2024-01-03
3,Harish,12.5,9980133778.0,2024-01-03


root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- phone: long (nullable = true)
 |-- dob: string (nullable = true)



In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType, LongType
# Negative example: this cell is expected to fail; the recorded output is a PySparkValueError.
data =[[1, "Naresh", 10.5, 9980133778, "2025-01-03"],
       [2, "Harish", None, 9980133778, "2024-01-03"],
       [3, "Harish", 12.5, None, "2024-01-03"],
       [3, "Harish", 12.5, 9980133778, "2024-01-03"],
       [3, "Harish", 12.5, 9980133778, None],
       [3, None, 12.5, 9980133778, "2024-01-03"],  # name is None, but "name" is non-nullable below
       [None, "Harish", 12.5, 9980133778, "2024-01-03"]]  # id is None, but "id" is non-nullable below
# Both "id" and "name" are declared non-nullable here, yet the data holds None in each.
schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("name", StringType(), False),
    StructField("salary", DoubleType(), True),
    StructField("phone", LongType(), True),
    StructField("dob", StringType(), True)
])

df_null_01 = spark.createDataFrame(data, schema)  # the recorded traceback points at this line
df_null_01.show()
display(df_null_01)
df_null_01.printSchema()

[0;31m---------------------------------------------------------------------------[0m
[0;31mPySparkValueError[0m                         Traceback (most recent call last)
File [0;32m<command-6167684495832718>, line 19[0m
[1;32m      3[0m data [38;5;241m=[39m[[[38;5;241m1[39m, [38;5;124m"[39m[38;5;124mNaresh[39m[38;5;124m"[39m, [38;5;241m10.5[39m, [38;5;241m9980133778[39m, [38;5;124m"[39m[38;5;124m2025-01-03[39m[38;5;124m"[39m],
[1;32m      4[0m        [[38;5;241m2[39m, [38;5;124m"[39m[38;5;124mHarish[39m[38;5;124m"[39m, [38;5;28;01mNone[39;00m, [38;5;241m9980133778[39m, [38;5;124m"[39m[38;5;124m2024-01-03[39m[38;5;124m"[39m],
[1;32m      5[0m        [[38;5;241m3[39m, [38;5;124m"[39m[38;5;124mHarish[39m[38;5;124m"[39m, [38;5;241m12.5[39m, [38;5;28;01mNone[39;00m, [38;5;124m"[39m[38;5;124m2024-01-03[39m[38;5;124m"[39m],
[0;32m   (...)[0m
[1;32m      8[0m        [[38;5;241m3[39m, [38;5;28;01mNone[39;00m, [38;5;241

- df.show() Vs df.display()
- df.printSchema()
- df.columns
- df.count()
- df.schema
- df.dtypes
- dict(df.dtypes)

In [0]:
# Column names of `df` as a Python list.
df.columns

['id', 'name', 'salary', 'phone', 'dob']

In [0]:
# Number of columns in `df`.
len(df.columns)

5

**6) Create Empty DataFrame without Schema (no columns)**

In [0]:
# No rows and a StructType with no fields: a DataFrame with neither data nor columns.
df3 = spark.createDataFrame(data=[], schema=StructType([]))

display(df3)

**7) Create Empty DataFrame with Schema**

In [0]:
# Schema for the empty DataFrame created in the next cell.
# All columns are nullable. `LastName` is a string column like `FirstName`
# (it was previously declared IntegerType, which does not fit a name field).
schema = StructType([
  StructField('FirstName', StringType(), True),
  StructField('LastName', StringType(), True),
  StructField('Records', IntegerType(), True),
  StructField('Product_Type', StringType(), True),
  StructField('transaction_date', StringType(), True),
  StructField('current_timestamp', StringType(), True),
  StructField('Category', StringType(), True)
  ])

In [0]:
# An empty row list plus the schema defined above gives a DataFrame with columns but no rows.
emp_df = spark.createDataFrame(data=[], schema=schema)

display(emp_df)

FirstName,LastName,Records,Product_Type,transaction_date,current_timestamp,Category


**8) Creating a DataFrame of all-NULL placeholder rows**

In [0]:
# Four nullable columns, so every value in a row may be NULL.
schema = StructType(
    [
        StructField("id", IntegerType(), True),
        StructField("Name", StringType(), True),
        StructField("Age", IntegerType(), True),
        StructField("Sales", IntegerType(), True),
    ]
)

# Five placeholder rows, each holding one None per column.
empty_data = [(None, None, None, None) for _ in range(5)]

df_empty = spark.createDataFrame(data=empty_data, schema=schema)

# Rendered through the DataFrame's own display() method.
df_empty.display()

id,Name,Age,Sales
,,,
,,,
,,,
,,,
,,,


In [0]:
# Row count: the all-NULL placeholder rows still count as rows (5 in the recorded output).
df_empty.count()

5

In [0]:
# Full schema of `df` as a StructType: field names, types and nullability.
df.schema

StructType([StructField('id', IntegerType(), False), StructField('name', StringType(), True), StructField('salary', DoubleType(), True), StructField('phone', LongType(), True), StructField('dob', StringType(), True)])

In [0]:
# List of (column name, type name) pairs for `df`.
df.dtypes

[('id', 'int'),
 ('name', 'string'),
 ('salary', 'double'),
 ('phone', 'bigint'),
 ('dob', 'string')]

In [0]:
# The same pairs as a column name -> type name mapping, for lookup by column.
dict(df.dtypes)

{'id': 'int',
 'name': 'string',
 'salary': 'double',
 'phone': 'bigint',
 'dob': 'string'}