In [0]:
# Echo the `spark` object so the notebook renders its summary.
# NOTE(review): `spark` is not defined anywhere in this file; presumably it is the
# SparkSession pre-created by the notebook runtime - confirm before running elsewhere.
spark

-------------- Creating a DataFrame

In [0]:
from pyspark.sql import Row

# --- Variant 1: plain tuples plus a list of column names -------------------
# Each tuple is one record; `schema` only supplies the column names.
schema = ["id", "num"]
data = list(zip(range(1, 8), [1, 1, 1, 2, 1, 2, 2]))
df = spark.createDataFrame(data, schema)
df.show()


# --- Variant 2: Row objects -------------------------------------------------
# Every record is wrapped in a Row whose keyword names match `row_schema`.
row_schema = ['name', 'age', 'salary', 'city']
row_data = [
    Row(**dict(zip(row_schema, values)))
    for values in (
        ('John', 25, 20000, 'Los Angales'),
        ('Mary', 32, 32000, 'Chicago'),
        ('Bob', 21, 27000, 'New York'),
        ('Nik', 35, 20999, 'India'),
    )
]
row_df = spark.createDataFrame(row_data, row_schema)
row_df.show()


+---+---+
| id|num|
+---+---+
|  1|  1|
|  2|  1|
|  3|  1|
|  4|  2|
|  5|  1|
|  6|  2|
|  7|  2|
+---+---+

+----+---+------+-----------+
|name|age|salary|       city|
+----+---+------+-----------+
|John| 25| 20000|Los Angales|
|Mary| 32| 32000|    Chicago|
| Bob| 21| 27000|   New York|
| Nik| 35| 20999|      India|
+----+---+------+-----------+



------------ Work On Dataframe Schema

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Show the schema that was derived for the Row-based DataFrame.
row_df.printSchema()

# List just the column names of that DataFrame.
print('columns is : ', row_df.columns)
print('\n')

# Rebuild the sample records, this time with an explicit `id` field.
row_data = [
    Row(id=emp_id, name=name, age=age, salary=salary, city=city)
    for emp_id, name, age, salary, city in [
        (1, 'John', 25, 20000, 'Los Angales'),
        (2, 'Mary', 32, 32000, 'Chicago'),
        (3, 'Bob', 21, 27000, 'New York'),
        (4, 'Nik', 35, 20999, 'India'),
    ]
]

# Declare the schema by hand (name, type, nullable) instead of passing a plain
# list of column names, so the column types are stated explicitly.
manual_schema = (
    StructType()
    .add('id', IntegerType(), True)
    .add('name', StringType(), True)
    .add('age', IntegerType(), True)
    .add('salary', IntegerType(), True)
    .add('city', StringType(), True)
)

manual_schema_df = spark.createDataFrame(row_data, manual_schema)
manual_schema_df.show()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: long (nullable = true)
 |-- city: string (nullable = true)

columns is :  ['name', 'age', 'salary', 'city']


+---+----+---+------+-----------+
| id|name|age|salary|       city|
+---+----+---+------+-----------+
|  1|John| 25| 20000|Los Angales|
|  2|Mary| 32| 32000|    Chicago|
|  3| Bob| 21| 27000|   New York|
|  4| Nik| 35| 20999|      India|
+---+----+---+------+-----------+



----------- Selecting Columns In Different Ways

In [0]:
# NOTE(review): wildcard imports hide where names such as `col`, `expr` and `lit`
# come from. They are kept because later cells use names brought in here.
from pyspark.sql.functions import *   # provides col, expr, ...
from pyspark.sql.types import *

# 1) Select by plain column name.
manual_schema_df.select('name').show()

# 2) Select through col(): it yields a Column expression, so a transformation
#    (here "+ 1") can be applied to every value of that column.
incremented_id = col('id') + 1
manual_schema_df.select(incremented_id).show()

# 3) A bare string cannot be used the same way: 'id' + 1 is a Python str + int
#    and fails before Spark is involved. Wrapping the name in expr() gives a
#    Column again, so the arithmetic works.
incremented_id_expr = expr('id') + 1
manual_schema_df.select(incremented_id_expr).show()


# 4) Select by indexing the DataFrame with the column name.
city_by_index = manual_schema_df['city']
manual_schema_df.select(city_by_index).show()

# 5) Select by attribute access on the DataFrame.
city_by_attribute = manual_schema_df.city
manual_schema_df.select(city_by_attribute).show()

# 6) All of the styles above can be mixed in a single select.
manual_schema_df.select(
    ['id', col('name'), manual_schema_df['salary'], manual_schema_df.city]
).show()

# 7) '*' selects every column.
manual_schema_df.select('*').show()

+----+
|name|
+----+
|John|
|Mary|
| Bob|
| Nik|
+----+

+--------+
|(id + 1)|
+--------+
|       2|
|       3|
|       4|
|       5|
+--------+

+--------+
|(id + 1)|
+--------+
|       2|
|       3|
|       4|
|       5|
+--------+

+-----------+
|       city|
+-----------+
|Los Angales|
|    Chicago|
|   New York|
|      India|
+-----------+

+-----------+
|       city|
+-----------+
|Los Angales|
|    Chicago|
|   New York|
|      India|
+-----------+

+---+----+------+-----------+
| id|name|salary|       city|
+---+----+------+-----------+
|  1|John| 20000|Los Angales|
|  2|Mary| 32000|    Chicago|
|  3| Bob| 27000|   New York|
|  4| Nik| 20999|      India|
+---+----+------+-----------+

+---+----+---+------+-----------+
| id|name|age|salary|       city|
+---+----+---+------+-----------+
|  1|John| 25| 20000|Los Angales|
|  2|Mary| 32| 32000|    Chicago|
|  3| Bob| 21| 27000|   New York|
|  4| Nik| 35| 20999|      India|
+---+----+---+------+-----------+



--------- Work On Spark SQL

In [0]:
# Spark SQL works on named tables/views, so the DataFrame is first registered
# as a temporary view under the name used in the query below.
manual_schema_df.createOrReplaceTempView("manual_schema_df_tbl")

# Run a plain SQL query against that view and display the result.
all_rows_query = """
          select * from manual_schema_df_tbl
          """
spark.sql(all_rows_query).show()

+---+----+---+------+-----------+
| id|name|age|salary|       city|
+---+----+---+------+-----------+
|  1|John| 25| 20000|Los Angales|
|  2|Mary| 32| 32000|    Chicago|
|  3| Bob| 21| 27000|   New York|
|  4| Nik| 35| 20999|      India|
+---+----+---+------+-----------+



---------- Work On Alias, Filter/Where, Literal, Adding Columns, Renaming Columns, Casting DataType, Removing Columns

In [0]:
# Import every helper this cell uses up front, so the cell does not depend on
# the wildcard import executed by an earlier cell.
from pyspark.sql.functions import col, lit, when

# --- Alias -------------------------------------------------------------------
# alias() names the DataFrame / column only for the expression it is used in;
# withColumnRenamed() (further down) returns a DataFrame whose column is renamed.
ms_df = manual_schema_df.alias('ms_df')
ms_df.show()
ms_df.select(col('id').alias('emp_id'), 'name', 'age', 'salary').show()


# --- Filter / where ----------------------------------------------------------
# Keep only the rows that satisfy the condition; combine conditions with `&`
# and wrap each one in parentheses.
ms_df.filter(col('salary') > 20000).show()
ms_df.filter((col('salary') > 20000) & (col('age') < 35)).show()


# --- Literal -----------------------------------------------------------------
# lit() turns a constant into a Column. lit(None) is a real SQL NULL (the string
# 'null' would be an ordinary four-character value); the cast gives the new
# column a concrete string type instead of the untyped null type.
ms_df = ms_df.select('*', lit(None).cast('string').alias('last_name'))
ms_df.show()


# --- Adding / overwriting columns ---------------------------------------------
# withColumn() replaces the column when the name already exists, otherwise adds it.
ms_df = ms_df.withColumn('last_name', lit('deo'))                # overwrite
ms_df = ms_df.withColumn('sur_name', lit(None).cast('string'))   # create new, NULL-filled
ms_df.show()


# --- Renaming a column ---------------------------------------------------------
ms_df = ms_df.withColumnRenamed('id', 'emp_id')
ms_df.show()


# --- Casting column types ------------------------------------------------------
ms_df = ms_df.withColumn('emp_id', col('emp_id').cast('string'))
# The salary cast is only printed, not assigned back, so `ms_df` keeps its
# original salary type.
ms_df.withColumn('salary', col('salary').cast('long')).printSchema()


# --- Removing a column ---------------------------------------------------------
# drop() result is only displayed; `ms_df` still contains `sur_name` afterwards.
ms_df.drop('sur_name').show()


# --- lit() combined with when() / otherwise() ----------------------------------
# between() is inclusive on both ends, i.e. 40000 <= salary <= 50000.
df3 = ms_df.withColumn(
    "lit_value2",
    when(col("salary").between(40000, 50000), lit("100")).otherwise(lit("200")),
)
df3.show(truncate=False)

+---+----+---+------+-----------+
| id|name|age|salary|       city|
+---+----+---+------+-----------+
|  1|John| 25| 20000|Los Angales|
|  2|Mary| 32| 32000|    Chicago|
|  3| Bob| 21| 27000|   New York|
|  4| Nik| 35| 20999|      India|
+---+----+---+------+-----------+

+------+----+---+------+
|emp_id|name|age|salary|
+------+----+---+------+
|     1|John| 25| 20000|
|     2|Mary| 32| 32000|
|     3| Bob| 21| 27000|
|     4| Nik| 35| 20999|
+------+----+---+------+

+---+----+---+------+--------+
| id|name|age|salary|    city|
+---+----+---+------+--------+
|  2|Mary| 32| 32000| Chicago|
|  3| Bob| 21| 27000|New York|
|  4| Nik| 35| 20999|   India|
+---+----+---+------+--------+

+---+----+---+------+--------+
| id|name|age|salary|    city|
+---+----+---+------+--------+
|  2|Mary| 32| 32000| Chicago|
|  3| Bob| 21| 27000|New York|
+---+----+---+------+--------+

+---+----+---+------+-----------+---------+
| id|name|age|salary|       city|last_name|
+---+----+---+------+-----------+

------------ The Same Operations As Above, Written In Spark SQL

In [0]:
# Read every row from the temporary view registered earlier.
all_rows_query = """
          select * from manual_schema_df_tbl
          """
spark.sql(all_rows_query).show()


# One query combining a column alias, a cast, a constant extra column and a
# row filter.
filtered_query = """
          select id as emp_id, name, cast(age as long), salary, 'deo' as last_name from manual_schema_df_tbl
          where salary > 20000
          """
spark.sql(filtered_query).show()

# Print the schema of a SQL result instead of its rows, to see the effect of
# the cast and of the added literal column.
schema_query = """
          select id as emp_id, name, cast(age as long), 'deo' as last_name from manual_schema_df_tbl
          """
spark.sql(schema_query).printSchema()

+---+----+---+------+-----------+
| id|name|age|salary|       city|
+---+----+---+------+-----------+
|  1|John| 25| 20000|Los Angales|
|  2|Mary| 32| 32000|    Chicago|
|  3| Bob| 21| 27000|   New York|
|  4| Nik| 35| 20999|      India|
+---+----+---+------+-----------+

+------+----+---+------+---------+
|emp_id|name|age|salary|last_name|
+------+----+---+------+---------+
|     2|Mary| 32| 32000|      deo|
|     3| Bob| 21| 27000|      deo|
|     4| Nik| 35| 20999|      deo|
+------+----+---+------+---------+

root
 |-- emp_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- last_name: string (nullable = false)

