## Select Columns From DataFrame

The `select()` function is used to select a single column, multiple columns, columns by index, all columns from a list, and nested columns from a DataFrame. PySpark `select()` is a transformation, so it returns a new DataFrame containing the selected columns.

In [0]:
# Restart the Python process. This removes all Python state, but some libraries
# might not work without calling this command.
dbutils.library.restartPython()

#### Load libraries

In [0]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import IntegerType, DateType, StringType, StructType, StructField, ArrayType, MapType
from pyspark.sql.functions import lit, col, expr, when

#### Create Spark session

In [0]:
# Get the existing Spark session, or create one registered under this app name.
spark = (
    SparkSession.builder
    .appName('Select Columns From DataFrame')
    .getOrCreate()
)

#### Prepare Data

In [0]:
# Sample records: (firstname, middlename, lastname, zip, gender, salary).
data = [
    ('John', '', 'Smith', '36636', 'M', 2500),
    ('Jane', '', 'Doe', '42114', 'F', 500),
    ('Richard', 'Laurence', 'Marquette', '97086', 'M', 1500),
    ('Israel', '', 'Israeli', '', 'M', 3000),
    ('Edward', 'III', '', 'SL4', 'M', 5000),
]

# All fields are nullable strings except salary, which is a nullable integer.
schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ('firstname', 'middlename', 'lastname', 'zip', 'gender')
    ]
    + [StructField('salary', IntegerType(), True)]
)

# Column names in schema order.
columns = schema.fieldNames()

# Build the DataFrame, then print its schema and its rows without truncation.
df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()
df.show(truncate=False)

#### Select Single & Multiple Columns

In [0]:
# Select columns by passing their names as strings.
df.select('firstname', 'lastname').show()

In [0]:
# Select the same columns through attribute access on the DataFrame.
df.select(df.firstname, df.lastname).show()

In [0]:
# Select the same columns by indexing the DataFrame with the column name.
df.select(df['firstname'], df['lastname']).show()

In [0]:
# Select the same columns using the col() function.
df.select(col('firstname'), col('lastname')).show()

In [0]:
# Select columns whose names match a regular expression; the pattern is wrapped
# in backticks inside the string.
# NOTE(review): in a regex, `name*` means "nam" followed by zero or more "e",
# not a glob wildcard. Confirm whether `.*name.*` (contains "name") or
# `.*name$` (ends with "name") was intended.
df.select(df.colRegex('`^.*name*`')).show()

#### Select All Columns From List

In [0]:
# Unpack the list of column names so each name is passed as a separate argument.
df.select(*columns).show()

In [0]:
# Build a DataFrame with a struct column (`prop`) by nesting Row objects.
data1 = [
    Row(name='John', prop=Row(hair='black', eye='brown')),
    Row(name='Marie', prop=Row(hair='blond', eye='black')),
]

df1 = spark.createDataFrame(data1)
df1.printSchema()

In [0]:
# Three ways of referencing the nested field `hair` inside the `prop` struct.
df1.select(
    df1.prop.hair,     # attribute access on the struct column
    df1['prop.hair'],  # dotted path through DataFrame indexing
    col('prop.hair'),  # dotted path through col()
).show()

In [0]:
# Select every field of the `prop` struct using the `.*` wildcard.
df1.select(col('prop.*')).show()

In [0]:
# Select all columns by passing the DataFrame's own list of column names.
df1.select(df1.columns).show()

In [0]:
# Select all columns using the '*' wildcard.
df1.select('*').show()

#### Select Columns by Index

In [0]:
# Select the first 3 columns (a slice of df.columns) and show the top 3 rows.
df.select(df.columns[:3]).show(3)

In [0]:
# Select the columns at indexes 2 and 3 (the slice end, 4, is exclusive),
# i.e. the third and fourth columns, and show the top 3 rows.
df.select(df.columns[2:4]).show(3)

#### Select Nested Struct Columns from PySpark

In [0]:
# Sample records: ((firstname, middlename, lastname), zip, gender, salary).
# The inner tuple populates the nested `name` struct.
dataNested = [
    (('John', '', 'Smith'), '36636', 'M', 2500),
    (('Jane', '', 'Doe'), '42114', 'F', 500),
    (('Richard', 'Laurence', 'Marquette'), '97086', 'M', 1500),
    (('Israel', '', 'Israeli'), '', 'M', 3000),
    (('Edward', 'III', ''), 'SL4', 'M', 5000),
]

# `name` is a struct of three nullable string fields; the rest are flat columns.
schemaNested = StructType([
    StructField(
        'name',
        StructType([
            StructField(part, StringType(), True)
            for part in ('firstname', 'middlename', 'lastname')
        ]),
    ),
    StructField('zip', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', IntegerType(), True),
])

# Build the DataFrame, then print its schema and its rows without truncation.
dfNested = spark.createDataFrame(data=dataNested, schema=schemaNested)
dfNested.printSchema()
dfNested.show(truncate=False)

In [0]:
# Select the whole `name` struct as a single column.
dfNested.select('name').show(truncate=False)

In [0]:
# Select individual fields of the `name` struct using dotted paths.
dfNested.select('name.firstname', 'name.lastname').show(truncate=False)

In [0]:
# Select every field of the `name` struct using the `.*` wildcard.
dfNested.select('name.*').show(truncate=False)

#### The end of the notebook