In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType,IntegerType

In [2]:
# Reuse the active SparkSession if one exists; otherwise start one whose
# application name is "spark".
spark = (
    SparkSession.builder
    .appName("spark")
    .getOrCreate()
)

In [3]:
# In-memory sample rows, one tuple per person. Positional order:
# first name, middle name, last name, id (kept as a string), gender, salary.
data = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

In [4]:
# Explicit schema for the sample rows: five nullable string columns followed
# by a nullable integer salary column, in the same order as the tuples.
schema = StructType(
    [
        StructField(column, StringType(), True)
        for column in ("firstName", "midName", "lastName", "id", "gender")
    ]
    + [StructField("salary", IntegerType(), True)]
)

In [5]:
# Build the DataFrame from the in-memory rows using the explicit schema,
# then print the resulting column layout.
df = spark.createDataFrame(data, schema)
df.printSchema()

root
 |-- firstName: string (nullable = true)
 |-- midName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [6]:
# Show the rows without truncating cell contents; n=20 is show()'s default
# row limit, spelled out here for clarity.
df.show(n=20, truncate=False)

+---------+-------+--------+-----+------+------+
|firstName|midName|lastName|id   |gender|salary|
+---------+-------+--------+-----+------+------+
|James    |       |Smith   |36636|M     |3000  |
|Michael  |Rose   |        |40288|M     |4000  |
|Robert   |       |Williams|42114|M     |4000  |
|Maria    |Anne   |Jones   |39192|F     |4000  |
|Jen      |Mary   |Brown   |     |F     |-1    |
+---------+-------+--------+-----+------+------+



In [14]:
# Load the flight-summary CSV: the first row supplies the column names and
# the column types are inferred from the data.
# NOTE(review): the DataFrame holds flight summaries despite the `fire_df`
# name; the name is kept because later cells reference it.
file_path = "flight-summary.csv"
fire_df = spark.read.csv(file_path, header=True, inferSchema=True)

In [15]:
# Preview the first 10 rows of the origin-related columns plus the count.
(
    fire_df
    .select("origin_code", "origin_airport", "origin_state", "count")
    .show(n=10)
)

+-----------+--------------------+------------+-----+
|origin_code|      origin_airport|origin_state|count|
+-----------+--------------------+------------+-----+
|        BQN|Rafael Hernández ...|          PR|  441|
|        PHL|Philadelphia Inte...|          PA| 4869|
|        MCI|Kansas City Inter...|          MO| 1698|
|        SPI|Abraham Lincoln C...|          IL|  998|
|        SNA|John Wayne Airpor...|          CA| 3846|
|        LBB|Lubbock Preston S...|          TX|  618|
|        ORD|Chicago O'Hare In...|          IL| 2149|
|        EWR|Newark Liberty In...|          NJ|  239|
|        ATL|Hartsfield-Jackso...|          GA| 2470|
|        MCI|Kansas City Inter...|          MO|  612|
+-----------+--------------------+------------+-----+
only showing top 10 rows



In [19]:
# Print each column's name, type and nullability for the loaded CSV.
fire_df.printSchema()

root
 |-- origin_code: string (nullable = true)
 |-- origin_airport: string (nullable = true)
 |-- origin_city: string (nullable = true)
 |-- origin_state: string (nullable = true)
 |-- dest_code: string (nullable = true)
 |-- dest_airport: string (nullable = true)
 |-- dest_city: string (nullable = true)
 |-- dest_state: string (nullable = true)
 |-- count: integer (nullable = true)



In [17]:
# Display the DataFrame's column names as a Python list.
fire_df.columns

['origin_code',
 'origin_airport',
 'origin_city',
 'origin_state',
 'dest_code',
 'dest_airport',
 'dest_city',
 'dest_state',
 'count']

In [21]:
# Write the flight summary as CSV under `output_path`, replacing whatever a
# previous run left there ("overwrite" mode).
output_path = 'new'
(
    fire_df.write.format("csv")
    # Spark's CSV writer omits the header row by default; write it so the
    # column names survive and the output can be read back with header=True,
    # the same way this notebook reads its input file.
    .option("header", True)
    .mode("overwrite")
    .save(output_path)
)