# DataFrame Operations

## Import Libraries

In [42]:
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as F
from pyspark.sql.types import StructField, StructType, StringType, LongType

## Spark Session

In [25]:
# Obtain a SparkSession (reusing an existing one if present) configured with
# the "local[*]" master and the application name "Test App".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Test App")
    .getOrCreate()
)

# Clear the session catalog's cache before running the cells below.
spark.catalog.clearCache()

In [26]:
# Evaluate the session as the cell's last expression so the notebook renders it.
spark

## Load Data

In [28]:
# Hand-written schema: two nullable string columns followed by one nullable
# long column.
manual_schema = StructType(
    fields=[
        StructField(name="some", dataType=StringType(), nullable=True),
        StructField(name="column", dataType=StringType(), nullable=True),
        StructField(name="names", dataType=LongType(), nullable=True),
    ]
)

In [29]:
# A single record built from positional values (two strings and an integer).
my_row = Row("Hello", "World", 42)

In [30]:
# Build a one-row DataFrame from `my_row`, typed by the explicit schema.
my_df = spark.createDataFrame(
    data=[my_row],
    schema=manual_schema,
)

In [31]:
# Print the manually constructed DataFrame as a text table.
my_df.show()

+-----+------+-----+
| some|column|names|
+-----+------+-----+
|Hello| World|   42|
+-----+------+-----+



In [32]:
# Read every CSV file under the flight-data directory into one DataFrame.
# The first line of each file is treated as the header, and column types are
# inferred from the data rather than declared up front.
df = (
    spark.read
    .format("csv")
    .option("inferSchema", True)
    .option("header", True)
    .load("../../data/flight-data/csv/*.csv")
)

### Select

In [33]:
# Project a single column by name and print the first two rows.
(
    df
    .select("DEST_COUNTRY_NAME")
    .show(n=2)
)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows



In [34]:
# Project two columns by name and print the first three rows.
(
    df
    .select(
        "DEST_COUNTRY_NAME",
        "ORIGIN_COUNTRY_NAME",
    )
    .show(n=3)
)

+-----------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Ireland|
|    United States|              India|
+-----------------+-------------------+
only showing top 3 rows



In [35]:
# Rename both columns on the fly using SQL expressions, then print three rows.
(
    df
    .select(
        F.expr("DEST_COUNTRY_NAME AS DESTINATION"),
        F.expr("ORIGIN_COUNTRY_NAME AS ORIGIN"),
    )
    .show(n=3)
)

+-------------+-------+
|  DESTINATION| ORIGIN|
+-------------+-------+
|United States|Romania|
|United States|Ireland|
|United States|  India|
+-------------+-------+
only showing top 3 rows



In [37]:
# Keep every original column and append two derived ones:
#   LOCAL_FLIGHT - true when the destination and origin country are equal
#   MULTI_FLIGHT - the `count` column multiplied by 10
# `.show(5)` is required to print a table; without it the cell would only
# display the DataFrame's repr.
# NOTE(review): the saved output lists only rows where LOCAL_FLIGHT is true,
# so it was presumably captured from a filtered or sorted variant of this
# cell - re-run to refresh it.
df.selectExpr(
    "*",
    "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) AS LOCAL_FLIGHT",
    "(count * 10) AS MULTI_FLIGHT",
).show(5)

+-----------------+-------------------+------+------------+------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|LOCAL_FLIGHT|MULTI_FLIGHT|
+-----------------+-------------------+------+------------+------------+
|    United States|      United States|348113|        true|     3481130|
|    United States|      United States|370002|        true|     3700020|
|    United States|      United States|352742|        true|     3527420|
|    United States|      United States|343132|        true|     3431320|
|    United States|      United States|347452|        true|     3474520|
+-----------------+-------------------+------+------------+------------+
only showing top 5 rows



In [41]:
# Whole-table aggregates written as SQL expressions: the average, maximum and
# minimum of `count`, plus the number of distinct origin countries.
(
    df
    .selectExpr(
        "AVG(count)",
        "MAX(count)",
        "MIN(count)",
        "COUNT(DISTINCT ORIGIN_COUNTRY_NAME)",
    )
    .show()
)

+------------------+----------+----------+-----------------------------------+
|        avg(count)|max(count)|min(count)|count(DISTINCT ORIGIN_COUNTRY_NAME)|
+------------------+----------+----------+-----------------------------------+
|1718.3189081225032|    370002|         1|                                154|
+------------------+----------+----------+-----------------------------------+

