In [None]:
# Installing required packages
!pip install pyspark
!pip install findspark

In [None]:
# Initialise findspark before the pyspark imports in the next cell.
# NOTE(review): presumably this is what makes the local Spark installation
# importable from this kernel — confirm against the findspark documentation.
import findspark
findspark.init()

In [None]:
# PySpark is the Spark API for Python. In this lab, we use PySpark to initialize the spark context. 
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col, lit, udf, length, substring, expr, regexp_replace, sum_distinct
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, FloatType
from pyspark.sql import functions as F

In [None]:
# Creating a spark session.
# `.getOrCreate()` is what actually builds (or reuses) the SparkSession; without
# it `spark` is only a Builder object and `spark.read` raises AttributeError.
spark = (SparkSession.builder
                     .master("local[7]")
                     .appName("Aceleração PySpark - Capgemini")
                     .getOrCreate())

# Reusing the session's spark context instead of constructing a standalone
# `SparkContext()` first: a context created beforehand is picked up by
# `getOrCreate()` (so the master/appName settings above may be ignored), and
# calling `SparkContext()` again when this cell is re-run raises because only
# one context may be active per kernel.
sc = spark.sparkContext

In [None]:
# Load the airports, flights and planes parquet files into separate DataFrames.
airports_path = 'C:/Users/coskata/Downloads/Datasets/parquet/airports.parquet'
flights_path = 'C:/Users/coskata/Downloads/Datasets/parquet/flights.parquet'
planes_path = 'C:/Users/coskata/Downloads/Datasets/parquet/planes.parquet'

df_airports = spark.read.parquet(airports_path)
df_flights = spark.read.parquet(flights_path)
df_planes = spark.read.parquet(planes_path)

# Qualidade

# Pergunta 1

In [None]:
# Give the planes join key and its QA flag unique names so they do not clash
# with the flights columns of the same name once the frames are joined.
df_planes = (
    df_planes
    .withColumnRenamed("tailnum", "tailnum_planes")
    .withColumnRenamed("qa_tailnum", "qa_tailnum_planes")
)

In [None]:
# Give the flights join key and its QA flag unique names so they do not clash
# with the planes columns of the same name once the frames are joined.
df_flights = (
    df_flights
    .withColumnRenamed("tailnum", "tailnum_flights")
    .withColumnRenamed("qa_tailnum", "qa_tailnum_flights")
)

In [None]:
# Left join: keep every flight row and attach the matching plane (by tail number).
df_j1 = df_flights.join(
    df_planes,
    on=[df_flights.tailnum_flights == df_planes.tailnum_planes],
    how='left',
)

In [None]:
# Left join: attach the airport whose `faa` code equals the flight's `origin`.
df_j2 = df_j1.join(
    df_airports,
    on=[df_j1.origin == df_airports.faa],
    how='left',
)

In [None]:
# Second copy of the airports frame, to be joined on the destination side.
# The listed columns get an `_a` suffix so they stay distinguishable from the
# origin-airport columns already present after the first airports join.
df_airports2 = (
    df_airports
    .withColumnRenamed("faa", "faa_a")
    .withColumnRenamed("name", "name_a")
    .withColumnRenamed("qa_faa", "qa_faa_a")
    .withColumnRenamed("qa_name", "qa_name_a")
    .withColumnRenamed("qa_lat", "qa_lat_a")
    .withColumnRenamed("qa_lon", "qa_lon_a")
    .withColumnRenamed("qa_alt", "qa_alt_a")
    .withColumnRenamed("qa_tz", "qa_tz_a")
    .withColumnRenamed("qa_dst", "qa_dst_a")
)

In [None]:
# Left join: attach the airport whose (renamed) `faa_a` code equals the flight's `dest`.
df_j3 = df_j2.join(
    df_airports2,
    on=[df_j2.dest == df_airports2.faa_a],
    how='left',
)

# Teste do resultado

In [None]:
# Print the first rows of the fully joined frame as a sanity check on the joins above.
df_j3.show()

# Pergunta 2

In [None]:
# Keep only the columns whose name starts with `qa_`.
# NOTE(review): if two of the joined inputs share a `qa_` column name that was
# not renamed before joining, `df_qa` will carry duplicate names and later
# `F.col(name)` lookups become ambiguous — confirm against the input schemas.
qa_column_selector = df_j3.colRegex("`^qa_.*`")
df_qa = df_j3.select(qa_column_selector)

In [None]:
for c in df_qa.columns:
#     print(c)
    df_qa.filter(F.col(c) == 'M').groupBy(F.substring(c, 1, 1).alias(c)).count().show()

In [None]:
for c in df_qa.columns:
#     print(c)
    df_qa.filter(F.col(c) == 'F').groupBy(F.substring(c, 1, 1).alias(c)).count().show()

In [None]:
for c in df_qa.columns:
#     print(c)
    df_qa.filter(F.col(c) == 'I').groupBy(F.substring(c, 1, 1).alias(c)).count().show()

In [None]:
for c in df_qa.columns:
#     print(c)
    df_qa.filter(F.col(c) == 'S').groupBy(F.substring(c, 1, 1).alias(c)).count().show()

In [None]:
for c in df_qa.columns:
#     print(c)
    df_qa.filter(F.col(c) == 'T').groupBy(F.substring(c, 1, 1).alias(c)).count().show()

# Pergunta 3

In [None]:
# For each QA column, count the rows whose value is exactly 'M' (one table per column).
for qa_col in df_qa.columns:
    flagged_rows = df_qa.filter(F.col(qa_col) == 'M')
    first_char_key = F.substring(qa_col, 1, 1).alias(qa_col)
    flagged_rows.groupBy(first_char_key).count().show()
    


# Pergunta 4

In [None]:
# For each QA column, count the rows whose value is exactly 'F' (one table per column).
for qa_col in df_qa.columns:
    flagged_rows = df_qa.filter(F.col(qa_col) == 'F')
    first_char_key = F.substring(qa_col, 1, 1).alias(qa_col)
    flagged_rows.groupBy(first_char_key).count().show()

# Pergunta 5

In [None]:
# For each QA column, count the rows whose value is exactly 'I' (one table per column).
for qa_col in df_qa.columns:
    flagged_rows = df_qa.filter(F.col(qa_col) == 'I')
    first_char_key = F.substring(qa_col, 1, 1).alias(qa_col)
    flagged_rows.groupBy(first_char_key).count().show()

# Negócio

# Pergunta 1

In [None]:
# Reload the three frames from the `*3.parquet` files used by the business questions.
# This rebinds df_airports / df_flights / df_planes from the quality section.
airports_path = 'C:/Users/coskata/Downloads/Datasets/parquet/airports3.parquet'
flights_path = 'C:/Users/coskata/Downloads/Datasets/parquet/flights3.parquet'
planes_path = 'C:/Users/coskata/Downloads/Datasets/parquet/planes3.parquet'

df_airports = spark.read.parquet(airports_path)
df_flights = spark.read.parquet(flights_path)
df_planes = spark.read.parquet(planes_path)

In [None]:
# Rename the planes join key so it stays distinct from `tailnum` in flights after the join.
df_planes = df_planes.withColumnRenamed(existing="tailnum", new="tailnum_p")

In [None]:
# Left join: keep every flight row and attach the matching plane (by tail number).
df_j1 = df_flights.join(
    df_planes,
    on=[df_flights.tailnum == df_planes.tailnum_p],
    how='left',
)

In [None]:
# Left join: attach the airport whose `faa` code equals the flight's `origin`.
df_j2 = df_j1.join(
    df_airports,
    on=[df_j1.origin == df_airports.faa],
    how='left',
)

In [None]:
# Second copy of the airports frame for the destination-side join. Every listed
# column gets an `_a` suffix so it does not collide with the origin-airport
# columns that are already part of the joined frame.
destination_renames = [
    ("faa", "faa_a"),
    ("name", "name_a"),
    ("lat", "lat_a"),
    ("lon", "lon_a"),
    ("alt", "alt_a"),
    ("tz", "tz_a"),
    ("dst", "dst_a"),
    ("region", "region_a"),
    ("type", "type_a"),
    ("military", "military_a"),
    ("administration", "administration_a"),
]

df_airports2 = df_airports
for current_name, suffixed_name in destination_renames:
    df_airports2 = df_airports2.withColumnRenamed(current_name, suffixed_name)

In [None]:
# Left join: attach the airport whose (renamed) `faa_a` code equals the flight's `dest`.
df_j3 = df_j2.join(
    df_airports2,
    on=[df_j2.dest == df_airports2.faa_a],
    how='left',
)

# Pergunta 2

In [None]:
# Number of joined rows per value of `name`.
# NOTE(review): `name` presumably comes from the airports frame joined on
# `origin` (the destination copy was renamed to `name_a`) — confirm.
rows_per_name = df_j3.groupBy('name').count()
rows_per_name.show()

# Pergunta 3

In [None]:
# Absolute difference between the `alt` and `alt_a` columns, stored as `dif_alt`.
altitude_gap = F.abs(expr('alt - alt_a'))
df3_1 = df_j3.withColumn('dif_alt', altitude_gap)

In [None]:
# Number of rows per altitude difference, most frequent first.
# `groupBy(...).count()` already yields exactly one row per key, so no
# `.distinct()` is needed afterwards (it would only add a shuffle).
(df3_1
 .groupBy('dif_alt')
 .count()
 .orderBy("count", ascending=False)
 .show())

In [None]:
# Pergunta 4 

In [None]:
df4_1 = df_j3.groupBy('origin', 'dest').avg('dep_delay')
df4_1.show()

In [None]:
df4_2 = df_j3.groupBy('origin', 'dest').avg('arr_delay')
df4_2.show()

In [None]:
# Pergunta 5

In [None]:
df5_1 = df_j3.groupBy('region').avg('dep_delay')
df5_1.show()

In [None]:
df5_2 = df_j3.groupBy('region').avg('arr_delay')
df5_2.show()

In [None]:
df5_3 = df_j3.groupBy('region_a').avg('dep_delay')
df5_3.show()

In [None]:
df5_4 = df_j3.groupBy('region_a').avg('arr_delay')
df5_4.show()

In [None]:
# Pergunta 6

In [None]:
df6_1 = df_j3.groupBy().sum('dep_delay')
df6_1.show()

In [None]:
df6_2 = df_j3.groupBy().sum('arr_delay')
df6_2.show()

In [None]:
# Pergunta 7

In [None]:
df7_1 = df_j3.groupBy('region').sum('dep_delay')
df7_1.show()

In [None]:
df7_2 = df_j3.groupBy('region').sum('arr_delay')
df7_2.show()

In [None]:
df7_3 = df_j3.groupBy('region_a').sum('dep_delay')
df7_3.show()

In [None]:
df7_4 = df_j3.groupBy('region_a').sum('arr_delay')
df7_4.show()

In [None]:
# Pergunta 8

In [None]:
# Overall mean of `air_time` across all joined rows.
df8 = (
    df_j3
    .groupBy()
    .avg('air_time')
)
df8.show()

In [None]:
# Pergunta 9

In [None]:
df9_1 = df_j3.groupBy('region').avg('air_time')
df9_1.show()

In [None]:
df9_2 = df_j3.groupBy('region_a').avg('air_time')
df9_2.show()

In [None]:
# Pergunta 10

In [None]:
# Mean `air_time` for each (origin, dest) pair.
df10 = (
    df_j3
    .groupBy('origin', 'dest')
    .avg('air_time')
)
df10.show()

In [None]:
# Pergunta 11

In [None]:
# Grand total of `air_time` over all joined rows.
df11 = (
    df_j3
    .groupBy()
    .sum('air_time')
)
df11.show()

In [None]:
# Pergunta 12

In [None]:
df12_1 = df_j3.groupBy('region').sum('air_time')
df12_1.show()

In [None]:
df12_2 = df_j3.groupBy('region_a').sum('air_time')
df12_2.show()

In [None]:
# Pergunta 13

In [None]:
# Overall mean of `distance` across all joined rows.
df13 = (
    df_j3
    .groupBy()
    .avg('distance')
)
df13.show()

In [None]:
# Pergunta 14

In [None]:
df14_1 = df_j3.groupBy('region').avg('air_time')
df14_1.show()

In [None]:
df14_2 = df_j3.groupBy('region_a').avg('air_time')
df14_2.show()

In [None]:
# Pergunta 15

In [None]:
# Mean `distance` for each (origin, dest) pair.
df15 = (
    df_j3
    .groupBy('origin', 'dest')
    .avg('distance')
)
df15.show()

In [None]:
# Pergunta 16

In [None]:
# Grand total of `distance` over all joined rows.
df16 = (
    df_j3
    .groupBy()
    .sum('distance')
)
df16.show()

In [None]:
# Pergunta 17

In [None]:
df17_1 = df_j3.groupBy('region').sum('distance')
df17_1.show()

In [None]:
df17_2 = df_j3.groupBy('region_a').sum('distance')
df17_2.show()

In [None]:
# Pergunta 18

In [None]:
# Mean `seats` for each (origin, dest) pair.
df18 = (
    df_j3
    .groupBy('origin', 'dest')
    .avg('seats')
)
df18.show()

In [None]:
# Pergunta 19

In [None]:
# Total `seats` per calendar year, where the year is extracted from `dep_datetime`.
departure_year = F.year('dep_datetime').alias('ano')
df19 = df_j3.groupBy(departure_year).sum('seats')
df19.show()

In [None]:
# Pergunta 20

In [None]:
# Number of joined rows per destination (code and name), busiest first.
# `groupBy(...).count()` already yields one row per key, so the former
# `.distinct()` was a no-op that only added a shuffle.
df20 = (
    df_j3
    .groupBy('dest', 'name_a')
    .count()
    .orderBy('count', ascending=False)
)
df20.show()

In [None]:
# Pergunta 21

In [None]:
df20 = df_j3.groupBy('dest', 'name_a').count().distinct().orderBy('count',ascending=False)
df20.show()

In [None]:
df21 = df_j3.groupBy('dest', 'name_a').sum('seats').orderBy('sum(seats)',ascending=False)
df21.show()

In [None]:
# Pergunta 22

In [None]:
df22 = df_j3.groupBy('faa', 'faa_a','distance').count().distinct().orderBy('distance', ascending=False)

In [None]:
df22.where((col('faa') == 'PDX')).show()

In [None]:
df22.where(col('faa_a') == 'PDX').show()

In [None]:
# Pergunta 23

In [None]:
# Number of joined rows per destination and month (month taken from `dep_datetime`).
# `groupBy(...).count()` already yields one row per key, so `.distinct()` is not needed.
departure_month = F.month('dep_datetime').alias('mes')
df23 = df_j3.groupBy('dest', departure_month).count()
df23.orderBy('mes').show()

In [None]:
# Pergunta 24

In [None]:
# Number of joined rows per `model`, most frequent first.
# `groupBy(...).count()` already yields one row per key, so `.distinct()` is not needed.
df24 = (
    df_j3
    .groupBy('model')
    .count()
    .orderBy('count', ascending=False)
)
df24.show()

In [None]:
# Pergunta 25

In [None]:
# Number of joined rows per (destination, model) pair, shown most frequent first.
# `groupBy(...).count()` already yields one row per key, so `.distinct()` is not needed.
df25 = df_j3.groupBy('dest', 'model').count()
df25.orderBy('count', ascending=False).show()

In [None]:
# Pergunta 26

In [None]:
# Mean `engines` per `haul_duration` value, shown highest average first.
df26 = (
    df_j3
    .groupBy('haul_duration')
    .avg('engines')
)
df26.orderBy('avg(engines)', ascending=False).show()

In [None]:
# Pergunta 27

In [None]:
# Number of joined rows per `dep_season`, shown most frequent first.
# `groupBy(...).count()` already yields one row per key, so `.distinct()` is not needed.
df27 = df_j3.groupBy('dep_season').count()
df27.orderBy('count', ascending=False).show()

In [None]:
# Pergunta 28

In [None]:
# Number of joined rows per (destination, dep_season) pair, shown most frequent first.
# `groupBy(...).count()` already yields one row per key, so `.distinct()` is not needed.
df28 = df_j3.groupBy('dest', 'dep_season').count()
df28.orderBy('count', ascending=False).show()

In [None]:
# Pergunta 29

In [None]:
# Number of joined rows per `dep_delay_category`, shown most frequent first.
# `groupBy(...).count()` already yields one row per key, so `.distinct()` is not needed.
df29 = df_j3.groupBy('dep_delay_category').count()
df29.orderBy('count', ascending=False).show()

In [None]:
# Pergunta 30

In [None]:
# Number of joined rows per (origin, dest, dep_delay_category), shown most frequent first.
# `groupBy(...).count()` already yields one row per key, so `.distinct()` is not needed.
df30 = df_j3.groupBy('origin', 'dest', 'dep_delay_category').count()
df30.orderBy('count', ascending=False).show()