In [1]:
!pip install pyspark
!pip install findspark



In [2]:
# findspark.init() has to run BEFORE the first pyspark import: its job is to
# make the Spark installation importable, so calling it after the pyspark
# imports (as this cell originally did) cannot help those imports succeed.
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, expr, when
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [3]:
# Create (or reuse) the Spark session first. getOrCreate() makes this cell safe
# to re-run, whereas a bare SparkContext() raises on the second execution
# because only one context may be active per process. Building the session
# first also lets the builder's appName/config apply to the context it creates.
spark = (
    SparkSession.builder
    .appName("Python Spark DataFrames basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Keep `sc` available under its original name; it is the session's own context.
sc = spark.sparkContext

In [4]:
# Explicit schema applied when reading airports.csv; every field is nullable.
airport_schema = (
    StructType()
    .add('faa', StringType(), True)
    .add('name', StringType(), True)
    .add('lat', FloatType(), True)
    .add('lon', FloatType(), True)
    .add('alt', IntegerType(), True)
    .add('tz', FloatType(), True)
    .add('dst', StringType(), True)
)

# The first line of the file is treated as a header row.
df_airport = spark.read.csv('airports.csv', schema=airport_schema, header='true')

df_airport.printSchema()

AnalysisException: Path does not exist: file:/C:/Users/macalbuq/airports.csv

In [None]:
# TODO: add the remaining rows later so that every rule gets exercised.
def rows_for_test(df):
    """Return `df` with three extra rows appended for exercising the QA rules.

    The appended rows are: one with every column NULL, one with every column
    set to the empty string, and one partially filled row containing malformed
    numeric text.

    The rows are first created as plain strings and then cast column by column
    to the types of `df`, so the union is between frames with identical
    schemas. The original version passed only column names to
    createDataFrame (forcing type inference) and supplied a 5-value tuple for
    a 7-column frame, which does not line up with `df.columns`.

    Parameters:
        df: DataFrame to extend; its column names and types drive the new rows.

    Returns:
        A new DataFrame: `df` followed by the three test rows.
    """
    width = len(df.columns)

    # Partially filled row; padded with None up to the width of `df`.
    partial_row = ('AAA', None, None, '-80.Aa6195833', '1044Aa')

    vals = [
        (None,) * width,
        ('',) * width,
        partial_row[:width] + (None,) * (width - len(partial_row)),
    ]

    # Build the rows as strings so no type inference is needed.
    string_schema = StructType(
        [StructField(name, StringType(), True) for name in df.columns]
    )
    new_rows = spark.createDataFrame(vals, string_schema)

    # Align the new rows with the target schema before the positional union.
    # NOTE(review): with ANSI mode disabled a failed cast yields NULL, so the
    # malformed numeric strings end up as NULL in numeric columns - confirm
    # this is acceptable, or read the CSV as strings to test the 'A' rules.
    new_rows = new_rows.select(
        [F.col(field.name).cast(field.dataType).alias(field.name)
         for field in df.schema.fields]
    )

    return df.union(new_rows)

df_airport = rows_for_test(df_airport)

### Airport - Perguntas

#### Pergunta 1

In [None]:
# Quality flag for `faa`:
#   'M' when the value is NULL, empty, or contains a tab or a space;
#   'F' when the value is shorter than 3, longer than 5, or not alphanumeric.
# The original 'F' test required length < 3 AND length > 5 at the same time
# (never true) and used the pattern '\d*$', which matches every string.
# NOTE(review): assumes the intended rule is "3 to 5 alphanumeric
# characters" - confirm against the exercise statement.
faa_is_missing = (
    F.col('faa').isNull() |
    (F.col('faa') == '') |
    F.col('faa').rlike('\t') |
    F.col('faa').rlike(' +')
)

faa_bad_format = (
    (F.length(F.col('faa')) < 3) |
    (F.length(F.col('faa')) > 5) |
    (~F.col('faa').rlike('^[a-zA-Z0-9]*$'))
)

df_airport = df_airport.withColumn(
    "qa_faa",
    F.when(faa_is_missing, 'M').when(faa_bad_format, 'F'))

df_airport.show(5)

#### Pergunta 2

In [None]:
# Quality flag for `name`: 'M' when the value is NULL or an empty string,
# NULL (no flag) otherwise.
name_is_missing = F.col('name').isNull() | (F.col('name') == '')
qa_name_column = F.when(name_is_missing, 'M')

df_airport = df_airport.withColumn('qa_name', qa_name_column)

df_airport.show(5)

#### Pergunta 3

In [None]:
# Quality flag for `lat`:
#   'M' when NULL or empty;
#   'I' when <= -180 or >= 180;
#   'A' when the text form is made of word characters including a letter.
# Fixes: `qa_lat_F.column` / `withF.column` were find-and-replace damage
# (not valid names), and the pattern was wrapped in JavaScript-style '/.../'
# delimiters, which Spark's Java regex treats as literal slashes.
# NOTE(review): geographic latitude normally spans [-90, 90]; the +/-180
# bounds are kept from the original - confirm the intended range.
qa_lat_column = (
    F.when(
        (F.col('lat').isNull()) |
        (F.col('lat') == ''), 'M')
    .when(
        (F.col('lat') <= -180) |
        (F.col('lat') >= 180), 'I')
    .when(
        # Explicit cast: `lat` is read as a float, rlike works on strings.
        F.col('lat').cast('string').rlike(r'^\w*?[a-zA-Z]\w*$'), 'A')
)

df_airport = df_airport.withColumn('qa_lat', qa_lat_column)

df_airport.show(5)

#### Pergunta 4

In [None]:
# Quality flag for `lon`:
#   'M' when NULL or empty;
#   'I' when <= -180 or >= 180;
#   'A' when the text form is made of word characters including a letter.
# Fix: the pattern used JavaScript-style '/.../' delimiters, which Spark's
# Java regex treats as literal slashes, so the 'A' branch could never match.
qa_lon_column = (
    F.when(
        (F.col('lon').isNull()) |
        (F.col('lon') == ''), 'M')
    .when(
        (F.col('lon') <= -180) |
        (F.col('lon') >= 180), 'I')
    .when(
        # Explicit cast: `lon` is read as a float, rlike works on strings.
        F.col('lon').cast('string').rlike(r'^\w*?[a-zA-Z]\w*$'), 'A')
)

df_airport = df_airport.withColumn('qa_lon', qa_lon_column)

df_airport.show(5)

#### Pergunta 5

In [None]:
# Quality flag for `alt`:
#   'M' when NULL or empty;
#   'I' when negative;
#   'A' when the text form is made of word characters including a letter.
# Fix: the pattern used JavaScript-style '/.../' delimiters, which Spark's
# Java regex treats as literal slashes, so the 'A' branch could never match.
qa_alt_column = (
    F.when(
        (F.col('alt').isNull()) |
        (F.col('alt') == ''), 'M')
    .when(
        F.col('alt') < 0, 'I')
    .when(
        # Explicit cast: `alt` is read as an integer, rlike works on strings.
        F.col('alt').cast('string').rlike(r'^\w*?[a-zA-Z]\w*$'), 'A')
)

df_airport = df_airport.withColumn('qa_alt', qa_alt_column)

df_airport.show(5)

#### Pergunta 6

In [None]:
# Quality flag for `tz`:
#   'M' when NULL or empty;
#   'I' when below -11 or above 14;
#   'A' when the text form is made of word characters including a letter.
# Fix: the pattern used JavaScript-style '/.../' delimiters, which Spark's
# Java regex treats as literal slashes, so the 'A' branch could never match.
qa_tz_column = (
    F.when(
        (F.col('tz').isNull()) |
        (F.col('tz') == ''), 'M')
    .when(
        (F.col('tz') < -11) |
        (F.col('tz') > 14), 'I')
    .when(
        # Explicit cast: `tz` is read as a float, rlike works on strings.
        F.col('tz').cast('string').rlike(r'^\w*?[a-zA-Z]\w*$'), 'A')
)

df_airport = df_airport.withColumn('qa_tz', qa_tz_column)

df_airport.show(5)

#### Pergunta 7

In [None]:
# Quality flag for `dst`:
#   'M' when NULL or empty;
#   'A' when the value consists only of digits;
#   'C' when the value is not one of the accepted category codes.
# The digits check now runs BEFORE the category check: a purely numeric value
# is never in `category_list`, so in the original order it was always flagged
# 'C' and the 'A' branch was unreachable.
category_list = ['E', 'A', 'S', 'O', 'Z', 'N', 'U']
qa_dst_column = (
    F.when(
        (F.col('dst').isNull()) |
        (F.col('dst') == ''), 'M')
    .when(
        F.col('dst').rlike('^[0-9]+$'), 'A')
    .when(
        ~F.col('dst').isin(category_list), 'C')
)

df_airport = df_airport.withColumn('qa_dst', qa_dst_column)

df_airport.show(5)

# Planes - Perguntas

In [None]:
# Explicit schema applied when reading planes.csv; every field is nullable.
schema_types = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ('tailnum', StringType()),
        ('year', IntegerType()),
        ('type', StringType()),
        ('manufacturer', StringType()),
        ('model', StringType()),
        ('engines', IntegerType()),
        ('seats', IntegerType()),
        ('speed', IntegerType()),
        ('engine', StringType()),
    ]
])

# The first line of the file is treated as a header row.
df_planes = spark.read.csv('planes.csv', schema=schema_types, header='true')

df_planes.printSchema()

#### Pergunta 1

In [None]:
# Quality flag for `tailnum` (first matching rule wins):
#   'M'  when NULL or empty;
#   'S'  when the length is not 5;
#   'A'  when the value consists only of digits;
#   'FN' when it neither starts with 'N' nor ends with 'Z';
#   'FE' when the first character is 'I', 'O' or '0'.
# Fixes:
#   * substr(-1, -1) asked for a negative length and therefore returned an
#     empty string, so the "ends with 'Z'" test was always true; the last
#     character is substr(-1, 1).
#   * the invalid-character list mixed the integer 0 with strings although it
#     is compared against a one-character string; it now holds '0'.
# NOTE(review): 'FE' is only reachable for values that end with 'Z' and do not
# start with 'N' - confirm whether it should inspect a different position.
invalid_characters_list = ['I', 'O', '0']
qa_tailnum_column = (
    F.when(
        (F.col('tailnum').isNull()) |
        (F.col('tailnum') == ''), 'M')
    .when(
        F.length(F.col('tailnum')) != 5, 'S')
    .when(
        F.col('tailnum').rlike('^[0-9]*$'), 'A')
    .when(
        (F.col('tailnum').substr(1, 1) != 'N') &
        (F.col('tailnum').substr(-1, 1) != 'Z'), 'FN')
    .when(
        F.col('tailnum').substr(1, 1).isin(invalid_characters_list), 'FE')
)

df_planes = df_planes.withColumn('qa_tailnum', qa_tailnum_column)

df_planes.show(5)

#### Pergunta 2

In [None]:
# Quality flag for `year`: 'M' when NULL or empty, 'I' when below 1950,
# NULL (no flag) otherwise.
year_is_missing = F.col('year').isNull() | (F.col('year') == '')
year_before_1950 = F.col('year') < 1950

qa_year_column = F.when(year_is_missing, 'M').when(year_before_1950, 'I')

df_planes = df_planes.withColumn('qa_year', qa_year_column)

df_planes.show(5)

#### Pergunta 3

In [None]:
# Quality flag for `type`: 'M' when NULL or empty, 'C' when the value is not
# one of the accepted categories, NULL (no flag) otherwise.
categorys_list = [
    'Fixed wing multi engine',
    'Fixed wing single engine',
    'Rotorcraft'
]

qa_type_column = (
    F.when(
        (F.col('type').isNull()) |
        (F.col('type') == ''), 'M')
    .when(
        ~F.col('type').isin(categorys_list), 'C')
)

# Fix: the result used to be assigned to an unrelated `df` variable, so
# `df_planes` never received the 'qa_type' column and the show() below
# displayed the frame without it.
df_planes = df_planes.withColumn('qa_type', qa_type_column)

df_planes.show(5)

#### Pergunta 4

In [None]:
# Accepted values for the `manufacturer` column.
categorys_list = [
    'AIRBUS',
    'AIRBUS INDUSTRIE',
    'BOEING',
    'BOMBARDIER',
    'CESSNA',
    'EMBRAER',
    'SIKORSKY',
    'CANADAIR',
    'PIPER',
    'MCDONNELL DOUGLAS',
    'CIRRUS',
    'BELL',
    'KILDALL GARY',
    'LAMBERT RICHARD',
    'BARKER JACK',
    'ROBINSON HELICOPTER',
    'GULFSTREAM',
    'MARZ BARRY'
]

# 'M' when NULL or empty, 'C' when not in the accepted list, NULL otherwise.
manufacturer_is_missing = (
    F.col('manufacturer').isNull() | (F.col('manufacturer') == '')
)
manufacturer_not_listed = ~F.col('manufacturer').isin(categorys_list)

qa_manufacturer_column = (
    F.when(manufacturer_is_missing, 'M')
     .when(manufacturer_not_listed, 'C')
)

df_planes = df_planes.withColumn('qa_manufacturer', qa_manufacturer_column)

df_planes.show(5)

#### Pergunta 5

In [None]:
# Quality flag for `model`:
#   'M' when NULL or empty;
#   'F' when the model prefix does not fit the manufacturer:
#       AIRBUS                -> must start with 'A'
#       BOEING                -> must start with '7'
#       BOMBARDIER / CANADAIR -> must start with 'CL'
#       MCDONNELL DOUGLAS     -> must start with 'MD' or 'DC'
# Fixes:
#   * `A | B & C` parses as `A | (B & C)` because & binds tighter than |, so
#     every BOMBARDIER row was flagged regardless of its model; the two
#     manufacturers are now grouped before the prefix test.
#   * `prefix != 'MD' | prefix != 'DC'` is true for every value (nothing equals
#     both), so every MCDONNELL DOUGLAS row was flagged; it must be an AND.
model_prefix_1 = F.col('model').substr(1, 1)
model_prefix_2 = F.col('model').substr(1, 2)

qa_model_column = (
    F.when(
        (F.col('model').isNull()) |
        (F.col('model') == ''), 'M')
    .when(
        (F.col('manufacturer').contains('AIRBUS')) &
        (model_prefix_1 != 'A'), 'F')
    .when(
        (F.col('manufacturer').contains('BOEING')) &
        (model_prefix_1 != '7'), 'F')
    .when(
        ((F.col('manufacturer').contains('BOMBARDIER')) |
         (F.col('manufacturer').contains('CANADAIR'))) &
        (model_prefix_2 != 'CL'), 'F')
    .when(
        (F.col('manufacturer').contains('MCDONNELL DOUGLAS')) &
        ((model_prefix_2 != 'MD') & (model_prefix_2 != 'DC')), 'F')
)

df_planes = df_planes.withColumn('qa_model', qa_model_column)

df_planes.show(5)

#### Pergunta 6

In [None]:
# Quality flag for `engines`:
#   'M' when NULL or empty;
#   'I' when below 1 or above 4;
#   'A' when the text form is made of word characters including a letter.
# Fixes: the range test required the value to be < 1 AND > 4 at once (never
# true), and the pattern used JavaScript-style '/.../' delimiters, which
# Spark's Java regex treats as literal slashes.
qa_engines_column = (
    F.when(
        (F.col('engines').isNull()) |
        (F.col('engines') == ''), 'M')
    .when(
        (F.col('engines') < 1) | (F.col('engines') > 4), 'I')
    .when(
        # Explicit cast: `engines` is read as an integer, rlike needs a string.
        F.col('engines').cast('string').rlike(r'^\w*?[a-zA-Z]\w*$'), 'A')
)

df_planes = df_planes.withColumn('qa_engines', qa_engines_column)

df_planes.show(5)

#### Pergunta 7

In [None]:
# Quality flag for `seats`:
#   'M' when NULL or empty;
#   'I' when below 2 or above 500;
#   'A' when the text form is made of word characters including a letter.
# Fixes: the upper bound was tested on the `engines` column instead of
# `seats`, the two bounds were joined with AND instead of OR, and the pattern
# used JavaScript-style '/.../' delimiters (literal slashes in Java regex).
qa_seats_column = (
    F.when(
        (F.col('seats').isNull()) |
        (F.col('seats') == ''), 'M')
    .when(
        (F.col('seats') < 2) | (F.col('seats') > 500), 'I')
    .when(
        # Explicit cast: `seats` is read as an integer, rlike needs a string.
        F.col('seats').cast('string').rlike(r'^\w*?[a-zA-Z]\w*$'), 'A')
)

df_planes = df_planes.withColumn('qa_seats', qa_seats_column)

df_planes.show(5)

#### Pergunta 8

In [None]:
# Quality flag for `speed`:
#   'M' when NULL or empty;
#   'I' when below 50 or above 150;
#   'A' when the text form is made of word characters including a letter.
# Fixes: the range test required the value to be < 50 AND > 150 at once
# (never true), and the pattern used JavaScript-style '/.../' delimiters,
# which Spark's Java regex treats as literal slashes.
qa_speed_column = (
    F.when(
        (F.col('speed').isNull()) |
        (F.col('speed') == ''), 'M')
    .when(
        (F.col('speed') < 50) | (F.col('speed') > 150), 'I')
    .when(
        # Explicit cast: `speed` is read as an integer, rlike needs a string.
        F.col('speed').cast('string').rlike(r'^\w*?[a-zA-Z]\w*$'), 'A')
)

df_planes = df_planes.withColumn('qa_speed', qa_speed_column)

df_planes.show(5)

#### Pergunta 9

In [None]:
# Accepted values for the `engine` column.
category_list = [
    'Turbo-fan',
    'Turbo-jet',
    'Turbo-prop',
    'Turbo-shaft',
    '4 Cycle'
]

# 'M' when NULL or empty, 'C' when not in the accepted list, NULL otherwise.
engine_is_missing = F.col('engine').isNull() | (F.col('engine') == '')
engine_not_listed = ~F.col('engine').isin(category_list)

qa_engine_column = F.when(engine_is_missing, 'M').when(engine_not_listed, 'C')

df_planes = df_planes.withColumn('qa_engine', qa_engine_column)

df_planes.show(5)

# Flights - Perguntas

In [None]:
# Explicit schema applied when reading flights.csv; every field is nullable.
flights_schema = (
    StructType()
    .add('year', IntegerType(), True)
    .add('month', IntegerType(), True)
    .add('day', IntegerType(), True)
    .add('hour', IntegerType(), True)
    .add('minute', IntegerType(), True)
    .add('dep_time', StringType(), True)
    .add('arr_time', StringType(), True)
    .add('dep_delay', IntegerType(), True)
    .add('arr_delay', IntegerType(), True)
    .add('carrier', StringType(), True)
    .add('tailnum', StringType(), True)
    .add('flight', StringType(), True)
    .add('origin', StringType(), True)
    .add('dest', StringType(), True)
    .add('air_time', FloatType(), True)
    .add('distance', FloatType(), True)
)

# The first line of the file is treated as a header row.
df_flights = spark.read.csv('flights.csv', schema=flights_schema, header='true')

df_flights.printSchema()
df_flights.show()

#### Pergunta 1

In [None]:
# Quality flag for the date columns (first matching rule wins):
#   'MY' / 'MM' / 'MD' when year / month / day is NULL or empty;
#   'IY' when year is below 1950;
#   'IM' when month is outside 1..12;
#   'ID' when day is outside 1..31, or outside 1..29 for month 2.
# Fix: `a & b | c` parses as `(a & b) | c` because & binds tighter than |.
# As written, the February rule's `day > 29` applied to every month, so day
# 30/31 of any month was flagged. The day bounds are now grouped together
# before being combined with the month test.
qa_year_month_day_column = (
    F.when(
        (F.col('year').isNull()) |
        (F.col('year') == ''), 'MY')
    .when(
        (F.col('month').isNull()) |
        (F.col('month') == ''), 'MM')
    .when(
        (F.col('day').isNull()) |
        (F.col('day') == ''), 'MD')
    .when(
        F.col('year') < 1950, 'IY')
    .when(
        (F.col('month') < 1) |
        (F.col('month') > 12), 'IM')
    .when(
        (F.col('month') != 2) &
        ((F.col('day') < 1) | (F.col('day') > 31)), 'ID')
    .when(
        (F.col('month') == 2) &
        ((F.col('day') < 1) | (F.col('day') > 29)), 'ID')
)

df_flights = df_flights.withColumn('qa_year_month_day', qa_year_month_day_column)

df_flights.show(5)

#### Pergunta 2

In [None]:
# Quality flag for `hour` / `minute` (first matching rule wins):
#   'MH' / 'MM' when hour / minute is NULL or empty;
#   'IH' when hour is outside 0..24;
#   'IM' when minute is outside 0..59.
# Fix: both columns are read as integers, but the original sliced them as if
# they held an 'HMM'/'HHMM' string (length 3 or 4), and its minute checks
# sliced `hour` instead of `minute`. Integer hours and minutes do not have 3
# or 4 digits, so no valid-looking row could ever be flagged. The values are
# now range-checked directly; the bounds (24 and 59) are the original ones.
# NOTE(review): hour 24 is accepted, as in the original - confirm whether
# midnight is encoded as 24 in this dataset.
qa_hour_minute_column = (
    F.when(
        (F.col('hour').isNull()) |
        (F.col('hour') == ''), 'MH')
    .when(
        (F.col('minute').isNull()) |
        (F.col('minute') == ''), 'MM')
    .when(
        (F.col('hour') < 0) | (F.col('hour') > 24), 'IH')
    .when(
        (F.col('minute') < 0) | (F.col('minute') > 59), 'IM')
)

df_flights = df_flights.withColumn('qa_hour_minute', qa_hour_minute_column)

df_flights.show(5)

#### Pergunta 3

In [None]:
# Quality flag for `dep_time` / `arr_time` (first matching rule wins):
#   'MD' / 'MA' when dep_time / arr_time is NULL or empty;
#   'FD' / 'FA' when dep_time / arr_time does not match the time pattern.
time_pattern = '^([0-1]?[0-9]|2[0-3])[0-5][0-9]$'

dep_time_missing = F.col('dep_time').isNull() | (F.col('dep_time') == '')
arr_time_missing = F.col('arr_time').isNull() | (F.col('arr_time') == '')

qa_dep_arr_time_column = (
    F.when(dep_time_missing, 'MD')
     .when(arr_time_missing, 'MA')
     .when(~F.col('dep_time').rlike(time_pattern), 'FD')
     .when(~F.col('arr_time').rlike(time_pattern), 'FA')
)

df_flights = df_flights.withColumn('qa_dep_arr_time', qa_dep_arr_time_column)

df_flights.show(5)

#### Pergunta 4

In [None]:
# Quality flag for `dep_delay` / `arr_delay`:
#   'MD' when dep_delay is NULL or empty;
#   'MA' when arr_delay is NULL or empty.
qa_dep_arr_delay_column = (
    F.when(
        (F.col('dep_delay').isNull()) |
        (F.col('dep_delay') == ''), 'MD')
    .when(
        (F.col('arr_delay').isNull()) |
        (F.col('arr_delay') == ''), 'MA')
)

# Fix: the column used to be built from `qa_dep_arr_time_column` (the
# departure/arrival TIME rule from another cell), so 'qa_dep_arr_delay'
# carried time flags and the expression defined above was never used.
df_flights = df_flights.withColumn('qa_dep_arr_delay', qa_dep_arr_delay_column)

df_flights.show(5)

#### Pergunta 5

In [None]:
# Quality flag for `carrier`:
#   'M' when NULL or empty;
#   'F' when the value is not made of word characters including a letter, or
#       its length is not 2.
# Fixes:
#   * the pattern used JavaScript-style '/.../' delimiters, which Spark's Java
#     regex treats as literal slashes, so it never matched;
#   * the length was measured on `tailnum` instead of `carrier`;
#   * the two tests were joined with AND, so a value had to fail both to be
#     flagged.
# NOTE(review): assumes the rule is "two alphanumeric characters" - confirm.
qa_carrier_column = (
    F.when(
        (F.col('carrier').isNull()) |
        (F.col('carrier') == ''), 'M')
    .when(
        (~F.col('carrier').rlike(r'^\w*?[a-zA-Z]\w*$')) |
        (F.length(F.col('carrier')) != 2), 'F')
)

df_flights = df_flights.withColumn('qa_carrier', qa_carrier_column)

df_flights.show(5)

#### Pergunta 6

In [None]:
# Quality flag for `tailnum` (first matching rule wins):
#   'M'  when NULL or empty;
#   'S'  when the length is not 5;
#   'A'  when the value consists only of digits;
#   'FN' when it neither starts with 'N' nor ends with 'Z';
#   'FE' when the first character is 'I', 'O' or '0'.
# Fixes:
#   * substr(-1, -1) asked for a negative length and therefore returned an
#     empty string, so the "ends with 'Z'" test was always true; the last
#     character is substr(-1, 1).
#   * the invalid-character list mixed the integer 0 with strings although it
#     is compared against a one-character string; it now holds '0'.
# NOTE(review): 'FE' is only reachable for values that end with 'Z' and do not
# start with 'N' - confirm whether it should inspect a different position.
invalid_characters_list = ['I', 'O', '0']
qa_tailnum_column = (
    F.when(
        (F.col('tailnum').isNull()) |
        (F.col('tailnum') == ''), 'M')
    .when(
        F.length(F.col('tailnum')) != 5, 'S')
    .when(
        F.col('tailnum').rlike('^[0-9]*$'), 'A')
    .when(
        (F.col('tailnum').substr(1, 1) != 'N') &
        (F.col('tailnum').substr(-1, 1) != 'Z'), 'FN')
    .when(
        F.col('tailnum').substr(1, 1).isin(invalid_characters_list), 'FE')
)

df_flights = df_flights.withColumn('qa_tailnum', qa_tailnum_column)

df_flights.show(5)

#### Pergunta 7

In [None]:
# Quality flag for `flight`:
#   'M' when NULL or empty;
#   'F' when the length is not 4 or the value is not all digits.
# Fixes:
#   * a misplaced parenthesis computed F.length(<flight != 4>), i.e. the
#     length of a boolean expression, instead of comparing the length to 4;
#   * `a & b == False` parses as `(a & b) == False` because & binds tighter
#     than ==; each test is now parenthesised and negated on its own;
#   * the two tests were joined with AND, so a value had to fail both.
# NOTE(review): assumes the rule is "exactly four digits" - confirm.
qa_flight_column = (
    F.when(
        (F.col('flight').isNull()) |
        (F.col('flight') == ''), 'M')
    .when(
        (F.length(F.col('flight')) != 4) |
        (~F.col('flight').rlike('^[0-9]*$')), 'F')
)

df_flights = df_flights.withColumn('qa_flight', qa_flight_column)

# Row count per flag. count() on the grouped frame counts rows, so the group
# of unflagged rows (NULL flag) shows its real size; F.count('qa_flight')
# skips NULLs and reported 0 for that group.
df_flights.groupBy('qa_flight').count().show()

#### Pergunta 8

In [None]:
# Quality flag for `origin` / `dest` (first matching rule wins):
#   'M' when origin or dest is NULL or empty;
#   'F' when origin or dest does not have length 3, or is not made of word
#       characters including a letter.
# Fixes:
#   * `a & b == False` parses as `(a & b) == False` because & binds tighter
#     than ==, so the original flagged every row where the combined test was
#     not true;
#   * the pattern used JavaScript-style '/.../' delimiters, which Spark's Java
#     regex treats as literal slashes, so it never matched;
#   * the length and pattern tests are now joined with OR, so failing either
#     one is enough to flag the value.
# NOTE(review): assumes the rule is "three alphanumeric characters" - confirm.
qa_origin_dest_column = (
    F.when(
        (F.col('origin').isNull()) |
        (F.col('origin') == ''), 'M')
    .when(
        (F.col('dest').isNull()) |
        (F.col('dest') == ''), 'M')
    .when(
        (F.length(F.col('origin')) != 3) |
        (~F.col('origin').rlike(r'^\w*?[a-zA-Z]\w*$')), 'F')
    .when(
        (F.length(F.col('dest')) != 3) |
        (~F.col('dest').rlike(r'^\w*?[a-zA-Z]\w*$')), 'F')
)

df_flights = df_flights.withColumn('qa_origin_dest', qa_origin_dest_column)

df_flights.show(5)

#### Pergunta 9

In [None]:
# Quality flag for `air_time`:
#   'M' when NULL or empty;
#   'I' when below 20 or above 500.
# Fix: the range test required the value to be < 20 AND > 500 at the same
# time, which is never true, so 'I' could not be produced.
qa_air_time_column = (
    F.when(
        (F.col('air_time').isNull()) |
        (F.col('air_time') == ''), 'M')
    .when(
        (F.col('air_time') < 20) | (F.col('air_time') > 500), 'I')
)

df_flights = df_flights.withColumn('qa_air_time', qa_air_time_column)

df_flights.show(5)

#### Pergunta 10

In [None]:
# Quality flag for `distance`:
#   'M' when NULL or empty;
#   'I' when below 50 or above 3000.
# Fixes: the upper bound was tested on `air_time` instead of `distance`, and
# the two bounds were joined with AND instead of OR.
qa_distance_column = (
    F.when(
        (F.col('distance').isNull()) |
        (F.col('distance') == ''), 'M')
    .when(
        (F.col('distance') < 50) | (F.col('distance') > 3000), 'I')
)

df_flights = df_flights.withColumn('qa_distance', qa_distance_column)

df_flights.show(5)

#### Pergunta 11

In [None]:
# Quality flag comparing `air_time` against `distance * 0.1`:
#   'M'  when distance or air_time is NULL or empty;
#   'TL' when air_time >= distance * 0.1 + 30;
#   'TS' when air_time <= distance * 0.1 + 10;
#   'TR' when air_time lies between those two bounds.
# Fixes:
#   * a missing `air_time` was not flagged: every comparison evaluated to NULL
#     and the row silently received no flag; it is now reported as 'M';
#   * the 'TR' band joined its two bounds with OR, which holds for any
#     number; a band needs both bounds to hold, so it is now an AND.
air_time_baseline = F.col('distance') * 0.1

qa_distance_airtime_column = (
    F.when(
        (F.col('distance').isNull()) |
        (F.col('distance') == '') |
        (F.col('air_time').isNull()) |
        (F.col('air_time') == ''), 'M')
    .when(
        F.col('air_time') >= (air_time_baseline + 30), 'TL')
    .when(
        F.col('air_time') <= (air_time_baseline + 10), 'TS')
    .when(
        (F.col('air_time') <= (air_time_baseline + 30)) &
        (F.col('air_time') >= (air_time_baseline + 10)), 'TR')
)

df_flights = df_flights.withColumn('qa_distance_airtime', qa_distance_airtime_column)

df_flights.show(5)