# Big Data Project

## Configuration

In [None]:
%%configure -f
{"executorMemory":"8G", "numExecutors":2, "executorCores":3, "conf": {"spark.dynamicAllocation.enabled": "false"}}

In [None]:
//Denys: val bucketname = "unibo-bd2122-dgrushchak"
//Riccardo: val bucketname = "unibo-bd2223-rbacca"

val bucketname = "unibo-bd2223-rbacca"

//Denys: val path_flights_db = "s3a://"+bucketname+"/project/small_file_10000000.txt"
//Riccardo:val path_flights_db = "s3a://"+bucketname+"/bigdata-project/part_1.txt"

val path_flights_db = "s3a://"+bucketname+"/bigdata-project/xaa"

sc.applicationId

"SPARK UI: Enable forwarding of port 20888 and connect to http://localhost:20888/proxy/" + sc.applicationId + "/"

In [118]:
object Parser{
    val commaRegex = ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)"
    val doubleVerticalLine = "\\|\\|"
    
    val pipeRegex = "\\|(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)" // not used
    val quotes = "\"" //not used
   
    
    case class FlightData(
        flightDate: String,          //1
        startingAirport: String,     //2
        destinationAirport: String,  //3
        travelDuration: Long,        //4
        airplaneType: Array[String], //5
        airlineName: Array[String], //6
        isNonStop: Boolean,          //7
        isBasicEconomy: Boolean,     //8
        totalFare: Double,           //9
        segmentsDurationInSeconds: Array[Int]   //10
    ){
        def un() = FlightData.unapply(this).get
    }
    
    def convertTravelDuration(value: String): Long = {
        val input = value.replace("PT", "")
        val minutes = input.replace("M", "").split("H") //[2, 35]
        
        (minutes(0).trim.toLong)*60 + minutes(1).trim.toLong
    }
    
    def convertAirplaneType(value: String): Array[String] = {
        //Ex. Airbus A320||Boeing 737-800
        val result = value.split(doubleVerticalLine)
        if (result.head == "") Array.empty
        else result
    }
    
    def convertAirlineName(value: String): Array[String] = {
        val result = value.split(doubleVerticalLine)
        if (result.head == "") Array.empty
        else result
    }
    
    def convertSegmentsDuration(value: String): Array[Int] = {
        val result = value.split(doubleVerticalLine)
        if (result.head == "") Array.empty
        else result.map(_.toInt)
    }
    
     /**
        2 - flightdate (String) ** x._1
        3 - starting airport (String) ** x._2
        4 - destination airport (String) ** x._3
        6 - travel duration (PT2H35M) ** in minutes x._4
        8 - isBasicEconomy (String) ** x._8
        9 - isrefundable (boolean)
        10 - isNonStop (String) ** x._7
        12 - basefare (prezzo base del ticket in $ Double) ** x._9
        12 - seats remaining (Int)
        20 - airline name (company name) ** x._6
        23 - airplane type (Vector of [String||String] ** x._5
        25 - segment distance (distance in milles: Vector of [Int||Int]  can be [None||None])
        24 - segments duration in seconds
    **/
    
     def parseFlightInformationLine(line: String): Option[FlightData] = {
        try {
            val input = line.split(commaRegex)
            
            val airplanes = convertAirplaneType(input(23))
            val airline = convertAirlineName(input(21))
            val segmentsDuration = convertSegmentsDuration(input(24))

            if (airplanes.isEmpty || airline.isEmpty || segmentsDuration.isEmpty) None
            else Some(FlightData(input(2), input(3), input(4), convertTravelDuration(input(6)), airplanes, airline, input(10).toBoolean, input(8).toBoolean, input(12).toDouble, segmentsDuration))
        } catch {
            case _: Exception => None
        }
    }
}

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

defined object Parser


In [None]:
sc.getPersistentRDDs.foreach(_._2.unpersist()) 
val rddFlights = sc.textFile(path_flights_db).flatMap(Parser.parseFlightInformationLine)
val rddFlightsCached = rddFlights.cache()

In [120]:
rddFlightsCached.map(_.un()).take(10).foreach(println)


def round(v: Double): Double = {
    (v*100).toInt/100.toDouble
}

val numberOfRecords = rddFlightsCached.filter(x => x.isNonStop).count

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

(2022-04-17,ATL,BOS,149,[Ljava.lang.String;@2e1a4f5b,[Ljava.lang.String;@4a582977,true,false,248.6)
(2022-04-17,ATL,BOS,150,[Ljava.lang.String;@4ed2d357,[Ljava.lang.String;@38e2287,true,false,248.6)
(2022-04-17,ATL,BOS,150,[Ljava.lang.String;@7f12d53a,[Ljava.lang.String;@748a483e,true,false,248.6)
(2022-04-17,ATL,BOS,152,[Ljava.lang.String;@2c9a6593,[Ljava.lang.String;@5e72e09d,true,false,248.6)
(2022-04-17,ATL,BOS,154,[Ljava.lang.String;@37a0556c,[Ljava.lang.String;@114d201e,true,false,248.6)
(2022-04-17,ATL,BOS,252,[Ljava.lang.String;@6e046959,[Ljava.lang.String;@27bb0a0c,false,false,251.1)
(2022-04-17,ATL,BOS,318,[Ljava.lang.String;@128530a2,[Ljava.lang.String;@6ab0af3c,false,false,251.1)
(2022-04-17,ATL,BOS,332,[Ljava.lang.String;@27334d92,[Ljava.lang.String;@5827d812,false,false,251.1)
(2022-04-17,ATL,BOS,398,[Ljava.lang.String;@631cf394,[Ljava.lang.String;@c3a7984,false,false,251.1)
(2022-04-17,ATL,BOS,286,[Ljava.lang.String;@3158ea4c,[Ljava.lang.String;@57d9533d,false,false,252.

## Explorative queries

1. How many distinct airports and aircraft models
2. Average travel duration for airline, only in non-stop flights
3. Percentage of basic economy tickets, based on all tickets
4. Percentage of non-stop flights (flights with one leg)
5. Average and price range of tickets
6. Average ticket price for each airline
7. Average and range of travel distance
8. Top 10 airports with more arriving flights


In [None]:
//1.How many distinct airports and aircraft models
val distinctAirports = rddFlightsCached.
    map(x => x.startingAirport).
    distinct.union(
        rddFlightsCached.map(x => x.destinationAirport).distinct).
    distinct.count
val distinctAircraftModels = rddFlightsCached.flatMap(x => x.airplaneType).distinct.count

In [None]:
//2. Average travel duration for each airline, in non stop flights
val distinctAirlines = rddFlightsCached.
    filter(x => x.isNonStop).
    map(x => (x.airlineName(0), x.travelDuration)).
    aggregateByKey((0.0, 0.0))((a,v)=>(a._1+v, a._2+1), (a1,a2)=>(a1._1+a2._1, a1._2+a2._2)).
    map({case(k,v)=>(k,v._1/v._2)}).
    collect().
    foreach({case (airline, value) => println(airline + " => " + round(value) + " avg minutes")})

In [None]:
//3. Percentage of basic economy tickets, based on all tickets
round((rddFlightsCached.filter(_.isBasicEconomy).count.toDouble/numberOfRecords).toDouble*100) + " %"

In [None]:
//4. Percentage of non-stop flights (flights with one leg)
round((rddFlightsCached.filter(_.isNonStop).count.toDouble/numberOfRecords).toDouble*100) + " %"

In [None]:
//5. Average and price range of tickets
val ticketPrices = rddFlightsCached.map(x => x.totalFare) // TODO with all dataset try performance with and without a cache
"Range of prices: "  + ticketPrices.min + " to " + ticketPrices.max
"Avg price: " + round((ticketPrices.sum/numberOfRecords).toDouble)

In [None]:
//6. Average ticket price for each airline
val avgTicketPricePerAirline = rddFlightsCached.
    filter(_.isNonStop).
    map(x => (x.airlineName(0),(x.totalFare, 1))).
    reduceByKey((a,b) => (a._1+b._1, a._2+b._2)).
    map(m => (m._1,m._2._1/m._2._2)).
    collect.foreach({case (name, value) => println(name + " => " + round(value))})

In [None]:
//7. Average and range of travel distance

In [None]:
//8. Top 10 airports with more arriving flights
val topAirports = rddFlightsCached.
    map(x => (x.destinationAirport, 1)).
    reduceByKey(_+_).
    sortBy(_._2, false).
    take(10).
    foreach({case (name, value) => println(name + " => " + value)})

**Query approfondite:**

- Denys: aggrego su “aircraft model” per calcolare la “travel distance” totale percorsa da ogni modello, poi faccio self-join e aggregazione per determinare il “travel duration” per ogni “aircraft model”. Alla fine determino la velocità di ogni modello partendo dai dati aggregati.
- Riccardo: aggrego su “aircraft models” per calcolare la classifica discreta dei modelli più usati rispetto a ciascuna “airline”. Eseguo il join col dataset originale, infine riaggrego su “travel duration” e sulla classificazione di prima. Ottengo la durata di media di ogni volo per ogni compagnia (oltre a numero di voli e totale di ore) sul modello di aereo più utilizzato

In [119]:
// Query approfondita --> Riccardo

val zippedRdd = rddFlightsCached.map(x => (x.airplaneType, x.airlineName)).zipWithIndex()
val aircraft = zippedRdd.map(x => (x._2, x._1._1.zipWithIndex)).flatMapValues(x => x).map(x => ((x._1, x._2._2), x._2._1))
val airlines = zippedRdd.map(x => (x._2, x._1._2.zipWithIndex)).flatMapValues(x => x).map(x => ((x._1, x._2._2), x._2._1)) 

val mostUsedModelForAirline = aircraft.join(airlines).map(x => ((x._2._2, x._2._1), 1)).reduceByKey(_+_).sortBy(s => (s._1._1, s._2), false).take(10).foreach(println)

val joinedWithOriginal = mostUsedModelForAirline.join(rddFlightsCached)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

zippedRdd: org.apache.spark.rdd.RDD[((Array[String], Array[String]), Long)] = ZippedWithIndexRDD[545] at zipWithIndex at <console>:25
aircraft: org.apache.spark.rdd.RDD[((Long, Int), String)] = MapPartitionsRDD[548] at map at <console>:23
airlines: org.apache.spark.rdd.RDD[((Long, Int), String)] = MapPartitionsRDD[551] at map at <console>:23
((United,Embraer 175 (Enhanced Winglets)),970)
((United,Boeing 737-800),581)
((United,Airbus A319),571)
((United,Boeing 737-900),551)
((United,Boeing 737 MAX 9),317)
((United,Airbus A320),298)
((United,Embraer 170),261)
((United,Boeing 737-700),154)
((United,Boeing 757-300),94)
((United,Boeing 757-200),85)
mostUsedModelForAirline: Unit = ()


In [None]:
val path_output = "s3a://"+bucketname+"/spark/avgRatPerMovie" //todo