# Demo Pipeline Linear Regression

### Dataset: flights.csv
- You'll build a regression model to predict flight duration.
- The predictors are `dow`, `org`, and `mile`.

The first thing to do is start a Spark session.

In [None]:
# Call findspark.init() before pyspark is imported: per the findspark docs it
# locates the local Spark installation and adds its pyspark to sys.path.
import findspark
findspark.init()

In [None]:
import pyspark

In [None]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

In [None]:
# Create the SparkSession for this demo, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('lr_demo')
    .getOrCreate()
)

In [None]:
# Use Spark to read in the flights csv file.
# Missing values in this file are written as the literal text 'NA'. Passing
# nullValue="NA" loads them as real nulls, so schema inference can type 'delay'
# numerically and the IS NULL checks / na.drop() in later cells can catch them.
data = spark.read.csv("flights.csv", inferSchema=True, header=True, nullValue="NA")

In [None]:
# Print the schema of the DataFrame (column names, inferred types, nullability)
data.printSchema()

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- org: string (nullable = true)
 |-- mile: integer (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: string (nullable = true)



In [None]:
# Preview the first three rows
data.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 3 rows



In [None]:
# Return the first row as a Row object
data.head()

Row(mon=11, dom=20, dow=6, carrier='US', flight=19, org='JFK', mile=2153, depart=9.48, duration=351, delay='NA')

In [None]:
# for item in data.head():
#     print(item)

In [None]:
# Number of rows in the DataFrame
data.count()

50000

In [None]:
# Remove the 'flight' column (drop() returns a new DataFrame, hence the reassignment)
data = data.drop('flight')

In [None]:
# Number of records with missing 'delay' values.
# NOTE(review): IS NULL only matches real nulls. If 'delay' was loaded as a string
# column in which missing values are the text 'NA', those rows are not counted here.
data.filter('delay IS NULL').count()

0

In [None]:
# Remove records with missing 'delay' values.
# NOTE(review): IS NOT NULL only removes real nulls. Rows whose 'delay' is the
# text 'NA' (when the column was loaded as a string) pass through this filter.
data = data.filter('delay IS NOT NULL')

In [None]:
# Discard every row that has a null in any column, then show how many rows remain.
# dropna() is the DataFrame-level alias of na.drop().
data = data.dropna()
data.count()

50000

In [None]:
# Import the required function.
# NOTE: this rebinds the name `round` to the Spark column function, so Python's
# built-in round() is shadowed in every cell that runs after this one.
from pyspark.sql.functions import round

In [None]:
# Convert 'mile' to 'km' (1 mile = 1.60934 km), rounded to 0 decimal places.
# The 'mile' column is not dropped here; only the new 'km' column is added.
data = data.withColumn('km', round(data.mile * 1.60934, 0))

In [None]:
# Flag each flight as delayed (1) when 'delay' is at least 15, otherwise 0.
# Rows where the comparison evaluates to null get a null label.
is_delayed = data['delay'] >= 15
data = data.withColumn('label', is_delayed.cast('integer'))
# Preview the first five records
data.show(5)

+---+---+---+-------+---+----+------+--------+-----+------+-----+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|    km|label|
+---+---+---+-------+---+----+------+--------+-----+------+-----+
| 11| 20|  6|     US|JFK|2153|  9.48|     351|   NA|3465.0| null|
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8| 542.0|    0|
|  9| 13|  1|     AA|ORD|1236| 10.33|     195|   -5|1989.0|    0|
|  4|  2|  5|     AA|ORD| 258|  8.92|      65|   NA| 415.0| null|
+---+---+---+-------+---+----+------+--------+-----+------+-----+
only showing top 5 rows



In [None]:
# Drop any rows that still contain a null (e.g. a null 'label') and report the
# row count before and after. The counts are printed explicitly because a bare
# expression in the middle of a cell is evaluated but never displayed.
final_data = data.na.drop()
print('Rows before dropping nulls:', data.count())
print('Rows after dropping nulls:', final_data.count())

final_data.show(5)

+---+---+---+-------+---+----+------+--------+-----+------+-----+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|    km|label|
+---+---+---+-------+---+----+------+--------+-----+------+-----+
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8| 542.0|    0|
|  9| 13|  1|     AA|ORD|1236| 10.33|     195|   -5|1989.0|    0|
|  5|  2|  1|     UA|SFO| 550|  7.98|     102|    2| 885.0|    0|
|  7|  2|  6|     AA|ORD| 733| 10.83|     135|   54|1180.0|    1|
+---+---+---+-------+---+----+------+--------+-----+------+-----+
only showing top 5 rows



# Thực hiện Pipeline
- ...