# Init Spark Context

In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

In [2]:
sparkConfig = SparkConf()

# sparkConfig.set("spark.executor.memory", '8g')
# sparkConfig.set("spark.driver.memory",'8g')
# sparkConfig.set("spark.executor.cores", '4')

spark = SparkSession \
    .builder \
    .appName("pySpark ML") \
    .config(conf=sparkConfig) \
    .config("spark.driver.host", "localhost") \
    .getOrCreate()
spark

# Loading the data

In [3]:
!ls data
# !head data/Airports2.csv

aiport		   Airports2.zip      flights_small.parquet
Airports2.csv	   flights_as_csv     readme.txt
Airports2.parquet  flights_small.csv  traffic_per_months


In [4]:
# # Use the full dataset
# df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("data/Airports2.csv")

# # Save a smaller set of data into some CSV file and parquet file to be used during developement for easy/fast loading
# # Sample 5%
# dfS = df.sample(fraction=0.05, seed=123)
# dfS.printSchema()
# # dfS.coalesce(1).write.parquet('data/flights_as_parquet')
# # dfS.coalesce(1).write.csv('data/flights_as_csv', header=True)

In [5]:
# Load a smaller dataset for smaller iterations
df = spark.read.format('parquet').option("header", "true").option("inferSchema", "true").load("data/flights_small.parquet")

In [6]:
# More info on rows: https://www.kaggle.com/flashgordon/usa-airport-dataset
df.printSchema()

root
 |-- Origin_airport: string (nullable = true)
 |-- Destination_airport: string (nullable = true)
 |-- Origin_city: string (nullable = true)
 |-- Destination_city: string (nullable = true)
 |-- Passengers: integer (nullable = true)
 |-- Seats: integer (nullable = true)
 |-- Flights: integer (nullable = true)
 |-- Distance: integer (nullable = true)
 |-- Fly_date: string (nullable = true)
 |-- Origin_population: integer (nullable = true)
 |-- Destination_population: integer (nullable = true)
 |-- Org_airport_lat: string (nullable = true)
 |-- Org_airport_long: string (nullable = true)
 |-- Dest_airport_lat: string (nullable = true)
 |-- Dest_airport_long: string (nullable = true)



## Define print helper functions

In [7]:
import pandas as pd

def show(df, no=10):  
    return pd.DataFrame(df.take(no), columns=df.columns) 

# Exploratory Data Analysis

In [8]:
show(df)

Unnamed: 0,Origin_airport,Destination_airport,Origin_city,Destination_city,Passengers,Seats,Flights,Distance,Fly_date,Origin_population,Destination_population,Org_airport_lat,Org_airport_long,Dest_airport_lat,Dest_airport_long
0,PDX,RDM,"Portland, OR","Bend, OR",523,962,26,116,1990-07-01,1534762,76034,45.58869934,-122.5979996,44.2541008,-121.1500015
1,LMT,RDM,"Klamath Falls, OR","Bend, OR",176,925,25,147,1990-03-01,57948,76034,42.1561012268066,-121.733001708984,44.2541008,-121.1500015
2,SFO,RDM,"San Francisco, CA","Bend, OR",1080,2490,83,462,1990-02-01,7436126,76034,37.6189994812012,-122.375,44.2541008,-121.1500015
3,EUG,RDM,"Eugene, OR","Bend, OR",68,414,23,103,1991-05-01,287600,80333,44.1245994567871,-123.21199798584,44.2541008,-121.1500015
4,SFO,RDM,"San Francisco, CA","Bend, OR",1013,2700,90,462,1991-11-01,7498756,80333,37.6189994812012,-122.375,44.2541008,-121.1500015
5,LMT,RDM,"Klamath Falls, OR","Bend, OR",181,396,22,147,1991-06-01,58281,80333,42.1561012268066,-121.733001708984,44.2541008,-121.1500015
6,PDX,RDM,"Portland, OR","Bend, OR",10,65,1,116,1991-06-01,1578432,80333,45.58869934,-122.5979996,44.2541008,-121.1500015
7,PDX,RDM,"Portland, OR","Bend, OR",1158,3325,175,116,1992-10-01,1614266,83955,45.58869934,-122.5979996,44.2541008,-121.1500015
8,PDX,RDM,"Portland, OR","Bend, OR",1212,3230,170,116,1992-09-01,1614266,83955,45.58869934,-122.5979996,44.2541008,-121.1500015
9,SFO,RDM,"San Francisco, CA","Bend, OR",1000,1650,55,462,1993-04-01,7635582,87688,37.6189994812012,-122.375,44.2541008,-121.1500015


## Flights profitability 

In [9]:
df1 = df.withColumn('Passengers_Per_Seats', df.Passengers/df.Seats)
show(df1, 5)

Unnamed: 0,Origin_airport,Destination_airport,Origin_city,Destination_city,Passengers,Seats,Flights,Distance,Fly_date,Origin_population,Destination_population,Org_airport_lat,Org_airport_long,Dest_airport_lat,Dest_airport_long,Passengers_Per_Seats
0,PDX,RDM,"Portland, OR","Bend, OR",523,962,26,116,1990-07-01,1534762,76034,45.58869934,-122.5979996,44.2541008,-121.1500015,0.543659
1,LMT,RDM,"Klamath Falls, OR","Bend, OR",176,925,25,147,1990-03-01,57948,76034,42.1561012268066,-121.733001708984,44.2541008,-121.1500015,0.19027
2,SFO,RDM,"San Francisco, CA","Bend, OR",1080,2490,83,462,1990-02-01,7436126,76034,37.6189994812012,-122.375,44.2541008,-121.1500015,0.433735
3,EUG,RDM,"Eugene, OR","Bend, OR",68,414,23,103,1991-05-01,287600,80333,44.1245994567871,-123.21199798584,44.2541008,-121.1500015,0.164251
4,SFO,RDM,"San Francisco, CA","Bend, OR",1013,2700,90,462,1991-11-01,7498756,80333,37.6189994812012,-122.375,44.2541008,-121.1500015,0.375185


In [10]:
#If Pessengers_Per_Seats >= 40% -> Route is profitable
import pyspark.sql.functions as F
df1 = df1.withColumn('Profitable', F.when(F.col('Passengers_Per_Seats')>=0.4, 1).otherwise(0))
#Droping this to be sure it does not get to the trained features
df1 = df1.drop('Passengers_Per_Seats')

show(df1, 5)

Unnamed: 0,Origin_airport,Destination_airport,Origin_city,Destination_city,Passengers,Seats,Flights,Distance,Fly_date,Origin_population,Destination_population,Org_airport_lat,Org_airport_long,Dest_airport_lat,Dest_airport_long,Profitable
0,PDX,RDM,"Portland, OR","Bend, OR",523,962,26,116,1990-07-01,1534762,76034,45.58869934,-122.5979996,44.2541008,-121.1500015,1
1,LMT,RDM,"Klamath Falls, OR","Bend, OR",176,925,25,147,1990-03-01,57948,76034,42.1561012268066,-121.733001708984,44.2541008,-121.1500015,0
2,SFO,RDM,"San Francisco, CA","Bend, OR",1080,2490,83,462,1990-02-01,7436126,76034,37.6189994812012,-122.375,44.2541008,-121.1500015,1
3,EUG,RDM,"Eugene, OR","Bend, OR",68,414,23,103,1991-05-01,287600,80333,44.1245994567871,-123.21199798584,44.2541008,-121.1500015,0
4,SFO,RDM,"San Francisco, CA","Bend, OR",1013,2700,90,462,1991-11-01,7498756,80333,37.6189994812012,-122.375,44.2541008,-121.1500015,0


## Pre-processing

### Handle null values

In [11]:
#For simplicity, drop the record with null values
df1 = df1.na.drop(how='any')
print('No of rows:',df1.count())

No of rows: 180468


## Feature Extraction

### Converting Fly_date column

In [12]:
# Add month since it's more relevant
from pyspark.sql.functions import year, month, dayofmonth
from pyspark.sql import functions as sf
df1 = df1.withColumn('year', year('Fly_date'))
df1 = df1.withColumn('month', month('Fly_date'))
show(df1)

Unnamed: 0,Origin_airport,Destination_airport,Origin_city,Destination_city,Passengers,Seats,Flights,Distance,Fly_date,Origin_population,Destination_population,Org_airport_lat,Org_airport_long,Dest_airport_lat,Dest_airport_long,Profitable,year,month
0,PDX,RDM,"Portland, OR","Bend, OR",523,962,26,116,1990-07-01,1534762,76034,45.58869934,-122.5979996,44.2541008,-121.1500015,1,1990,7
1,LMT,RDM,"Klamath Falls, OR","Bend, OR",176,925,25,147,1990-03-01,57948,76034,42.1561012268066,-121.733001708984,44.2541008,-121.1500015,0,1990,3
2,SFO,RDM,"San Francisco, CA","Bend, OR",1080,2490,83,462,1990-02-01,7436126,76034,37.6189994812012,-122.375,44.2541008,-121.1500015,1,1990,2
3,EUG,RDM,"Eugene, OR","Bend, OR",68,414,23,103,1991-05-01,287600,80333,44.1245994567871,-123.21199798584,44.2541008,-121.1500015,0,1991,5
4,SFO,RDM,"San Francisco, CA","Bend, OR",1013,2700,90,462,1991-11-01,7498756,80333,37.6189994812012,-122.375,44.2541008,-121.1500015,0,1991,11
5,LMT,RDM,"Klamath Falls, OR","Bend, OR",181,396,22,147,1991-06-01,58281,80333,42.1561012268066,-121.733001708984,44.2541008,-121.1500015,1,1991,6
6,PDX,RDM,"Portland, OR","Bend, OR",10,65,1,116,1991-06-01,1578432,80333,45.58869934,-122.5979996,44.2541008,-121.1500015,0,1991,6
7,PDX,RDM,"Portland, OR","Bend, OR",1158,3325,175,116,1992-10-01,1614266,83955,45.58869934,-122.5979996,44.2541008,-121.1500015,0,1992,10
8,PDX,RDM,"Portland, OR","Bend, OR",1212,3230,170,116,1992-09-01,1614266,83955,45.58869934,-122.5979996,44.2541008,-121.1500015,0,1992,9
9,SFO,RDM,"San Francisco, CA","Bend, OR",1000,1650,55,462,1993-04-01,7635582,87688,37.6189994812012,-122.375,44.2541008,-121.1500015,1,1993,4


### Explore the targeted columns

In [13]:
# See the number of items with 1 value
total = df1.count()
withLabelOne = df1.filter(df1.Profitable==1).count()
withLabelZero = df1.filter(df1.Profitable==0).count()
print(withLabelOne, 'out of', total,' are with label 1. Which is approx.', round((withLabelOne/total)*100, 2), "% are with label 1" )
print(withLabelZero, 'out of', total,' are with label 1. Which is approx.', round((withLabelZero/total)*100, 2), "% are with label 1" )

139668 out of 180468  are with label 1. Which is approx. 77.39 % are with label 1
40800 out of 180468  are with label 1. Which is approx. 22.61 % are with label 1


In [14]:
show(df1.sort(F.col('Profitable').asc()))

Unnamed: 0,Origin_airport,Destination_airport,Origin_city,Destination_city,Passengers,Seats,Flights,Distance,Fly_date,Origin_population,Destination_population,Org_airport_lat,Org_airport_long,Dest_airport_lat,Dest_airport_long,Profitable,year,month
0,LMT,RDM,"Klamath Falls, OR","Bend, OR",146,888,24,147,1993-11-01,59602,87688,42.1561012268066,-121.733001708984,44.2541008,-121.1500015,0,1993,11
1,FAT,RDM,"Fresno, CA","Bend, OR",0,30,1,521,2009-02-01,915267,158629,36.7761993408203,-119.718002319336,44.2541008,-121.1500015,0,2009,2
2,LMT,RDM,"Klamath Falls, OR","Bend, OR",173,814,22,147,1995-03-01,61140,94869,42.1561012268066,-121.733001708984,44.2541008,-121.1500015,0,1995,3
3,LMT,RDM,"Klamath Falls, OR","Bend, OR",176,925,25,147,1990-03-01,57948,76034,42.1561012268066,-121.733001708984,44.2541008,-121.1500015,0,1990,3
4,LMT,RDM,"Klamath Falls, OR","Bend, OR",14,60,2,147,1995-02-01,61140,94869,42.1561012268066,-121.733001708984,44.2541008,-121.1500015,0,1995,2
5,EUG,RDM,"Eugene, OR","Bend, OR",68,414,23,103,1991-05-01,287600,80333,44.1245994567871,-123.21199798584,44.2541008,-121.1500015,0,1991,5
6,SEA,RDM,"Seattle, WA","Bend, OR",0,0,0,228,1997-01-01,5727624,101660,47.4490013122559,-122.30899810791,44.2541008,-121.1500015,0,1997,1
7,PDX,RDM,"Portland, OR","Bend, OR",0,0,22,116,2004-06-01,2052776,134080,45.58869934,-122.5979996,44.2541008,-121.1500015,0,2004,6
8,PDX,RDM,"Portland, OR","Bend, OR",0,0,21,116,2007-04-01,2163577,153405,45.58869934,-122.5979996,44.2541008,-121.1500015,0,2007,4
9,PDX,RDM,"Portland, OR","Bend, OR",1158,3325,175,116,1992-10-01,1614266,83955,45.58869934,-122.5979996,44.2541008,-121.1500015,0,1992,10


### Summary statistics for numeric variables

In [15]:
numeric_features = [t[0] for t in df1.dtypes if t[1] == 'int']
df1.select(numeric_features).describe().toPandas()

Unnamed: 0,summary,Passengers,Seats,Flights,Distance,Origin_population,Destination_population,Profitable,year,month
0,count,180468.0,180468.0,180468.0,180468.0,180468.0,180468.0,180468.0,180468.0,180468.0
1,mean,2693.4666367444643,4058.361654143671,37.24090143404925,697.7931045947204,5884765.915780083,5883671.098915043,0.773921138373562,2000.5619223352617,6.552397100871069
2,stddev,4366.102594127457,6236.863210797221,49.72991825644379,606.4125026257003,7882220.889765966,7869409.237389793,0.4182919787383057,5.747517415875842,3.462944537112544
3,min,0.0,0.0,0.0,0.0,13005.0,13005.0,0.0,1990.0,1.0
4,max,85759.0,142058.0,1022.0,4983.0,38139592.0,38139592.0,1.0,2009.0,12.0


### Handle Categorical features

In [16]:
show(df1)

Unnamed: 0,Origin_airport,Destination_airport,Origin_city,Destination_city,Passengers,Seats,Flights,Distance,Fly_date,Origin_population,Destination_population,Org_airport_lat,Org_airport_long,Dest_airport_lat,Dest_airport_long,Profitable,year,month
0,PDX,RDM,"Portland, OR","Bend, OR",523,962,26,116,1990-07-01,1534762,76034,45.58869934,-122.5979996,44.2541008,-121.1500015,1,1990,7
1,LMT,RDM,"Klamath Falls, OR","Bend, OR",176,925,25,147,1990-03-01,57948,76034,42.1561012268066,-121.733001708984,44.2541008,-121.1500015,0,1990,3
2,SFO,RDM,"San Francisco, CA","Bend, OR",1080,2490,83,462,1990-02-01,7436126,76034,37.6189994812012,-122.375,44.2541008,-121.1500015,1,1990,2
3,EUG,RDM,"Eugene, OR","Bend, OR",68,414,23,103,1991-05-01,287600,80333,44.1245994567871,-123.21199798584,44.2541008,-121.1500015,0,1991,5
4,SFO,RDM,"San Francisco, CA","Bend, OR",1013,2700,90,462,1991-11-01,7498756,80333,37.6189994812012,-122.375,44.2541008,-121.1500015,0,1991,11
5,LMT,RDM,"Klamath Falls, OR","Bend, OR",181,396,22,147,1991-06-01,58281,80333,42.1561012268066,-121.733001708984,44.2541008,-121.1500015,1,1991,6
6,PDX,RDM,"Portland, OR","Bend, OR",10,65,1,116,1991-06-01,1578432,80333,45.58869934,-122.5979996,44.2541008,-121.1500015,0,1991,6
7,PDX,RDM,"Portland, OR","Bend, OR",1158,3325,175,116,1992-10-01,1614266,83955,45.58869934,-122.5979996,44.2541008,-121.1500015,0,1992,10
8,PDX,RDM,"Portland, OR","Bend, OR",1212,3230,170,116,1992-09-01,1614266,83955,45.58869934,-122.5979996,44.2541008,-121.1500015,0,1992,9
9,SFO,RDM,"San Francisco, CA","Bend, OR",1000,1650,55,462,1993-04-01,7635582,87688,37.6189994812012,-122.375,44.2541008,-121.1500015,1,1993,4


#### Transform Origin_airport/Destination_airport into numbers

In [17]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml import Pipeline

indexerOriginAirport = StringIndexer(inputCol="Origin_airport", outputCol="Origin_airport_index").setHandleInvalid("keep")
indexerDestinationAirport = StringIndexer(inputCol="Destination_airport", outputCol="Destination_airport_index").setHandleInvalid("keep")

onehot = OneHotEncoder(inputCols=[indexerOriginAirport.getOutputCol(), indexerOriginAirport.getOutputCol()],
                                 outputCols=['OriginAirport_enc', 'DestimationAirport_enc']).setHandleInvalid("keep")

# pipeline = Pipeline(stages=[indexerSourceLanguage, indexerTargetLanguage])
pipeline = Pipeline(stages=[indexerOriginAirport, indexerDestinationAirport, onehot])

In [18]:
df1 = pipeline.fit(df1).transform(df1)

In [19]:
show(df1, 5)

Unnamed: 0,Origin_airport,Destination_airport,Origin_city,Destination_city,Passengers,Seats,Flights,Distance,Fly_date,Origin_population,...,Org_airport_long,Dest_airport_lat,Dest_airport_long,Profitable,year,month,Origin_airport_index,Destination_airport_index,OriginAirport_enc,DestimationAirport_enc
0,PDX,RDM,"Portland, OR","Bend, OR",523,962,26,116,1990-07-01,1534762,...,-122.5979996,44.2541008,-121.1500015,1,1990,7,34.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,LMT,RDM,"Klamath Falls, OR","Bend, OR",176,925,25,147,1990-03-01,57948,...,-121.733001708984,44.2541008,-121.1500015,0,1990,3,200.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,SFO,RDM,"San Francisco, CA","Bend, OR",1080,2490,83,462,1990-02-01,7436126,...,-122.375,44.2541008,-121.1500015,1,1990,2,18.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,EUG,RDM,"Eugene, OR","Bend, OR",68,414,23,103,1991-05-01,287600,...,-123.21199798584,44.2541008,-121.1500015,0,1991,5,139.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,SFO,RDM,"San Francisco, CA","Bend, OR",1013,2700,90,462,1991-11-01,7498756,...,-122.375,44.2541008,-121.1500015,0,1991,11,18.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [20]:
df2 = df1.drop('Origin_airport', 'Destination_airport')
show(df2, 5)

Unnamed: 0,Origin_city,Destination_city,Passengers,Seats,Flights,Distance,Fly_date,Origin_population,Destination_population,Org_airport_lat,Org_airport_long,Dest_airport_lat,Dest_airport_long,Profitable,year,month,Origin_airport_index,Destination_airport_index,OriginAirport_enc,DestimationAirport_enc
0,"Portland, OR","Bend, OR",523,962,26,116,1990-07-01,1534762,76034,45.58869934,-122.5979996,44.2541008,-121.1500015,1,1990,7,34.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"Klamath Falls, OR","Bend, OR",176,925,25,147,1990-03-01,57948,76034,42.1561012268066,-121.733001708984,44.2541008,-121.1500015,0,1990,3,200.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"San Francisco, CA","Bend, OR",1080,2490,83,462,1990-02-01,7436126,76034,37.6189994812012,-122.375,44.2541008,-121.1500015,1,1990,2,18.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"Eugene, OR","Bend, OR",68,414,23,103,1991-05-01,287600,80333,44.1245994567871,-123.21199798584,44.2541008,-121.1500015,0,1991,5,139.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"San Francisco, CA","Bend, OR",1013,2700,90,462,1991-11-01,7498756,80333,37.6189994812012,-122.375,44.2541008,-121.1500015,0,1991,11,18.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## Feature Selection

### Drop some irelevant columns

In [21]:
#Drop also City's informations
df3 = df2.drop('Origin_city', 'Destination_city', 'Distance')
df3 = df3.drop('Org_airport_lat','Org_airport_long', 'Dest_airport_lat', 'Dest_airport_long')
df3 = df3.drop('Origin_population', 'Destination_population')

#Drop fly_date
df3 = df3.drop('Fly_date')

#Drop Passengers since we will not know them at prediction time
df3 = df3.drop('Passengers')

show(df3, 5)

Unnamed: 0,Seats,Flights,Profitable,year,month,Origin_airport_index,Destination_airport_index,OriginAirport_enc,DestimationAirport_enc
0,962,26,1,1990,7,34.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,925,25,0,1990,3,200.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2490,83,1,1990,2,18.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,414,23,0,1991,5,139.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,2700,90,0,1991,11,18.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### Name label column properly

In [22]:
df4 = df3.withColumnRenamed('Profitable', 'label')
show(df4, 5)

Unnamed: 0,Seats,Flights,label,year,month,Origin_airport_index,Destination_airport_index,OriginAirport_enc,DestimationAirport_enc
0,962,26,1,1990,7,34.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,925,25,0,1990,3,200.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2490,83,1,1990,2,18.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,414,23,0,1991,5,139.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,2700,90,0,1991,11,18.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## Start assembling the features and scale them

In [23]:
#Spark ML’s algorithms expect the data to be represented in two columns: Features and Labels.
feature_columns =  df4.columns
feature_columns.remove('label') # here we omit the label column and the OneHotEnc values
feature_columns.remove('OriginAirport_enc') # We avoid scaling the OHE column
feature_columns.remove('DestimationAirport_enc')  # We avoid scaling the OHE column


from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=feature_columns,outputCol="features1")
df4 = assembler.transform(df4)

from pyspark.ml.feature import VectorAssembler
assemblerL = VectorAssembler(inputCols=['label'],outputCol="label1")
df4 = assemblerL.transform(df4)
show(df4, 5)

Unnamed: 0,Seats,Flights,label,year,month,Origin_airport_index,Destination_airport_index,OriginAirport_enc,DestimationAirport_enc,features1,label1
0,962,26,1,1990,7,34.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[962.0, 26.0, 1990.0, 7.0, 34.0, 180.0]",[1.0]
1,925,25,0,1990,3,200.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[925.0, 25.0, 1990.0, 3.0, 200.0, 180.0]",[0.0]
2,2490,83,1,1990,2,18.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2490.0, 83.0, 1990.0, 2.0, 18.0, 180.0]",[1.0]
3,414,23,0,1991,5,139.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[414.0, 23.0, 1991.0, 5.0, 139.0, 180.0]",[0.0]
4,2700,90,0,1991,11,18.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2700.0, 90.0, 1991.0, 11.0, 18.0, 180.0]",[0.0]


In [24]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol="features1", outputCol="features2", withStd=True, withMean=False)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(df4)
# Normalize each feature to have unit standard deviation.
df4 = scalerModel.transform(df4)
show(df4, 5)

Unnamed: 0,Seats,Flights,label,year,month,Origin_airport_index,Destination_airport_index,OriginAirport_enc,DestimationAirport_enc,features1,label1,features2
0,962,26,1,1990,7,34.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[962.0, 26.0, 1990.0, 7.0, 34.0, 180.0]",[1.0],"[0.15424420377451695, 0.5228241049165842, 346...."
1,925,25,0,1990,3,200.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[925.0, 25.0, 1990.0, 3.0, 200.0, 180.0]",[0.0],"[0.148311734398574, 0.5027154854967155, 346.23..."
2,2490,83,1,1990,2,18.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2490.0, 83.0, 1990.0, 2.0, 18.0, 180.0]",[1.0],"[0.39923915529994514, 1.6690154118490954, 346...."
3,414,23,0,1991,5,139.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[414.0, 23.0, 1991.0, 5.0, 139.0, 180.0]",[0.0],"[0.0663795222064969, 0.46249824665697825, 346...."
4,2700,90,0,1991,11,18.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2700.0, 90.0, 1991.0, 11.0, 18.0, 180.0]",[0.0],"[0.43290992743367546, 1.8097757477881757, 346...."


In [25]:
## Add the two other columns to the features
final_feature_columns = ['features2', 'OriginAirport_enc', 'DestimationAirport_enc']

from pyspark.ml.feature import VectorAssembler
assemblerF = VectorAssembler(inputCols=final_feature_columns,outputCol="features")
df5 = assemblerF.transform(df4)

show(df5, 5)

Unnamed: 0,Seats,Flights,label,year,month,Origin_airport_index,Destination_airport_index,OriginAirport_enc,DestimationAirport_enc,features1,label1,features2,features
0,962,26,1,1990,7,34.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[962.0, 26.0, 1990.0, 7.0, 34.0, 180.0]",[1.0],"[0.15424420377451695, 0.5228241049165842, 346....","(0.15424420377451695, 0.5228241049165842, 346...."
1,925,25,0,1990,3,200.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[925.0, 25.0, 1990.0, 3.0, 200.0, 180.0]",[0.0],"[0.148311734398574, 0.5027154854967155, 346.23...","(0.148311734398574, 0.5027154854967155, 346.23..."
2,2490,83,1,1990,2,18.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2490.0, 83.0, 1990.0, 2.0, 18.0, 180.0]",[1.0],"[0.39923915529994514, 1.6690154118490954, 346....","(0.39923915529994514, 1.6690154118490954, 346...."
3,414,23,0,1991,5,139.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[414.0, 23.0, 1991.0, 5.0, 139.0, 180.0]",[0.0],"[0.0663795222064969, 0.46249824665697825, 346....","(0.0663795222064969, 0.46249824665697825, 346...."
4,2700,90,0,1991,11,18.0,180.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[2700.0, 90.0, 1991.0, 11.0, 18.0, 180.0]",[0.0],"[0.43290992743367546, 1.8097757477881757, 346....","(0.43290992743367546, 1.8097757477881757, 346...."


In [26]:
df6 = df5.select('label', 'features')
show(df6)

Unnamed: 0,label,features
0,1,"(0.15424420377451695, 0.5228241049165842, 346...."
1,0,"(0.148311734398574, 0.5027154854967155, 346.23..."
2,1,"(0.39923915529994514, 1.6690154118490954, 346...."
3,0,"(0.0663795222064969, 0.46249824665697825, 346...."
4,0,"(0.43290992743367546, 1.8097757477881757, 346...."
5,1,"(0.06349345602360573, 0.4423896272371096, 346...."
6,0,"(0.010421905660440335, 0.02010861941986862, 34..."
7,0,"(0.5331205587840633, 3.5190083984770086, 346.5..."
8,0,"(0.5178885428188044, 3.4184653013776654, 346.5..."
9,1,"(0.26455606676502386, 1.105974068092774, 346.7..."


# Split in training and test data

In [27]:
train, test = df6.randomSplit([0.7, 0.3], seed = 123)
#Caching the train df since we will do multiple iterations on it
train.cache()
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 126208
Test Dataset Count: 54260


In [28]:
train.groupby('label').agg({'label': 'count'}).show()
test.groupby('label').agg({'label': 'count'}).show()

+-----+------------+
|label|count(label)|
+-----+------------+
|    1|       97768|
|    0|       28440|
+-----+------------+

+-----+------------+
|label|count(label)|
+-----+------------+
|    1|       41900|
|    0|       12360|
+-----+------------+



# Build the model

## Decision Tree

In [29]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import DecisionTreeClassificationModel
decision_tree = DecisionTreeClassifier(featuresCol="features", labelCol="label", maxDepth=4, seed=1234)
dtModel = decision_tree.fit(train)
predictions = dtModel.transform(test)

In [30]:
# test our model and make predictions using testing data
predictions = dtModel.transform(test)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")

accuracy = evaluator.evaluate(predictions)
print("Test Error: %g " % (1.0 - accuracy))
print("Accuracy: %g " % accuracy)

# First Run results
# Test Error: 0.132916 
# Accuracy: 0.867084 

Test Error: 0.131994 
Accuracy: 0.868006 


In [31]:
print(dtModel.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_2e4c6fd34d6f, depth=4, numNodes=13, numClasses=2, numFeatures=1026
  If (feature 0 <= 2.405055152409308E-4)
   If (feature 0 <= 8.016850508031027E-5)
    Predict: 0.0
   Else (feature 0 > 8.016850508031027E-5)
    If (feature 1 <= 0.0904887873894088)
     Predict: 0.0
    Else (feature 1 > 0.0904887873894088)
     Predict: 1.0
  Else (feature 0 > 2.405055152409308E-4)
   If (feature 0 <= 0.1546450462999185)
    If (feature 4 <= 3.3484464470924014)
     Predict: 1.0
    Else (feature 4 > 3.3484464470924014)
     If (feature 1 <= 0.33179222042783224)
      Predict: 1.0
     Else (feature 1 > 0.33179222042783224)
      Predict: 0.0
   Else (feature 0 > 0.1546450462999185)
    Predict: 1.0



In [32]:
show(predictions)

Unnamed: 0,label,features,rawPrediction,probability,prediction
0,0,"(0.004810110304818616, 0.02010861941986862, 34...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0
1,0,"(0.004810110304818616, 0.02010861941986862, 34...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0
2,0,"(0.008016850508031026, 0.02010861941986862, 34...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0
3,0,"(0.008016850508031026, 0.02010861941986862, 34...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0
4,0,"(0.008016850508031026, 0.02010861941986862, 34...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0
5,0,"(0.008016850508031026, 0.02010861941986862, 34...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0
6,0,"(0.008016850508031026, 0.02010861941986862, 34...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0
7,0,"(0.00897887256899475, 0.02010861941986862, 349...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0
8,0,"(0.010582242670600955, 0.02010861941986862, 34...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0
9,0,"(0.010902916690922197, 0.02010861941986862, 34...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0


In [33]:
show(predictions.filter(predictions.label!=predictions.prediction))

Unnamed: 0,label,features,rawPrediction,probability,prediction
0,0,"(0.004810110304818616, 0.02010861941986862, 34...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0
1,0,"(0.004810110304818616, 0.02010861941986862, 34...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0
2,0,"(0.008016850508031026, 0.02010861941986862, 34...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0
3,0,"(0.008016850508031026, 0.02010861941986862, 34...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0
4,0,"(0.008016850508031026, 0.02010861941986862, 34...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0
5,0,"(0.008016850508031026, 0.02010861941986862, 34...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0
6,0,"(0.008016850508031026, 0.02010861941986862, 34...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0
7,0,"(0.00897887256899475, 0.02010861941986862, 349...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0
8,0,"(0.010582242670600955, 0.02010861941986862, 34...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0
9,0,"(0.010902916690922197, 0.02010861941986862, 34...","[8756.0, 30335.0]","[0.22399017676703079, 0.7760098232329692]",1.0


## Save and reload the model

In [34]:
# Save and load model
# modelFileName = "target/myModel_v_0_1"
# dtModel.save(modelFileName)
# sameModel = DecisionTreeClassificationModel.load(modelFileName)

## Random Forest Classifier

In [35]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label', numTrees=1, maxDepth=1)
rfModel = rf.fit(train)
predictions = rfModel.transform(test)

In [36]:
evaluation_summary = rfModel.evaluate(test)
print('Accuracy:', evaluation_summary.accuracy)
# Precision quantifies the number of positive class predictions that actually belong to the positive class. 
# Recall quantifies the number of positive class predictions made out of all positive examples in the dataset. 
# F-Measure provides a single score that balances both the concerns of precision and recall in one number.
print('Precision:',evaluation_summary.precisionByLabel)
print('Recall:', evaluation_summary.recallByLabel)
print("F1 score:", evaluation_summary.fMeasureByLabel())

Accuracy: 0.8671028381865094
Precision: [0.9998058629392351, 0.8531837341424179]
Recall: [0.4166666666666667, 0.9999761336515514]
F1 score: [0.5882017017874479, 0.9207660780801898]


In [37]:
show(predictions.filter(predictions.label!=predictions.prediction))

Unnamed: 0,label,features,rawPrediction,probability,prediction
0,0,"(0.004810110304818616, 0.02010861941986862, 34...","[0.14546378742200097, 0.854536212577999]","[0.14546378742200097, 0.854536212577999]",1.0
1,0,"(0.004810110304818616, 0.02010861941986862, 34...","[0.14546378742200097, 0.854536212577999]","[0.14546378742200097, 0.854536212577999]",1.0
2,0,"(0.008016850508031026, 0.02010861941986862, 34...","[0.14546378742200097, 0.854536212577999]","[0.14546378742200097, 0.854536212577999]",1.0
3,0,"(0.008016850508031026, 0.02010861941986862, 34...","[0.14546378742200097, 0.854536212577999]","[0.14546378742200097, 0.854536212577999]",1.0
4,0,"(0.008016850508031026, 0.02010861941986862, 34...","[0.14546378742200097, 0.854536212577999]","[0.14546378742200097, 0.854536212577999]",1.0
5,0,"(0.008016850508031026, 0.02010861941986862, 34...","[0.14546378742200097, 0.854536212577999]","[0.14546378742200097, 0.854536212577999]",1.0
6,0,"(0.008016850508031026, 0.02010861941986862, 34...","[0.14546378742200097, 0.854536212577999]","[0.14546378742200097, 0.854536212577999]",1.0
7,0,"(0.00897887256899475, 0.02010861941986862, 349...","[0.14546378742200097, 0.854536212577999]","[0.14546378742200097, 0.854536212577999]",1.0
8,0,"(0.010582242670600955, 0.02010861941986862, 34...","[0.14546378742200097, 0.854536212577999]","[0.14546378742200097, 0.854536212577999]",1.0
9,0,"(0.010902916690922197, 0.02010861941986862, 34...","[0.14546378742200097, 0.854536212577999]","[0.14546378742200097, 0.854536212577999]",1.0


In [38]:
show(predictions.filter(predictions.prediction==0))

Unnamed: 0,label,features,rawPrediction,probability,prediction
0,0,"(0.00016033701016062053, 0.02010861941986862, ...","[0.9994936281542747, 0.0005063718457253777]","[0.9994936281542747, 0.0005063718457253777]",0.0
1,0,"(0.00016033701016062053, 0.02010861941986862, ...","[0.9994936281542747, 0.0005063718457253777]","[0.9994936281542747, 0.0005063718457253777]",0.0
2,0,"(0.00032067402032124107, 0.04021723883973724, ...","[0.9994936281542747, 0.0005063718457253777]","[0.9994936281542747, 0.0005063718457253777]",0.0
3,0,"(0.00016033701016062053, 0.02010861941986862, ...","[0.9994936281542747, 0.0005063718457253777]","[0.9994936281542747, 0.0005063718457253777]",0.0
4,0,"(0.00032067402032124107, 0.04021723883973724, ...","[0.9994936281542747, 0.0005063718457253777]","[0.9994936281542747, 0.0005063718457253777]",0.0
5,0,"(0.00032067402032124107, 0.04021723883973724, ...","[0.9994936281542747, 0.0005063718457253777]","[0.9994936281542747, 0.0005063718457253777]",0.0
6,0,"(0.00016033701016062053, 0.02010861941986862, ...","[0.9994936281542747, 0.0005063718457253777]","[0.9994936281542747, 0.0005063718457253777]",0.0
7,0,"(0.00016033701016062053, 0.02010861941986862, ...","[0.9994936281542747, 0.0005063718457253777]","[0.9994936281542747, 0.0005063718457253777]",0.0
8,0,"(0.0, 0.02010861941986862, 347.62835071732115,...","[0.9994936281542747, 0.0005063718457253777]","[0.9994936281542747, 0.0005063718457253777]",0.0
9,0,"(0.0, 0.02010861941986862, 347.80233888084337,...","[0.9994936281542747, 0.0005063718457253777]","[0.9994936281542747, 0.0005063718457253777]",0.0


## Gradient-Boosted Tree Classifier

In [39]:
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(maxIter=10)
gbtModel = gbt.fit(train)
predictions = gbtModel.transform(test)

In [40]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")

accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))
print("Accuracy = %g " % accuracy)

Test Error = 0.129819 
Accuracy = 0.870181 


In [41]:
show(predictions.filter(predictions.label==1))

Unnamed: 0,label,features,rawPrediction,probability,prediction
0,1,"(0.004810110304818616, 0.02010861941986862, 34...","[-0.537855921002071, 0.537855921002071]","[0.2543183681674676, 0.7456816318325323]",1.0
1,1,"(0.004810110304818616, 0.02010861941986862, 34...","[-0.483434467513656, 0.483434467513656]","[0.2755050295709874, 0.7244949704290127]",1.0
2,1,"(0.004810110304818616, 0.02010861941986862, 34...","[-0.5006025290880693, 0.5006025290880693]","[0.2687045585328382, 0.7312954414671617]",1.0
3,1,"(0.004810110304818616, 0.02010861941986862, 34...","[-0.483434467513656, 0.483434467513656]","[0.2755050295709874, 0.7244949704290127]",1.0
4,1,"(0.00593246937594296, 0.02010861941986862, 347...","[-0.537855921002071, 0.537855921002071]","[0.2543183681674676, 0.7456816318325323]",1.0
5,1,"(0.00593246937594296, 0.02010861941986862, 348...","[-0.6274602371628, 0.6274602371628]","[0.22184954347357105, 0.778150456526429]",1.0
6,1,"(0.006413480406424822, 0.02010861941986862, 34...","[-0.5006025290880693, 0.5006025290880693]","[0.2687045585328382, 0.7312954414671617]",1.0
7,1,"(0.006413480406424822, 0.02010861941986862, 34...","[-0.5006025290880693, 0.5006025290880693]","[0.2687045585328382, 0.7312954414671617]",1.0
8,1,"(0.0073755024673885445, 0.02010861941986862, 3...","[-0.537855921002071, 0.537855921002071]","[0.2543183681674676, 0.7456816318325323]",1.0
9,1,"(0.008016850508031026, 0.02010861941986862, 34...","[-0.537855921002071, 0.537855921002071]","[0.2543183681674676, 0.7456816318325323]",1.0


# Individual work

1. Add the Origin_city, Destination_city, Distance in the features and see if the accuracy of the model increses
2. Add the Origin_population, Destination_population in the features and see if the accuracy of the model increses