In [1]:
import pyspark
from pyspark.sql import SparkSession

import pandas as pd
import numpy as np

In [2]:
spark = SparkSession.builder.master('local[*]').appName('flights').getOrCreate()

# Read data from CSV file
flights = spark.read.csv('./flights.csv', sep=',', header=True, inferSchema=True,
                         nullValue='NA')

# Get number of records
print("The data contain %d records." % flights.count())

# View the first five records
flights.show(5)

# Check column data types
print(flights.printSchema())
print(flights.dtypes)

The data contain 50000 records.
+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| NULL|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| NULL|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- org: string (nullable = true)
 |-- mile: integer (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: integer (nullable = true)

None
[('mon', 'int'), 

In [3]:
from pyspark.sql.functions import round

# Convert 'mile' to 'km' and drop 'mile' column
flights = flights.withColumn('km', round(flights.mile * 1.60934, 0)).drop('mile')

In [4]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression

# Convert categorical strings to index values
indexer = StringIndexer(inputCol='org', outputCol='org_idx')

# One-hot encode index values
onehot = OneHotEncoder(
    inputCols=['org_idx', 'dow'],
    outputCols=['org_dummy', 'dow_dummy']
)

# Assemble predictors into a single column
assembler = VectorAssembler(inputCols=['km', 'org_dummy', 'dow_dummy'], outputCol='features')

# A linear regression object
regression = LinearRegression(labelCol='duration')

In [5]:
from pyspark.ml import Pipeline

flights_train, flights_test = flights.randomSplit([0.8, 0.2])

# Construct a pipeline
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# Train the pipeline on the training data
pipeline = pipeline.fit(flights_train)

# Make predictions on the test data
predictions = pipeline.transform(flights_test)

In [6]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Specify column names and types
schema = StructType([
    StructField("id", IntegerType()),
    StructField("text", StringType()),
    StructField("label", IntegerType())
])

# Read data from CSV file
sms = spark.read.csv('./sms.csv', sep=';', header=False, schema=schema, nullValue='NA')

sms.show(5)

+---+--------------------+-----+
| id|                text|label|
+---+--------------------+-----+
|  1|Sorry, I'll call ...|    0|
|  2|Dont worry. I gue...|    0|
|  3|Call FREEPHONE 08...|    1|
|  4|Win a 1000 cash p...|    1|
|  5|Go until jurong p...|    0|
+---+--------------------+-----+
only showing top 5 rows



In [7]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol='hash')
idf = IDF(inputCol=hasher.getOutputCol(), outputCol='features')

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])

In [8]:
assembler = VectorAssembler(inputCols=['km'], outputCol='features')

flights = assembler.transform(flights.drop('features'))

flights.show(5)

+---+---+---+-------+------+---+------+--------+-----+------+--------+
|mon|dom|dow|carrier|flight|org|depart|duration|delay|    km|features|
+---+---+---+-------+------+---+------+--------+-----+------+--------+
| 11| 20|  6|     US|    19|JFK|  9.48|     351| NULL|3465.0|[3465.0]|
|  0| 22|  2|     UA|  1107|ORD| 16.33|      82|   30| 509.0| [509.0]|
|  2| 20|  4|     UA|   226|SFO|  6.17|      82|   -8| 542.0| [542.0]|
|  9| 13|  1|     AA|   419|ORD| 10.33|     195|   -5|1989.0|[1989.0]|
|  4|  2|  5|     AA|   325|ORD|  8.92|      65| NULL| 415.0| [415.0]|
+---+---+---+-------+------+---+------+--------+-----+------+--------+
only showing top 5 rows



In [10]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

flights_train, flights_test = flights.randomSplit([0.8, 0.2])

# Create an empty parameter grid
params = ParamGridBuilder().build()

# Create objects for building and evaluating a regression model
regression = LinearRegression(labelCol='duration')
evaluator = RegressionEvaluator(labelCol='duration')

# Create a cross validator
cv = CrossValidator(estimator=regression, estimatorParamMaps=params, 
                    evaluator=evaluator, numFolds=5)

# Train and test model on multiple folds of the training data
cv = cv.fit(flights_train)
cv
# NOTE: Since cross-valdiation builds multiple models, the fit() method can take a little while to complete.

CrossValidatorModel_58be9d4e89bb

In [11]:
# Create an empty paramter grid
params = ParamGridBuilder().build()

# Create regression model
regression = LinearRegression(labelCol='duration')
evaluator = RegressionEvaluator(labelCol='duration')

# Create an indexer for the org field
indexer = StringIndexer(inputCol='org', outputCol='org_idx')

# Create an one-hot encoder for the indexed org field
onehot = OneHotEncoder(inputCol='org_idx', outputCol='org_dummy')

# Assemble the km and one-hot encoded fields
assembler = VectorAssembler(inputCols=['km', 'org_dummy'], outputCol='features')

# Create a pipeline and cross-validator
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=params,
                    evaluator=evaluator)

In [None]:
# Create an indexer for the org field
indexer = StringIndexer(inputCol='org', outputCol='org_idx')

# Create an one-hot encoder for the indexed org field
onehot = OneHotEncoder(inputCols=['org_idx'], outputCols=['org_dummy'])

# Assemble the km and one-hot encoded fields
assembler = VectorAssembler(inputCols=['km', 'org_dummy'], outputCol='features')

# Create a pipeline and cross-validator.
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=params,
                    evaluator=evaluator)

In [12]:
# Create parameter grid
params = ParamGridBuilder()

# Add grids for two parameters
params = params.addGrid(regression.regParam, [0.01, 0.1, 1.0, 10.0])\
               .addGrid(regression.elasticNetParam, [0.0, 0.5, 1.0])

# Build the parameter grid
params = params.build()
print('Number of models to be tested: ', len(params))

# Create cross-validator
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=params, 
                    evaluator=evaluator, numFolds=5)

Number of models to be tested:  12


In [18]:
# Drop the existed feature column
flights_train, flights_test = flights.drop('features').randomSplit([0.8, 0.2])

# Train the data
cvModel = cv.fit(flights_train)

# Get the best model from cross validation
best_model = cvModel.bestModel

# Look at the stages in the best model
print(best_model.stages)

# Get the parameters for the LinearRegression object in the best model
best_model.stages[3].extractParamMap()

# Generate predictions on test data using the best model then calculate RMSE
predictions = best_model.transform(flights_test)
print("RMSE =", evaluator.evaluate(predictions))

[StringIndexerModel: uid=StringIndexer_57a57fa3f95a, handleInvalid=error, OneHotEncoderModel: uid=OneHotEncoder_9c94b567ac87, dropLast=true, handleInvalid=error, VectorAssembler_46e3f09e608a, LinearRegressionModel: uid=LinearRegression_5c48066dcda7, numFeatures=8]
RMSE = 11.315301114936847


In [19]:
# # Get the best model from cross validation
# best_model = cv.bestModel

# # Look at the stages in the best model
# print(best_model.stages)

# # Get the parameters for the LinearRegression object in the best model
# best_model.stages[3].extractParamMap()

# # Generate predictions on testing data using the best model then calculate RMSE
# predictions = best_model.transform(flights_test)
# print("RMSE =", evaluator.evaluate(predictions))

In [20]:
# Create parameter grid
params = ParamGridBuilder()

# Add grid for hashing trick parameters
params = params.addGrid(hasher.numFeatures, (1024, 4096, 16384))\
               .addGrid(hasher.binary, (True, False))

# Add grid for logistic regression parameters
params = params.addGrid(logistic.regParam, (0.01, 0.1, 1.0, 10.0))\
               .addGrid(logistic.elasticNetParam, (0.0, 0.5, 1.0))

# Build parameter grid
params = params.build()

print('Number of models to be tested: ', len(params))

Number of models to be tested:  72


In [21]:
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

assembler = VectorAssembler(inputCols=['mon', 'depart', 'duration'], outputCol='features')
flights = assembler.transform(flights.drop('features'))
flights = flights.withColumn('label', (flights.delay >= 15).cast('integer'))
flights = flights.select('mon', 'depart', 'duration', 'features', 'label')
flights = flights.dropna()

flights.show(5)

+---+------+--------+-----------------+-----+
|mon|depart|duration|         features|label|
+---+------+--------+-----------------+-----+
|  0| 16.33|      82| [0.0,16.33,82.0]|    1|
|  2|  6.17|      82|  [2.0,6.17,82.0]|    0|
|  9| 10.33|     195|[9.0,10.33,195.0]|    0|
|  5|  7.98|     102| [5.0,7.98,102.0]|    0|
|  7| 10.83|     135|[7.0,10.83,135.0]|    1|
+---+------+--------+-----------------+-----+
only showing top 5 rows



In [22]:
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pprint import pprint

flights_train, flights_test = flights.randomSplit([0.8, 0.2])

# Create model objects and train on training data
tree = DecisionTreeClassifier().fit(flights_train)
gbt = GBTClassifier().fit(flights_train)

# Compare AUC on test data
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(tree.transform(flights_test))
evaluator.evaluate(gbt.transform(flights_test))

# Find the number of trees and the relative importance of features
pprint(gbt.trees)
print(gbt.featureImportances)

[DecisionTreeRegressionModel: uid=dtr_46cc5d298f73, depth=5, numNodes=63, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_94e6bacd12f1, depth=5, numNodes=63, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_a229ec41c4a7, depth=5, numNodes=63, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_666f136ac103, depth=5, numNodes=63, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_8a867e6cd431, depth=5, numNodes=63, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_0e275e0b9a66, depth=5, numNodes=63, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_66fd33f9643d, depth=5, numNodes=63, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_b1859b1e2227, depth=5, numNodes=63, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_f0342ab5385d, depth=5, numNodes=63, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_fd925bf00459, depth=5, numNodes=63, numFeatures=3,
 DecisionTreeRegressionModel: uid=dtr_b8429acbe13a, depth=5, numNodes=59, numFeatures=3,
 DecisionTreeRegressi

In [23]:
# Import the classes required
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Create model objects and train on training data
tree = DecisionTreeClassifier().fit(flights_train)
gbt = GBTClassifier().fit(flights_train)

# Compare AUC on testing data
evaluator = BinaryClassificationEvaluator()
print(evaluator.evaluate(tree.transform(flights_test)))
print(evaluator.evaluate(gbt.transform(flights_test)))

# Find the number of trees and the relative importance of features
print(gbt.getNumTrees)
print(gbt.featureImportances)

0.6317135994213321
0.6715623984909723
20
(3,[0,1,2],[0.3620215021193922,0.31479204980678727,0.32318644807382035])


In [24]:
from pyspark.ml.classification import RandomForestClassifier

# Create a random forest classifier
forest = RandomForestClassifier()

# Create a parameter grid
params = ParamGridBuilder() \
        .addGrid(forest.featureSubsetStrategy, ['all', 'onethird', 'sqrt', 'log2']) \
        .addGrid(forest.maxDepth, [2, 5, 10]) \
        .build()

# Create a binary classification evaluator
evaluator = BinaryClassificationEvaluator()

# Create a cross-validator
cv = CrossValidator(estimator=forest, estimatorParamMaps=params, 
                    evaluator=evaluator, numFolds=5)

In [27]:
cv

CrossValidator_12f77b69ac92

In [28]:
cv = cv.fit(flights_train)
# Average AUC for each parameter combination in grid
print(cv.avgMetrics)

# Average AUC for the best model
print(max(cv.avgMetrics))

# What's the optimal parameter value for maxDepth?
print(cv.bestModel.explainParam('maxDepth'))
# What's the optimal parameter value for featureSubsetStrategy?
print(cv.bestModel.explainParam('featureSubsetStrategy'))

# AUC for best model on testing data
print(evaluator.evaluate(cv.transform(flights_test)))

[0.6192442309867001, 0.660902695918036, 0.6707696192701326, 0.6406393858684399, 0.6648784750709386, 0.6756797541227204, 0.6419117412289781, 0.6637761737910489, 0.6734437206206086, 0.6419117412289781, 0.6637761737910489, 0.6734437206206086]
0.6756797541227204
maxDepth: Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30]. (default: 5, current: 10)
featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 'n' (when n is in the range (0, 1.0], use n * number of features. When n is in the range (1, number of features), use n features). default = 'auto' (def

In [26]:
cvModel = cv.fit(flights_train)

# Average AUC for each parameter combination in grid
avg_auc = cvModel.avgMetrics

# Average AUC for the best model
best_model_auc = max(avg_auc)

# What's the optimal paramter value?
opt_max_depth = cvModel.bestModel.explainParam('maxDepth')
opt_feat_substrat = cvModel.bestModel.explainParam('featureSubsetStrategy')

# AUC for best model on test data
best_auc = evaluator.evaluate(cvModel.transform(flights_test))
print(best_auc)

0.6726534961105283
