In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

In [4]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark

In [5]:
import sklearn
from sklearn.metrics import classification_report, confusion_matrix

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## WARNING, FILE "delay_clean.csv" is > 4 GB  --  added to gitignore
## Using reduced dataset.....

In [7]:
# Load and parse the data file, converting it to a DataFrame
clean = spark.read.format("libsvm").load('/content/drive/MyDrive/Colab_Notebooks/delay_clean2K_SVM.txt')
clean.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
+-----+--------------------+
only showing top 5 rows



In [8]:
clean.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [9]:
# Number of rows in dataset
number_rows = clean.count()
number_rows

200000

In [10]:
clean.groupBy('label').count().show()

+-----+------+
|label| count|
+-----+------+
|  0.0|164920|
|  1.0| 35080|
+-----+------+



In [11]:
# Index labels, adding metadata to the label column
# Fit on whole dataset to include all labels in index
labelIndexer = StringIndexer(inputCol = "label", outputCol = "indexedLabel").fit(clean)

In [12]:
# Automatically identify categorical features, and index them
# Set maxCategories so features with > 4 distinct values are treated as continuous
featureIndexer = VectorIndexer(inputCol = "features", outputCol = "indexedFeatures", maxCategories = 4).fit(clean)

In [13]:
from pyspark.ml.feature import Normalizer

In [14]:
normalizer = Normalizer(inputCol = "features", outputCol = "normFeatures", p = 1.0)
NormOutput = normalizer.transform(clean)

In [15]:
# Split the data into training and test sets
(trainingData, testData) = clean.randomSplit([0.75, 0.25])

In [16]:
trainingData.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
+-----+--------------------+
only showing top 5 rows



In [17]:
testData.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
+-----+--------------------+
only showing top 5 rows



# Gradient-boosted tree classifier (GBT)

In [73]:
# Train a GBT model
gbt = GBTClassifier(labelCol = "indexedLabel", featuresCol = "indexedFeatures", maxIter = 30, maxDepth = 10,
                    stepSize = 1)

In [74]:
# Chain indexers and GBT in a Pipeline
pipeline = Pipeline(stages = [labelIndexer, featureIndexer, gbt])

In [75]:
# Train model.  This also runs the indexers
model = pipeline.fit(trainingData)

In [76]:
# Make predictions
predictions = model.transform(testData)

In [77]:
# Select example rows to display
predictions.select("prediction", "indexedLabel", "features").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       1.0|         0.0|(137,[0,1,2,3,4,5...|
|       1.0|         0.0|(137,[0,1,2,3,4,5...|
|       1.0|         0.0|(137,[0,1,2,3,4,5...|
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
+----------+------------+--------------------+
only showing top 5 rows



In [78]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol = "indexedLabel", predictionCol = "prediction", metricName = "accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))

Accuracy = 0.80939
Test Error = 0.19061


In [79]:
gbtModel = model.stages[2]
print(gbtModel)  # summary only

GBTClassificationModel: uid = GBTClassifier_23d4fd94c4a9, numTrees=30, numClasses=2, numFeatures=137


In [80]:
y_true = predictions.select(['indexedLabel']).collect()
y_pred = predictions.select(['prediction']).collect()

In [81]:
print(confusion_matrix(y_true, y_pred))

[[38048  2982]
 [ 6526  2326]]


In [82]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.85      0.93      0.89     41030
         1.0       0.44      0.26      0.33      8852

    accuracy                           0.81     49882
   macro avg       0.65      0.60      0.61     49882
weighted avg       0.78      0.81      0.79     49882



In [83]:
stop

NameError: ignored

In [None]:
importanceSummary = gbtModel.featureImportances
importanceSummary

In [None]:
gbt.save("gbt_model.model")

# Random forest classifier (RFC)

In [None]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString

In [None]:
# Train a RandomForest model
rf = RandomForestClassifier(labelCol = "indexedLabel", featuresCol = "indexedFeatures", numTrees = 10)

In [None]:
# Convert indexed labels back to original labels
labelConverter = IndexToString(inputCol = "prediction", outputCol = "predictedLabel",
                               labels = labelIndexer.labels)

In [None]:
# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages = [labelIndexer, featureIndexer, rf, labelConverter])

In [None]:
# Train model.  This also runs the indexers
model = pipeline.fit(trainingData)

In [None]:
# Make predictions
predictions1 = model.transform(testData)

In [None]:
# Select example rows to display
predictions1.select("predictedLabel", "label", "features").show(5)

In [None]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol = "indexedLabel", predictionCol = "prediction", metricName = "accuracy")
accuracy = evaluator.evaluate(predictions1)
print("Accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))

In [None]:
rfModel = model.stages[2]
print(rfModel)  # summary only

In [None]:
print(rfModel.featureImportances)

In [None]:
y_true = predictions1.select(['indexedLabel']).collect()
y_pred = predictions1.select(['prediction']).collect()

In [None]:
print(confusion_matrix(y_true, y_pred))

In [None]:
print(classification_report(y_true, y_pred))

# Factorization machines classifier

In [None]:
from pyspark.ml.classification import FMClassifier
from pyspark.ml.feature import MinMaxScaler

In [None]:
# Index labels, adding metadata to the label column
# Fit on whole dataset to include all labels in index
labelIndexer = StringIndexer(inputCol = "label", outputCol = "indexedLabel").fit(clean)

# Scale features
featureScaler = MinMaxScaler(inputCol = "features", outputCol = "scaledFeatures").fit(clean) 

In [None]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = clean.randomSplit([0.7, 0.3])

In [None]:
# Train a FM model
fm = FMClassifier(labelCol = "indexedLabel", featuresCol = "scaledFeatures", stepSize = 0.001)

In [None]:
# Create a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureScaler, fm])

In [None]:
# Train model
model = pipeline.fit(trainingData)

In [None]:
# Make predictions
predictions2 = model.transform(testData)

In [None]:
# Select example rows to display
predictions2.select("prediction", "indexedLabel", "features").show(5)

In [None]:
# Select (prediction, true label) and compute test accuracy
evaluator = MulticlassClassificationEvaluator(
    labelCol = "indexedLabel", predictionCol = "prediction", metricName = "accuracy")
accuracy = evaluator.evaluate(predictions2)
print("Test set accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))

In [None]:
y_true = predictions2.select(['indexedLabel']).collect()
y_pred = predictions2.select(['prediction']).collect()

In [None]:
print(confusion_matrix(y_true, y_pred))

In [None]:
print(classification_report(y_true, y_pred))