In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
import os

# Point the Java and Spark tooling at the locations installed in the setup cell.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
})

In [None]:
import findspark

# findspark.init() is deliberately called before pyspark is imported
# (presumably it makes the pyspark package under SPARK_HOME importable — confirm).
findspark.init()

from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise start a new one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

# Turn on eager evaluation for the REPL/notebook.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# Display the session object as the cell output.
spark

In [None]:
import sklearn
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## WARNING: the full file "delay_clean.csv" is > 4 GB, so it has been added to .gitignore.
## This notebook uses a reduced dataset instead.

In [None]:
# Read the LIBSVM-formatted file from Drive into a DataFrame, then preview the first rows.
libsvm_path = '/content/drive/MyDrive/Colab_Notebooks/delay_clean_SVM.txt'
libsvm_reader = spark.read.format("libsvm")
clean = libsvm_reader.load(libsvm_path)
clean.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
+-----+--------------------+
only showing top 5 rows



In [None]:
# Print the column names and types of the loaded DataFrame.
clean.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [None]:
# Number of rows in dataset (count() triggers a Spark job over the loaded data)
number_rows = clean.count()
number_rows

6489057

In [None]:
# Class balance: number of rows for each distinct value of "label".
clean.groupBy('label').count().show()

+-----+-------+
|label|  count|
+-----+-------+
|  0.0|5261690|
|  1.0|1227367|
+-----+-------+



In [None]:
# Map the raw "label" column to "indexedLabel", attaching label metadata.
# The indexer is fitted on the full dataset so that every label value is in the index.
label_indexer_estimator = StringIndexer(inputCol="label", outputCol="indexedLabel")
labelIndexer = label_indexer_estimator.fit(clean)

In [None]:
# Detect categorical features inside the "features" vector and index them.
# With maxCategories = 4, any feature with more than 4 distinct values is treated as continuous.
feature_indexer_estimator = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
)
featureIndexer = feature_indexer_estimator.fit(clean)

In [None]:
from pyspark.ml.feature import Normalizer

In [None]:
# Normalize each "features" vector with p = 1.0 and write the result to "normFeatures".
# NOTE(review): NormOutput is not referenced by any later cell visible here — confirm it is still needed.
normalizer = Normalizer(inputCol = "features", outputCol = "normFeatures", p = 1.0)
NormOutput = normalizer.transform(clean)

In [None]:
# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the split reproducible across kernel restarts and re-runs.
(trainingData, testData) = clean.randomSplit([0.7, 0.3], seed = 42)

In [None]:
# Print the schema of the training split.
# printSchema must be called; referencing it without () only displays the bound method.
trainingData.printSchema()

<bound method DataFrame.printSchema of +-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
+-----+--------------------+
only showing top 20 rows
>

In [None]:
# Preview the first 10 rows of the held-out test split.
testData.show(10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
|  0.0|(153,[0,1,2,3,4,5...|
+-----+--------------------+
only showing top 10 rows



# Factorization machines classifier

In [None]:
from pyspark.ml.classification import FMClassifier
from pyspark.ml.feature import MinMaxScaler

In [None]:
# Index labels, adding metadata to the label column
# Fit on whole dataset to include all labels in index
labelIndexer = StringIndexer(inputCol = "label", outputCol = "indexedLabel").fit(clean)

# Scale features with a min-max scaler.
# The scaler is intentionally left unfitted: as a Pipeline stage it is fitted by
# pipeline.fit() on the training split only, so the per-feature min/max are not
# computed from held-out test rows (which would leak test information into training).
featureScaler = MinMaxScaler(inputCol = "features", outputCol = "scaledFeatures")

In [None]:
# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the split, and the metrics computed from it, reproducible.
(trainingData, testData) = clean.randomSplit([0.7, 0.3], seed = 42)

In [None]:
# Configure the FM classifier; nothing is trained until the pipeline is fitted.
# It reads the indexed label and the scaled feature vector.
# seed fixes the random initialisation so repeated runs are comparable.
fm = FMClassifier(labelCol = "indexedLabel", featuresCol = "scaledFeatures", stepSize = 0.001, seed = 42)

In [None]:
# Assemble the stages in execution order: label indexing, feature scaling, FM classifier.
pipeline_stages = [labelIndexer, featureScaler, fm]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Train model: fit the pipeline on the training split; the result is the fitted model used for prediction.
model = pipeline.fit(trainingData)

In [None]:
# Make predictions: run the fitted pipeline over the held-out test split.
predictions2 = model.transform(testData)

In [None]:
# Show a few example rows: predicted class, indexed true label, and the input features.
preview_columns = ["prediction", "indexedLabel", "features"]
predictions2.select(*preview_columns).show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         0.0|(153,[0,1,2,3,4,5...|
|       0.0|         0.0|(153,[0,1,2,3,4,5...|
|       0.0|         0.0|(153,[0,1,2,3,4,5...|
|       0.0|         0.0|(153,[0,1,2,3,4,5...|
|       0.0|         0.0|(153,[0,1,2,3,4,5...|
+----------+------------+--------------------+
only showing top 5 rows



In [None]:
# Compare predictions against the indexed true labels and report test accuracy and error.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions2)
test_error = 1.0 - accuracy
print("Test set accuracy = %g" % accuracy)
print("Test Error = %g" % test_error)

Test set accuracy = 0.810825
Test Error = 0.189175


In [None]:
# Collect true labels and predictions in ONE action so the two lists are row-aligned.
# Two separate collect() calls evaluate the pipeline twice and do not guarantee that
# rows come back in the same order.
# NOTE: this pulls every test row to the driver; fine for two numeric columns, but keep it narrow.
label_prediction_rows = predictions2.select("indexedLabel", "prediction").collect()

# Unpack the Row objects into flat lists of floats for the scikit-learn metric functions.
y_true = [row["indexedLabel"] for row in label_prediction_rows]
y_pred = [row["prediction"] for row in label_prediction_rows]

In [None]:
# Confusion matrix of true labels (y_true) against predicted labels (y_pred).
print(confusion_matrix(y_true, y_pred))

[[1577956       0]
 [ 368156       0]]


In [None]:
# Per-class precision, recall and F1 for the test predictions.
# NOTE(review): in the recorded run the report shows 0.00 precision/recall for class 1.0,
# so the headline accuracy alone is misleading — confirm whether this is still the case.
print(classification_report(y_true, y_pred))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.81      1.00      0.90   1577956
         1.0       0.00      0.00      0.00    368156

    accuracy                           0.81   1946112
   macro avg       0.41      0.50      0.45   1946112
weighted avg       0.66      0.81      0.73   1946112



  _warn_prf(average, modifier, msg_start, len(result))
