In [0]:
!pip install pyspark
!pip install cytoolz

Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[?25l[K     |                                | 10 kB 21.1 MB/s eta 0:00:14[K     |                                | 20 kB 19.5 MB/s eta 0:00:15[K     |                                | 30 kB 25.9 MB/s eta 0:00:11[K     |                                | 40 kB 13.7 MB/s eta 0:00:21[K     |                                | 51 kB 16.1 MB/s eta 0:00:18[K     |                                | 61 kB 18.6 MB/s eta 0:00:16[K     |                                | 71 kB 16.7 MB/s eta 0:00:17[K     |                                | 81 kB 18.2 MB/s eta 0:00:16[K     |                                | 92 kB 16.6 MB/s eta 0:00:17[K     |                                | 102 kB 17.1 MB/s eta 0:00:17[K     |                                | 112 kB 17.1 MB/s eta 0:00:17[K     |                                | 122 kB 17.1 MB/s eta 0:00:17[K     |                                | 133 kB 17.1 MB/s eta 0:00:17

In [0]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.types import *
import typing as T
import cytoolz.curried as tz
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
import pyspark
import os


In [0]:
spark = SparkSession.builder.appName("Ethan_Daniel").getOrCreate()
spark.conf.set("spark.sql.caseSensitive", "true")
prefix = "/mnt/lab94290/inputdata/part2/"
filelist = [ prefix + f"requestLog_C_92004{i}.log.tar_0.json" for i in range(0,10)]
fname_ref = "bids_12.json"
inferred = spark.read.json("/mnt/lab94290/inputdata/"+fname_ref)
data = spark.read.json(filelist)
data_test = spark.read.json(prefix + 'unlabled_bids.json')

In [0]:
data.printSchema()

root
 |-- app: struct (nullable = true)
 |    |-- bundle: string (nullable = true)
 |    |-- cat: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- content: struct (nullable = true)
 |    |    |-- cat: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- context: long (nullable = true)
 |    |    |-- embeddable: long (nullable = true)
 |    |    |-- episode: long (nullable = true)
 |    |    |-- language: string (nullable = true)
 |    |    |-- len: long (nullable = true)
 |    |    |-- livestream: long (nullable = true)
 |    |    |-- qagmediarating: long (nullable = true)
 |    |    |-- sourcerelationship: long (nullable = true)
 |    |    |-- url: string (nullable = true)
 |    |    |-- userrating: string (nullable = true)
 |    |    |-- videoquality: long (nullable = true)
 |    |-- domain: string (nullable = true)
 |    |-- ext: struct (nullable = true)
 |    |    |-- apilevel: long (nullable = true)


In [0]:
from pyspark.sql.functions import col

num_true_rows = data.where(col('fake') == True)
num_true_rows.count()

Out[118]: 20203

Preprocessing :

In [0]:
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import to_timestamp
from pyspark.ml.feature import Imputer
from pyspark.sql.functions import when

def pre_process(data, a=False):
    if a == False:
        data = data.select(data["id"],
                           data["device.geo.lat"].alias("latitude"),
                           data["location.countryCode"].alias("country"),
                           data["device.geo.lon"].alias("longitude"),
                           data["device.connectiontype"].alias("connectionType"),
                           data["device.devicetype"].alias("devicetype"),
                           data["imp.position"].alias('imp_pos'),
                           data["imp.bidfloor"].alias('bidfloor'),
                           data["device.os"].alias("os"),
                           data["device.lmt"].alias("lmt"),
                           data["device.dnt"].alias("dnt"),
                           data["fake"],
                           )
        data = data.select('id',
                           F.col('bidfloor')[0].alias('bidfloor'),
                           F.col('imp_pos')[0].alias('imp_pos'),
                                'latitude',
                                'longitude',
                                'connectionType',
                                'devicetype',
                                'country',
                                'os',
                                'lmt',
                                'dnt',
                                 'fake',
                          ).select('*')     
        data = data.withColumn("new_fake", when(data["fake"] == True, 1).otherwise(0))
        data = data.drop("fake").withColumnRenamed("new_fake", "fake")   
        
    if a == True:
        data = data.select(data["id"],
                           data["device.geo.lat"].alias("latitude"),
                           data["location.countryCode"].alias("country"),
                           data["device.geo.lon"].alias("longitude"),
                           data["device.connectiontype"].alias("connectionType"),
                           data["device.devicetype"].alias("devicetype"),
                           data["imp.position"].alias('imp_pos'),
                           data["imp.bidfloor"].alias('bidfloor'),
                           data["device.os"].alias("os"),
                           data["device.lmt"].alias("lmt"),
                           data["device.dnt"].alias("dnt"),
                           )
        data = data.select('id',
                           F.col('bidfloor')[0].alias('bidfloor'),
                           F.col('imp_pos')[0].alias('imp_pos'),
                                'latitude',
                                'longitude',
                                'connectionType',
                                'devicetype',
                                'country',
                                'os',
                                'lmt',
                                'dnt',
                          ).select('*') 

    indexer_country = StringIndexer(inputCol='country', outputCol='country_id').setHandleInvalid("keep").fit(data)
    indexed_df1 = indexer_country.transform(data)
    data = indexed_df1.drop('country')
    
    indexer_language = StringIndexer(inputCol='os', outputCol='os_id').setHandleInvalid("keep").fit(data)
    indexed_df2 = indexer_language.transform(data)
    data = indexed_df2.drop('os')

    return data

In [0]:
dataset = pre_process(data)
dataset_test = pre_process(data_test, a=True)

In [0]:
from pyspark.sql.types import StructType


def schema_to_columns(schema: pyspark.sql.types.StructType) -> T.List[T.List[str]]:
    """
    Produce a flat list of column specs from a possibly nested DataFrame schema
    """

    columns = list()

    def helper(schm: pyspark.sql.types.StructType, prefix: list = None):

        if prefix is None:
            prefix = list()

        for item in schm.fields:
            if isinstance(item.dataType, pyspark.sql.types.StructType):
                helper(item.dataType, prefix + [item.name])
            else:
                columns.append(prefix + [item.name])

    helper(schema)

    return columns


In [0]:
def flatten_frame(frame: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:

    aliased_columns = list()

    for col_spec in schema_to_columns(frame.schema):
        c = tz.get_in(col_spec, frame)
        if len(col_spec) == 1:
            aliased_columns.append(c)
        else:
            aliased_columns.append(c.alias('_'.join(col_spec)))

    return frame.select(aliased_columns)

In [0]:
dataset = flatten_frame(dataset)
dataset_test = flatten_frame(dataset_test)

Here we decided to split the dataset in two different parts before the imputation. Indeed, the classification model we are using could encounter a problem because the labelled dataset is heavily imbalanced that may affect our model's performance. After the imputation, we will merge the two datasets back together in order to perform our algorithm on it.

In [0]:
true_data = dataset.filter(dataset["fake"]==1)
false_data = dataset.filter(dataset["fake"]==0)

In [0]:
print('Lenght of the dataset where "fake" is true :', true_data.count())
print('Lenght of the dataset where "fake" is false :', false_data.count())

Lenght of the dataset where "fake" is true : 20203
Lenght of the dataset where "fake" is false : 3990119


Imputation :

In [0]:
def imputation(data):

    # For categorical features, we replaced  nan by "missing value".
    data = data.na.fill({'country_id': 'missing value'})
    data = data.na.fill({'os_id': 'missing value'})
    
    # For float/double features, we replaced the missing values with the mean of the feature. 
    mean_imputer = Imputer(inputCols=data.select('bidfloor','latitude','longitude').columns, outputCols=['bidfloor',
                                     'latitude', 'longitude'],
                  strategy="mean")

    # For integer features, we replaced the missing values with the most common value of the feature.
    most_commun_imputer = Imputer(inputCols=data.select('imp_pos','connectionType', 'devicetype','dnt','lmt').columns, outputCols=['imp_pos',
                                     'connectionType', 'devicetype','dnt','lmt'],
                  strategy="mode")

    data = mean_imputer.fit(data).transform(data)
    data = most_commun_imputer.fit(data).transform(data)

    return data

In [0]:
true_data = imputation(true_data)
false_data = false_data.na.drop()
dataset_test = imputation(dataset_test)

Randomizing :

Here we randomized the data because it helps to reduce bias in the dataset and prevent the model from learning the specific order of the examples. Randomizing the data ensures that each example has an equal chance of being selected during the training process, making it less likely that the model will be biased towards any particular subset of the data. Additionally, randomizing the data helps to improve the model's ability to generalize to new data by ensuring that the model does not overfit to any particular pattern or sequence in the training set.

In [0]:
from pyspark.sql.functions import rand

all_data = true_data.union(false_data)
all_data = all_data.orderBy(rand())


Question 1 – Spark supervised learning

For this part we choose to use the Random Forest algorithm.
Random Forest is an ensemble learning algorithm that combines multiple decision trees to make a prediction. The algorithm works by creating multiple decision trees using bootstrapped samples of the original training data, and then aggregating their predictions to make a final prediction.

In each tree, the algorithm selects a random subset of the features and uses them to determine the best split point. The split with the highest information gain is selected as the root node, and the process is repeated for each of the child nodes until all leaves contain only instances from one class.

Once all trees have been generated, the final prediction is made by taking a majority vote among the predictions of all the trees. If the trees are well-built, the errors of the individual trees will be random and uncorrelated, leading to an improvement in accuracy compared to a single decision tree.

Moreover, there are several reasons why we choose to use the Random Forest algorithm for our problem:

 - High accuracy: Random Forest is a highly accurate algorithm, often outperforming other popular algorithms such as decision trees and support vector machines.

 - Robust to Overfitting: The random selection of features in each tree helps prevent overfitting and makes the algorithm more robust to noisy or irrelevant data.

 - Easy to use: Random Forest is relatively easy to implement and can be quickly trained even on large datasets.

 - Feature Importance: Random Forest can be used to determine the importance of each feature in the prediction, making it useful for feature selection and understanding the relationships between the features and the target.

 - Works well with Non-linear Relationships: Random Forest is capable of modeling complex, non-linear relationships between features and the target, making it a versatile algorithm for a variety of use cases.

Overall, Random Forest is a powerful and flexible algorithm that can be applied to a wide range of supervised learning problems.

In [0]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
assembler = VectorAssembler(inputCols=['bidfloor', 'imp_pos', 'latitude', 'longitude', 'connectionType', 'devicetype', 'lmt', 'dnt', 'country_id', 'os_id'], outputCol="features")
data_assembler = assembler.transform(all_data).select('id', 'features', 'fake')
data_assembler = data_assembler.dropDuplicates(['id'])
testset_assembler = assembler.transform(dataset_test).select('id', 'features')
testset_assembler = testset_assembler.dropDuplicates(['id'])

In [0]:
data_assembler.show()

+--------------------+--------------------+----+
|                  id|            features|fake|
+--------------------+--------------------+----+
|000023b0-0e30-254...|[0.72579,0.0,25.2...|   0|
|0000269e-fee5-4d4...|[0.431507,0.0,-29...|   0|
|00005803-e6dc-394...|[2.01500999999999...|   0|
|00005d8e-2a69-8a4...|[0.31075,0.0,35.6...|   0|
|000068f9-894a-084...|[0.49872,0.0,26.5...|   0|
|00009af8-4d0a-014...|[0.29589,1.0,35.6...|   0|
|0000a8f8-f1e2-034...|[0.54,1.0,24.3569...|   0|
|0000be32-2277-3b4...|[0.136,0.0,-26.38...|   0|
|0000c7c5-6693-b44...|[0.08519,0.0,24.4...|   0|
|0000cbf6-0aad-764...|[2.25,0.0,21.5169...|   0|
|0000f018-9c50-9c4...|[0.387671,0.0,25....|   0|
|0000fa62-14e5-6c4...|[0.83707,1.0,25.2...|   0|
|0001051d-142c-424...|[0.54,1.0,25.3268...|   0|
|00012a80-7e01-1a4...|[0.36188000000000...|   0|
|00013cbf-818d-1d4...|[0.09946999999999...|   0|
|000147de-4f45-3b4...|[0.233,0.0,25.258...|   0|
|000160d5-8d0e-4f4...|[0.183,1.0,-25.75...|   0|
|00017022-6967-be4..

In [0]:
from sklearn.metrics import confusion_matrix
from pyspark.mllib.evaluation import MulticlassMetrics

training_data, test_data = data_assembler.randomSplit([0.8, 0.2])

rf = RandomForestClassifier(labelCol="fake", featuresCol="features", numTrees=100,maxDepth=15)
model = rf.fit(training_data)


test_data_predictions = model.transform(test_data)
# test_data_predictions = test_data_predictions.withColumnRenamed("fake","label")
test_data_predictions = test_data_predictions.withColumn("label", col("fake").cast("float"))
test_data_predictions = test_data_predictions.drop("fake")

results = test_data_predictions.select(['prediction', 'label'])
predictionAndLabels=results.rdd
metrics = MulticlassMetrics(predictionAndLabels)



In [0]:
test_data_predictions.filter(test_data_predictions.prediction==1).count()

Out[18]: 4087

In [0]:
test_data_predictions.count()

Out[19]: 229143

In [0]:
metrics.confusionMatrix().toArray()

Out[20]: array([[2.25056e+05, 1.00000e+01],
       [0.00000e+00, 4.07700e+03]])

In [0]:
precision = metrics.precision(0.0)
recall = metrics.recall(0.0)
f1score = metrics.fMeasure(0.0, 1.0)
accuracy = metrics.accuracy
print("accuracy =",accuracy)
print("recall =",recall)
print("precision =",precision)
print("F1 score =",f1score)
print("Formula =", (5*precision*recall)/(4*precision+recall))


accuracy = 0.9999563591294519
recall = 0.9999555685887695
precision = 1.0
F1 score = 0.9999777838008362
Formula = 0.9999644545551488


In [0]:
# Extract the predictions into a new dataframe
prediction_df = test_data_predictions.select("label", "prediction")

# Count the number of anomalies in the prediction dataframe
num_anomalies = prediction_df.filter((prediction_df.label == "1")).count()

# Print the number of errors
print("Number of anomalies detected:", num_anomalies)

Number of anomalies detected: 4077


In [0]:
test_data_predictions1 = model.transform(testset_assembler)
# test_data_predictions = test_data_predictions.withColumnRenamed("fake","label")
preds1 = test_data_predictions1.filter(test_data_predictions1.prediction == 1).select('id').withColumnRenamed("id","anomaly")
preds1.write.csv("/mnt/lab94290/results/q1_345123624_931202543.csv")
preds1.count()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
[0;32m<command-3501086204430927>[0m in [0;36m<cell line: 4>[0;34m()[0m
[1;32m      2[0m [0;31m# test_data_predictions = test_data_predictions.withColumnRenamed("fake","label")[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      3[0m [0mpreds1[0m [0;34m=[0m [0mtest_data_predictions1[0m[0;34m.[0m[0mfilter[0m[0;34m([0m[0mtest_data_predictions1[0m[0;34m.[0m[0mprediction[0m [0;34m==[0m [0;36m1[0m[0;34m)[0m[0;34m.[0m[0mselect[0m[0;34m([0m[0;34m'id'[0m[0;34m)[0m[0;34m.[0m[0mwithColumnRenamed[0m[0;34m([0m[0;34m"id"[0m[0;34m,[0m[0;34m"anomaly"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 4[0;31m [0mpreds1[0m[0;34m.[0m[0mwrite[0m[0;34m.[0m[0mcsv[0m[0;34m([0m[0;34m"/mnt/lab94290/results/q1_345123624_931202543.csv"[0m[0;34m)[0m[0;34m[0m[0;34m

In [0]:
test_data_predictions1.count()

Out[24]: 90857

Question 2

For this part, we choose to discuss the Gradient Boosted Tree (GBT) algorithm. GBT is a powerful ensemble learning algorithm that combines multiple decision trees to make a prediction. The algorithm works by iteratively adding decision trees to the model, with each tree improving upon the errors of the previous tree.

In each iteration, the algorithm calculates the negative gradient of the loss function with respect to the current prediction, and trains a decision tree to fit the residuals. The new tree is then added to the model, and the prediction for the current instance is updated by summing the predictions of all the trees in the model.

The GBT algorithm has several advantages that make it a popular choice for many machine learning tasks:

 - Improved Accuracy: GBT is a highly accurate algorithm that can outperform many other popular algorithms, such as Random Forest.

 - Robust to Overfitting: GBT uses a process called regularization to prevent overfitting, making it more robust to noisy or irrelevant data. This ensures that the model generalizes well to new, unseen data.

 - Easy to use: GBT is relatively easy to implement and can be quickly trained even on large datasets. Its flexible nature makes it a good choice for various types of problems.

 - Feature Importance: GBT can be used to determine the importance of each feature in the prediction, making it useful for feature selection and understanding the relationships between the features and the target. This feature can help us identify the most significant factors that affect the outcome.

 - Non-linear Relationships: GBT is capable of modeling complex, non-linear relationships between features and the target, making it a versatile algorithm for a variety of use cases. This means that it can capture non-linear interactions between variables and produce more accurate predictions.

In addition to these advantages, GBT has some unique features that make it particularly effective for certain types of problems. For example, GBT can handle missing data and can work with a variety of loss functions, making it well-suited for problems with imbalanced or noisy data.

Overall, GBT is a powerful and flexible algorithm that can be applied to a wide range of supervised learning problems. While it may not be the best choice for every problem, its high accuracy and flexibility make it an effective algorithm.

In [0]:
from pyspark.ml.classification import GBTClassifier 
training_data, test_data = data_assembler.randomSplit([0.8, 0.2])

gbtcc = GBTClassifier(labelCol="fake", maxIter=10, maxBins=100)
model = gbtcc.fit(training_data)

test_data_predictions2 = model.transform(test_data)
test_data_predictions2 = test_data_predictions2.withColumn("label", col("fake").cast("float"))
test_data_predictions2 = test_data_predictions2.drop("fake")

results = test_data_predictions2.select(['prediction', 'label'])
predictionAndLabels=results.rdd
metrics = MulticlassMetrics(predictionAndLabels)






In [0]:
test_data_predictions2.filter(test_data_predictions2.prediction==1).count()

Out[26]: 4228

In [0]:
testset_predictions2 = model.transform(testset_assembler)
# test_data_predictions = test_data_predictions.withColumnRenamed("fake","label")
preds2 = testset_predictions2.filter(testset_predictions2.prediction == 1).select('id').withColumnRenamed("id","anomaly")
preds2.write.csv("/mnt/lab94290/results/q2_345123624_931202543.csv")

print(preds2.count())

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
[0;32m<command-3501086204430931>[0m in [0;36m<cell line: 4>[0;34m()[0m
[1;32m      2[0m [0;31m# test_data_predictions = test_data_predictions.withColumnRenamed("fake","label")[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      3[0m [0mpreds2[0m [0;34m=[0m [0mtestset_predictions2[0m[0;34m.[0m[0mfilter[0m[0;34m([0m[0mtestset_predictions2[0m[0;34m.[0m[0mprediction[0m [0;34m==[0m [0;36m1[0m[0;34m)[0m[0;34m.[0m[0mselect[0m[0;34m([0m[0;34m'id'[0m[0;34m)[0m[0;34m.[0m[0mwithColumnRenamed[0m[0;34m([0m[0;34m"id"[0m[0;34m,[0m[0;34m"anomaly"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 4[0;31m [0mpreds2[0m[0;34m.[0m[0mwrite[0m[0;34m.[0m[0mcsv[0m[0;34m([0m[0;34m"/mnt/lab94290/results/q2_345123624_931202543.csv"[0m[0;34m)[0m[0;34m[0m[0;34m[0m

In [0]:
precision = metrics.precision(0.0)
recall = metrics.recall(0.0)
f1score = metrics.fMeasure(0.0, 1.0)
accuracy = metrics.accuracy
print("accuracy =",accuracy)
print("recall =",recall)
print("precision =",precision)
print("F1 score =",f1score)
print("Formula =", (5*precision*recall)/(4*precision+recall))

accuracy = 0.9992803056738083
recall = 0.9992673211930675
precision = 1.0
F1 score = 0.9996335263427928
Formula = 0.9994137710509486
