In [None]:
# PySpark 
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
# functions
from functions import *
# ML
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC, RandomForestClassifier, GBTClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.feature import StandardScaler


In [None]:
# --- Input locations -------------------------------------------------------
# Glob of the JSON graph files and the CSV that holds one label per graph file.
json_folder = ".../datasets/CM-Buildings/*.json"
label_csv = ".../datasets/CM-Buildings/graphs.csv"

# --- Options forwarded to the helpers from `functions` ----------------------
# Both flags are passed to the type generator together with `level`.
specific_types_edge = specific_types_node = True
level = 4
# `iri` and `forward` are passed to json_to_encoding; `forward` also selects
# which type generator is used later in the notebook.
iri = forward = False
qualified_names = {"prov:QUALIFIED_NAME", "xsd:QName"}

# --- Label encoding ---------------------------------------------------------
# Values stay strings here: they replace the textual labels in the CSV and are
# cast to double afterwards.
label_map = dict(Trusted="1.0", Uncertain="0.0")
# Alternative three-class mapping kept for reference:
# label_map = {
#     "Valor": 0.0,
#     "Instinct": 1.0,
#     "Mystic": 2.0
# }

In [None]:
# Local Spark on all available cores, with 15g of driver memory.
conf = SparkConf().setAppName("spark").setMaster("local[*]").set("spark.driver.memory", "15g")
# getOrCreate() reuses a context/session that is already running, so this cell
# can be re-run in a live kernel; constructing SparkContext(conf=conf) a second
# time raises because only one context may be active per process.
# Note: when an existing context is reused, the settings in `conf` are not
# re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# spark.sparkContext.addPyFile("functions.py") this may be used in cloud environment

In [None]:
# Read every file matched by `json_folder` as one record:
# (file_path, json_data as a single string).
spark_context = spark.sparkContext
file_and_path_rdd = spark_context.wholeTextFiles(json_folder)

In [None]:
# (file_name, ProvDocument)
# NOTE(review): ProvDocument is not imported explicitly in this notebook;
# presumably it arrives through `from functions import *` - confirm.
def _to_named_document(path_and_text):
    """Turn (file_path, json_text) into (file_name, deserialized ProvDocument)."""
    file_path, json_text = path_and_text
    # Keep only the last path component as the file name.
    file_name = file_path.split("/")[-1]
    return (file_name, ProvDocument.deserialize(content=json_text))


document_rdd = file_and_path_rdd.map(_to_named_document)

In [None]:
# (file_name, graph encoding), built directly from the raw JSON text.
# Alternative that starts from document_rdd instead of the JSON text:
# encoding_rdd = document_rdd.map(lambda x: (x[0], document_to_encoding(x[1],iri,forward)))
def _to_named_encoding(path_and_text):
    """Turn (file_path, json_text) into (file_name, encoding of that JSON)."""
    file_path, json_text = path_and_text
    file_name = file_path.split("/")[-1]
    return (file_name, json_to_encoding(json_text, iri, forward, qualified_names))


encoding_rdd = file_and_path_rdd.map(_to_named_encoding)

In [None]:
# (file_name, prov_types of nodes)
# `forward` picks the generator: type_generate when True, type_generate_R otherwise.
# type_generate_mixed accepts the same arguments and can be swapped in for the
# forward case.
type_generator = type_generate if forward else type_generate_R
types_rdd = encoding_rdd.map(
    lambda named_encoding: (
        named_encoding[0],
        type_generator(named_encoding[1], level, specific_types_node, specific_types_edge),
    )
)

In [None]:
# (file_name, prov_types occurrence in the graph)
# Only the value of each pair is transformed; the file name is carried through.
types_count_rdd = types_rdd.mapValues(lambda node_types: count_prov_types(level, node_types))

In [None]:
# Collect every distinct prov_type of the whole collection onto the driver.
all_types = (
    types_count_rdd
    .flatMap(lambda named_counts: named_counts[1].keys())
    .distinct()
    .collect()
)
# Number of distinct prov_types (this is the feature-vector length used below).
types_count = len(all_types)
# prov_type -> index
index_map = {prov_type: position for position, prov_type in enumerate(all_types)}
# index -> prov_type
reverse_index_map = dict(enumerate(all_types))

In [None]:
# Construct feature vectors for each graph
def _to_named_row(named_counts):
    """Turn (file_name, prov_type counts) into (file_name, sparse_matrix(...) result)."""
    file_name, type_counts = named_counts
    return (file_name, sparse_matrix(type_counts, types_count, index_map))


def _to_named_vector(named_row):
    """Wrap the row of a (file_name, row) pair in a dense ML vector."""
    file_name, row_values = named_row
    return (file_name, Vectors.dense(row_values))


sparse_matrix_rdd = types_count_rdd.map(_to_named_row)
feature_vector_rdd = sparse_matrix_rdd.map(_to_named_vector)

In [None]:
# Convert the (file_name, vector) RDD to a DataFrame and name its two columns.
df_features = spark.createDataFrame(feature_vector_rdd).toDF("file", "features")

In [None]:
# Standardize features
# NOTE(review): the scaler is fitted on every graph, before the train/test
# split made later in the notebook, so test rows contribute to the scaling
# statistics.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
# Remove a "scaledFeatures" column left behind by an earlier run of this cell
# (drop() is a no-op when the column is absent). Without this, re-running the
# cell fails because the scaler's output column already exists.
df_features_unscaled = df_features.drop("scaledFeatures")
scaler_model = scaler.fit(df_features_unscaled)
# df_features keeps its name and gains the "scaledFeatures" column.
df_features = scaler_model.transform(df_features_unscaled)

In [None]:
# Load the label CSV (first row is the header).
labels_raw = spark.read.csv(label_csv, header=True)
# Replace the textual labels with their numeric strings from label_map ...
labels_mapped = labels_raw.replace(label_map, subset=["label"])
# ... and cast the column to double.
df_labels = labels_mapped.withColumn("label", labels_mapped["label"].cast(DoubleType()))

In [None]:
# Join the features and labels on the file name, keep only the scaled vector
# and the label, and expose the scaled vector under the name "features".
join_condition = df_features.file == df_labels.graph_file
df_joined = df_features.join(df_labels, join_condition)
df = (
    df_joined
    .select(df_features.scaledFeatures, df_labels.label)
    .withColumnRenamed("scaledFeatures", "features")
)

In [None]:
# Split the data into training (80%) and testing (20%) with a fixed seed.
split_weights = [0.8, 0.2]
train, test = df.randomSplit(split_weights, seed=123456)

In [None]:
# Oversample the data: every class with fewer training rows than the largest
# class is sampled with replacement so the classes end up roughly the same size.
# NOTE(review): `train` is overwritten below, so re-running this cell resamples
# the already oversampled frame; re-run the split cell first.
labels = [float(x) for x in label_map.values()]

# A single aggregation job returns the row count of every label (instead of one
# filter+count job per label). Labels that are not in label_map are ignored.
observed = {row["label"]: row["count"] for row in train.groupBy("label").count().collect()}
count = {x: observed.get(x, 0) for x in labels}

maxValue = max(count.values())
if maxValue == 0:
    raise ValueError("Training split contains no rows for any label in label_map")

# A class without training rows cannot be oversampled. Report it and leave it
# out instead of failing with a ZeroDivisionError.
missing = [x for x in labels if count[x] == 0]
if missing:
    print("No training rows for labels:", missing)

# Sampling fraction per class: largest class size / class size.
ratio = {x: maxValue / count[x] for x in labels if count[x] > 0}

dataframes = []
for x in labels:
    if count[x] == 0:
        continue
    class_rows = train.filter(train['label'] == x)
    if count[x] != maxValue:
        # Minority class: draw with replacement, fraction > 1 grows the class.
        class_rows = class_rows.sample(withReplacement=True, fraction=ratio[x], seed=123456)
    dataframes.append(class_rows)

# Stitch the per-class frames back together into the new training set.
train = dataframes[0]
for dataframe in dataframes[1:]:
    train = train.union(dataframe)

print(ratio)
# Row count per class after oversampling.
for x in labels:
    print(train.filter(train['label'] == x).count())

In [None]:
# LinearSVC classifier
svc = LinearSVC(maxIter = 100, threshold=0.0)

pipeline = Pipeline(stages=[svc])
# regParam is a double-valued parameter, so every grid value is written as a
# float (an int in the grid can fail to convert on the JVM side in PySpark
# versions that do not apply the parameter's type converter).
# The maxIter grid overrides the maxIter given to the constructor above.
paramGrid = (
    ParamGridBuilder()
    .addGrid(svc.regParam, [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001])
    .addGrid(svc.maxIter, [100, 500])
    .build()
)
# train the model and select the best model using "metricName"(hyperparameter tuning)
# The seed fixes the fold assignment so the reported metrics are reproducible;
# it matches the seed used for the split and the oversampling.
# NOTE(review): collectSubModels=True keeps every per-fold model in memory and
# nothing in this notebook reads them - consider dropping it.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    numFolds=10,
    seed=123456,
    collectSubModels=True)
cvModel = crossval.fit(train)

# predict the labels of test data
res_test = cvModel.bestModel.transform(test)

# convert to an RDD of (prediction, label) pairs and compute the metrics
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# average cross-validation metric paired with the parameter map that produced it
print(list(zip(cvModel.avgMetrics, paramGrid)))

In [None]:
# Coefficients of the first (and only) pipeline stage of the best model.
# This relies on `cvModel` still being the LinearSVC result, i.e. the cell must
# run before the later classifier cells reassign `cvModel`.
best_first_stage = cvModel.bestModel.stages[0]
coef = best_first_stage.coefficients
coef

In [None]:
# Hand the coefficient array and the index -> prov_type map to the helper.
# NOTE(review): the meaning of the third positional argument (True) is defined
# in `functions` and is not visible here - confirm before changing it.
coef_values = coef.toArray()
most_important_features(coef_values, reverse_index_map, True)

In [None]:
# Random forest classifier
# The seed makes tree construction repeatable across runs.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=123456)

pipeline = Pipeline(stages=[rf])
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [5, 10, 20, 30])
    .addGrid(rf.maxDepth, [3, 4, 5, 6])
    .build()
)
# train the model and select the best model using "metricName"(hyperparameter tuning)
# The seed fixes the fold assignment so the reported metrics are reproducible;
# it matches the seed used for the split and the oversampling.
# NOTE(review): collectSubModels=True keeps every per-fold model in memory and
# nothing in this notebook reads them - consider dropping it.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    numFolds=10,
    seed=123456,
    collectSubModels=True)
cvModel = crossval.fit(train)

# predict the labels of test data
res_test = cvModel.bestModel.transform(test)

# convert to an RDD of (prediction, label) pairs and compute the metrics
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# average cross-validation metric paired with the parameter map that produced it
print(list(zip(cvModel.avgMetrics, paramGrid)))

In [None]:
# Gradient-boosted trees classifier
# The seed makes the boosted ensemble repeatable across runs.
gbt = GBTClassifier(labelCol="label", featuresCol="features", seed=123456)

pipeline = Pipeline(stages=[gbt])
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxIter, [5, 10])
    .addGrid(gbt.maxDepth, [3, 4, 5, 6])
    .build()
)
# train the model and select the best model using "metricName"(hyperparameter tuning)
# The seed fixes the fold assignment so the reported metrics are reproducible;
# it matches the seed used for the split and the oversampling.
# NOTE(review): collectSubModels=True keeps every per-fold model in memory and
# nothing in this notebook reads them - consider dropping it.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    numFolds=10,
    seed=123456,
    collectSubModels=True)
cvModel = crossval.fit(train)

# predict the labels of test data
res_test = cvModel.bestModel.transform(test)

# convert to an RDD of (prediction, label) pairs and compute the metrics
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# average cross-validation metric paired with the parameter map that produced it
print(list(zip(cvModel.avgMetrics, paramGrid)))

In [None]:
# Logistic regression classifier
lr = LogisticRegression(labelCol="label", featuresCol="features")
pipeline = Pipeline(stages=[lr])
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.maxIter, [5, 10])
    .addGrid(lr.regParam, [0.2, 0.3, 0.4])
    .addGrid(lr.elasticNetParam, [0.6, 0.7, 0.8])
    .build()
)
# train the model and select the best model using "metricName"(hyperparameter tuning)
# The seed fixes the fold assignment so the reported metrics are reproducible;
# it matches the seed used for the split and the oversampling.
# NOTE(review): collectSubModels=True keeps every per-fold model in memory and
# nothing in this notebook reads them - consider dropping it.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    numFolds=10,
    seed=123456,
    collectSubModels=True)
cvModel = crossval.fit(train)

# predict the labels of test data
res_test = cvModel.bestModel.transform(test)

# convert to an RDD of (prediction, label) pairs and compute the metrics
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# average cross-validation metric paired with the parameter map that produced it
print(list(zip(cvModel.avgMetrics, paramGrid)))