In [1]:
# PySpark 
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
# functions
from functions import *
import time
# ML
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC, RandomForestClassifier, GBTClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.feature import StandardScaler


In [2]:
# Spark setup. Master "local[*,20]" runs locally on all cores; per the Spark
# master-URL format local[N,F] the second number is the allowed task failures.
conf = SparkConf().setAppName("spark").setMaster("local[*,20]").set("spark.driver.memory", "10g")
# getOrCreate keeps this cell re-runnable: constructing a second SparkContext in
# the same kernel raises "Cannot run multiple SparkContexts at once".
# Note: if a context already exists it is returned as-is and `conf` is ignored.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

# Encoding options forwarded to json_to_encoding(); `forward` also selects
# type_generate vs. type_generate_R inside giao().
iri = False
forward = False
qualified_names = {
    "xsd:QName",
    "prov:QUALIFIED_NAME"
}

# Label text found in the CSV -> numeric class, kept as strings because the
# replacement happens on a string column before the cast to double.
label_map = {
    "Trusted": "1.0",
    "Uncertain": "0.0"
}

# NOTE(review): absolute, machine-specific paths; consider a configurable base directory.
json_folder = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/*.json"
label_csv = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/graphs.csv"

22/04/06 01:06:37 WARN Utils: Your hostname, cuiyeshuaideMacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.31.122 instead (on interface en0)
22/04/06 01:06:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/06 01:06:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
def _rebalance_by_label(frame, labels, pick_target, with_replacement, seed=None):
    """Resample ``frame`` so each label in ``labels`` has roughly the same row count.

    Args:
        frame: DataFrame with a numeric ``label`` column.
        labels: label values (floats) to balance.
        pick_target: ``max`` to oversample up to the largest class, ``min`` to
            downsample to the smallest one.
        with_replacement: forwarded to ``DataFrame.sample``.
        seed: optional sampling seed; ``None`` lets Spark choose one.

    Returns:
        The union of the per-label frames; a class already at the target size
        is kept untouched, the others are sampled with ``target / count``.

    Raises:
        ValueError: if a label has no rows (its ratio would divide by zero).
    """
    per_label = {lbl: frame.filter(frame['label'] == lbl) for lbl in labels}
    counts = {lbl: part.count() for lbl, part in per_label.items()}
    empty = [lbl for lbl in labels if counts[lbl] == 0]
    if empty:
        raise ValueError("no rows for label(s) {} - cannot rebalance".format(empty))
    target = pick_target(counts.values())
    ratio = {lbl: target / counts[lbl] for lbl in labels}
    balanced = None
    for lbl in labels:
        part = per_label[lbl]
        if counts[lbl] != target:
            part = part.sample(withReplacement=with_replacement, fraction=ratio[lbl], seed=seed)
        balanced = part if balanced is None else balanced.union(part)
    # Same diagnostics as before: the sampling ratios, then the resulting class sizes.
    print(ratio)
    for lbl in labels:
        print(balanced.filter(balanced['label'] == lbl).count())
    return balanced


def _tune_and_report(estimator, param_grid, train, test, num_classes):
    """Grid-search ``estimator`` with 10-fold CV on ``train`` and report on ``test``.

    Prints, in order: test accuracy, the confusion matrix, the average CV
    metric paired with each parameter map, and the elapsed seconds.
    """
    started = time.time()
    if num_classes > 2 and isinstance(estimator, (LinearSVC, GBTClassifier)):
        # LinearSVC and GBTClassifier only support binary labels; reduce the
        # multi-class problem to one-vs-rest so three-class label maps work.
        from pyspark.ml.classification import OneVsRest
        estimator = OneVsRest(classifier=estimator, labelCol="label", featuresCol="features")
    # Sub-models are not collected: nothing here reads them and they only hold memory.
    crossval = CrossValidator(
        estimator=Pipeline(stages=[estimator]),
        estimatorParamMaps=param_grid,
        evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
        numFolds=10)
    cv_model = crossval.fit(train)
    scored = cv_model.bestModel.transform(test)
    pairs = scored.select("prediction", "label").rdd.map(lambda row: (row[0], row[1]))
    metrics = MulticlassMetrics(pairs)
    print(metrics.accuracy)
    print(metrics.confusionMatrix().toArray())
    print(list(zip(cv_model.avgMetrics, param_grid)))
    print(time.time() - started)


def giao(seed=None):
    """Run one full experiment: featurise the graphs, then tune and test four classifiers.

    Reads these notebook globals: ``spark``, ``json_folder``, ``label_csv``,
    ``label_map``, ``iri``, ``forward``, ``qualified_names``, ``level``,
    ``specific_types_node`` and ``specific_types_edge``. The helpers
    ``json_to_encoding``, ``type_generate``/``type_generate_R``,
    ``count_prov_types`` and ``sparse_matrix`` are presumably supplied by the
    ``from functions import *`` star import.

    Steps: encode every JSON file, count its provenance types, build one
    feature vector per file, scale, join with the labels, split 80/20,
    oversample the training split and downsample the test split, then
    cross-validate LinearSVC, random forest, GBT and logistic regression.

    Args:
        seed: optional seed for the split and the resampling. The default
            ``None`` keeps the original, non-reproducible behaviour.

    Raises:
        ValueError: if a label from ``label_map`` is absent from a split.
    """
    file_and_path_rdd = spark.sparkContext.wholeTextFiles(json_folder)
    # (file_name, graph encoding)
    encoding_rdd = file_and_path_rdd.map(lambda x: (x[0].split("/")[-1], json_to_encoding(x[1],iri,forward,qualified_names)))
    # (file_name, prov_types of nodes)
    if forward:
        types_rdd = encoding_rdd.map(lambda x: (x[0], type_generate(x[1], level, specific_types_node, specific_types_edge)))
    else:
        types_rdd = encoding_rdd.map(lambda x: (x[0], type_generate_R(x[1], level, specific_types_node, specific_types_edge)))
    # (file_name, prov_type occurrences in the graph)
    types_count_rdd = types_rdd.map(lambda x: (x[0], count_prov_types(level,x[1])))
    # All distinct prov_types in this collection of graphs
    all_types = types_count_rdd.flatMap(lambda x: x[1].keys()).distinct().collect()
    types_count = len(all_types)
    print(types_count)
    # prov_type -> feature index
    index_map = {prov_type: idx for idx, prov_type in enumerate(all_types)}
    # Construct one feature vector per graph
    sparse_matrix_rdd = types_count_rdd.map(lambda x: (x[0], sparse_matrix(x[1], types_count, index_map)))
    feature_vector_rdd = sparse_matrix_rdd.map(lambda x: (x[0],Vectors.dense(x[1])))
    df_features = spark.createDataFrame(feature_vector_rdd).withColumnRenamed("_1", "file").withColumnRenamed("_2", "features")
    # Standardize features.
    # NOTE(review): the scaler is fitted on all rows, before the train/test split.
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
    df_features = scaler.fit(df_features).transform(df_features)
    # Map the textual labels to numeric classes
    df_labels = spark.read.csv(label_csv, header=True)
    df_labels = df_labels.replace(label_map, subset=["label"])
    df_labels = df_labels.withColumn("label", df_labels["label"].cast(DoubleType()))
    # Join the features and labels. Cached so the split, the resampling and the
    # cross-validation folds do not each re-parse the JSON files.
    df = df_features.join(df_labels, df_features.file == df_labels.graph_file).select(df_features.scaledFeatures, df_labels.label).withColumnRenamed("scaledFeatures", "features").cache()
    cached = [df]
    try:
        train_raw, test_raw = df.randomSplit([0.8, 0.2], seed=seed)
        labels = [float(x) for x in label_map.values()]
        num_classes = len(labels)
        # Oversample the training data up to the largest class
        train = _rebalance_by_label(train_raw, labels, max, True, seed).cache()
        cached.append(train)
        # Downsample the testing data to the smallest class
        test = _rebalance_by_label(test_raw, labels, min, False, seed).cache()
        cached.append(test)

        svc = LinearSVC(maxIter = 100, threshold=0.0)
        svc_grid = ParamGridBuilder().addGrid(svc.regParam, [0.001,0.0001,0.00001]).addGrid(svc.maxIter, [100, 200]).build()
        _tune_and_report(svc, svc_grid, train, test, num_classes)

        rf = RandomForestClassifier(labelCol="label", featuresCol="features")
        rf_grid = ParamGridBuilder().addGrid(rf.numTrees, [5,10,20,30]).addGrid(rf.maxDepth, [3,4,5,6]).build()
        _tune_and_report(rf, rf_grid, train, test, num_classes)

        gbt = GBTClassifier(labelCol="label", featuresCol="features")
        gbt_grid = ParamGridBuilder().addGrid(gbt.maxIter, [5,10]).addGrid(gbt.maxDepth, [3,4,5,6]).build()
        _tune_and_report(gbt, gbt_grid, train, test, num_classes)

        lr = LogisticRegression(labelCol="label", featuresCol="features")
        lr_grid = ParamGridBuilder().addGrid(lr.maxIter, [5,10]).addGrid(lr.regParam, [0.2,0.3,0.4]).addGrid(lr.elasticNetParam, [0.6,0.7,0.8]).build()
        _tune_and_report(lr, lr_grid, train, test, num_classes)
    finally:
        # Release the cached frames so repeated calls do not pile up in memory.
        for frame in cached:
            frame.unpersist()

In [4]:
# giao() run: level=4, specific edge types=True, specific node types=True.
specific_types_edge, specific_types_node, level = True, True, 4
giao()

In [None]:
# giao() run: level=4, specific edge types=False, specific node types=True.
specific_types_edge, specific_types_node, level = False, True, 4
giao()

In [None]:
# giao() run: level=4, specific edge types=True, specific node types=False.
specific_types_edge, specific_types_node, level = True, False, 4
giao()

In [None]:
# giao() run: level=4, specific edge types=False, specific node types=False.
specific_types_edge, specific_types_node, level = False, False, 4
giao()

In [None]:
# giao() run: level=5, specific edge types=True, specific node types=True.
specific_types_edge, specific_types_node, level = True, True, 5
giao()

In [None]:
# giao() run: level=5, specific edge types=False, specific node types=True.
specific_types_edge, specific_types_node, level = False, True, 5
giao()

In [None]:
# giao() run: level=5, specific edge types=True, specific node types=False.
specific_types_edge, specific_types_node, level = True, False, 5
giao()

In [None]:
# giao() run: level=5, specific edge types=False, specific node types=False.
specific_types_edge, specific_types_node, level = False, False, 5
giao()

In [None]:
# giao() run: level=6, specific edge types=True, specific node types=True.
specific_types_edge, specific_types_node, level = True, True, 6
giao()

In [None]:
# giao() run: level=6, specific edge types=False, specific node types=True.
specific_types_edge, specific_types_node, level = False, True, 6
giao()

In [None]:
# giao() run: level=6, specific edge types=True, specific node types=False.
specific_types_edge, specific_types_node, level = True, False, 6
giao()

In [None]:
# giao() run: level=6, specific edge types=False, specific node types=False.
specific_types_edge, specific_types_node, level = False, False, 6
giao()

In [None]:
# giao() run: level=2, specific edge types=True, specific node types=True.
specific_types_edge, specific_types_node, level = True, True, 2
giao()

In [None]:
# giao() run: level=2, specific edge types=False, specific node types=True.
specific_types_edge, specific_types_node, level = False, True, 2
giao()

In [None]:
# giao() run: level=2, specific edge types=True, specific node types=False.
specific_types_edge, specific_types_node, level = True, False, 2
giao()

In [None]:
# giao() run: level=2, specific edge types=False, specific node types=False.
specific_types_edge, specific_types_node, level = False, False, 2
giao()

In [None]:
# giao() run: level=1, specific edge types=True, specific node types=True.
specific_types_edge, specific_types_node, level = True, True, 1
giao()

In [None]:
# giao() run: level=1, specific edge types=False, specific node types=True.
specific_types_edge, specific_types_node, level = False, True, 1
giao()

In [None]:
# giao() run: level=1, specific edge types=True, specific node types=False.
specific_types_edge, specific_types_node, level = True, False, 1
giao()

In [None]:
# giao() run: level=1, specific edge types=False, specific node types=False.
specific_types_edge, specific_types_node, level = False, False, 1
giao()

In [None]:
# giao() run: level=0, specific edge types=True, specific node types=True.
specific_types_edge, specific_types_node, level = True, True, 0
giao()

In [None]:
# giao() run: level=0, specific edge types=False, specific node types=True.
specific_types_edge, specific_types_node, level = False, True, 0
giao()

In [None]:
# giao() run: level=0, specific edge types=True, specific node types=False.
specific_types_edge, specific_types_node, level = True, False, 0
giao()

In [None]:
# giao() run: level=0, specific edge types=False, specific node types=False.
specific_types_edge, specific_types_node, level = False, False, 0
giao()

In [None]:
# Point the dataset globals at PG-T and switch to its three string-encoded classes.
label_map = dict(Valor="0.0", Instinct="1.0", Mystic="2.0")
label_csv = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/PG-T/graphs.csv"
json_folder = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/PG-T/*.json"

In [None]:
# giao() run: level=1, specific edge types=True, specific node types=True.
specific_types_edge, specific_types_node, level = True, True, 1
giao()

In [None]:
# giao() run: level=1, specific edge types=False, specific node types=True.
specific_types_edge, specific_types_node, level = False, True, 1
giao()

In [None]:
# giao() run: level=1, specific edge types=True, specific node types=False.
specific_types_edge, specific_types_node, level = True, False, 1
giao()

In [None]:
# giao() run: level=1, specific edge types=False, specific node types=False.
specific_types_edge, specific_types_node, level = False, False, 1
giao()

In [None]:
# giao() run: level=2, specific edge types=True, specific node types=True.
specific_types_edge, specific_types_node, level = True, True, 2
giao()

In [None]:
# giao() run: level=2, specific edge types=False, specific node types=True.
specific_types_edge, specific_types_node, level = False, True, 2
giao()

In [None]:
# giao() run: level=2, specific edge types=True, specific node types=False.
specific_types_edge, specific_types_node, level = True, False, 2
giao()

In [None]:
# giao() run: level=2, specific edge types=False, specific node types=False.
specific_types_edge, specific_types_node, level = False, False, 2
giao()

In [None]:
# giao() run: level=3, specific edge types=True, specific node types=True.
specific_types_edge, specific_types_node, level = True, True, 3
giao()

In [None]:
# giao() run: level=3, specific edge types=False, specific node types=True.
specific_types_edge, specific_types_node, level = False, True, 3
giao()

In [None]:
# giao() run: level=3, specific edge types=True, specific node types=False.
# This cell previously repeated edge=False/node=True, so the edge-only
# configuration that every other level sweeps was never run for level 3.
specific_types_edge = True
specific_types_node = False
level = 3
giao()

In [None]:
# giao() run: level=3, specific edge types=False, specific node types=False.
specific_types_edge, specific_types_node, level = False, False, 3
giao()

In [None]:
# giao() run: level=4, specific edge types=True, specific node types=True.
specific_types_edge, specific_types_node, level = True, True, 4
giao()

In [None]:
# giao() run: level=4, specific edge types=False, specific node types=True.
specific_types_edge, specific_types_node, level = False, True, 4
giao()

In [None]:
# giao() run: level=4, specific edge types=True, specific node types=False.
# This cell previously repeated edge=False/node=True, so the edge-only
# configuration that every other level sweeps was never run for level 4.
specific_types_edge = True
specific_types_node = False
level = 4
giao()

In [None]:
# giao() run: level=4, specific edge types=False, specific node types=False.
specific_types_edge, specific_types_node, level = False, False, 4
giao()

In [None]:
# giao() run: level=5, specific edge types=True, specific node types=True.
specific_types_edge, specific_types_node, level = True, True, 5
giao()

In [None]:
# giao() run: level=5, specific edge types=False, specific node types=True.
specific_types_edge, specific_types_node, level = False, True, 5
giao()

In [None]:
# giao() run: level=5, specific edge types=True, specific node types=False.
specific_types_edge, specific_types_node, level = True, False, 5
giao()

In [None]:
# giao() run: level=5, specific edge types=False, specific node types=False.
specific_types_edge, specific_types_node, level = False, False, 5
giao()

In [None]:
# giao() run: level=6, specific edge types=True, specific node types=True.
specific_types_edge, specific_types_node, level = True, True, 6
giao()

In [None]:
# giao() run: level=6, specific edge types=False, specific node types=True.
specific_types_edge, specific_types_node, level = False, True, 6
giao()

In [None]:
# giao() run: level=6, specific edge types=True, specific node types=False.
specific_types_edge, specific_types_node, level = True, False, 6
giao()

In [None]:
# giao() run: level=6, specific edge types=False, specific node types=False.
specific_types_edge, specific_types_node, level = False, False, 6
giao()

In [None]:
# Stand-alone experiment cell: CM-Buildings dataset, level 4, generic edge
# types, specific node types. It resets every configuration global, rebuilds
# the feature matrix, splits with a fixed seed (123456), oversamples the
# training split only (the test split is left as drawn) and then tunes
# LinearSVC, random forest and GBT with 10-fold cross-validation.
# Every name assigned here is a kernel global and stays visible to later cells.
json_folder = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/*.json"
label_csv = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/graphs.csv"
specific_types_edge = False
specific_types_node = True
level = 4
iri = False
forward = False
qualified_names = {
    "xsd:QName",
    "prov:QUALIFIED_NAME"
}
# CSV label text -> numeric class, as strings (replaced first, cast to double later)
label_map = {
    "Trusted": "1.0",
    "Uncertain": "0.0"
}
# Alternative three-class label map, left disabled:
# label_map = {
#     "Valor": 0.0,
#     "Instinct": 1.0,
#     "Mystic": 2.0
# }
# spark.sparkContext.addPyFile("functions.py")
# Load the data into an RDD of (file_path, json_data(string))
file_and_path_rdd = spark.sparkContext.wholeTextFiles(json_folder)
# (file_name, ProvDocument)
# NOTE(review): document_rdd is never read in this cell and, being lazy, never
# evaluated; ProvDocument is presumably provided by `from functions import *`.
document_rdd = file_and_path_rdd.map(lambda x: (x[0].split("/")[-1], (ProvDocument.deserialize(content=x[1]))))
# (file_name, graph encoding built directly from the JSON text)
#encoding_rdd = document_rdd.map(lambda x: (x[0], document_to_encoding(x[1],iri,forward)))
encoding_rdd = file_and_path_rdd.map(lambda x: (x[0].split("/")[-1], json_to_encoding(x[1],iri,forward,qualified_names)))
# (file_name, prov_types of nodes); `forward` selects the type generator
if forward:
    types_rdd = encoding_rdd.map(lambda x: (x[0], type_generate(x[1], level, specific_types_node, specific_types_edge)))
    # types_rdd = encoding_rdd.map(lambda x: (x[0], type_generate_mixed(x[1], level, specific_types_node, specific_types_edge)))
else:
    types_rdd = encoding_rdd.map(lambda x: (x[0], type_generate_R(x[1], level, specific_types_node, specific_types_edge)))
# (file_name, prov_type occurrences in the graph)
types_count_rdd = types_rdd.map(lambda x: (x[0], count_prov_types(level,x[1])))
# All prov_types in this collection of graphs
all_types = types_count_rdd.flatMap(lambda x: x[1].keys()).distinct().collect()
# Number of distinct prov_types
types_count = len(all_types)
# index_map for prov_types, prov_type -> index
index_map = {all_types[i]: i for i in range(types_count)}
# index -> prov_type (not read again in this cell)
reverse_index_map = {i: all_types[i] for i in range(types_count)}
# Construct feature vectors for each graph
sparse_matrix_rdd = types_count_rdd.map(lambda x: (x[0], sparse_matrix(x[1], types_count, index_map)))
feature_vector_rdd = sparse_matrix_rdd.map(lambda x: (x[0],Vectors.dense(x[1])))
df_features = spark.createDataFrame(feature_vector_rdd).withColumnRenamed("_1", "file").withColumnRenamed("_2", "features")
# Standardize features (the scaler is fitted on all rows, before the split)
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(df_features)
df_features = scaler_model.transform(df_features)
# Map the textual labels to numeric classes
df_labels = spark.read.csv(label_csv, header=True)
df_labels = df_labels.replace(label_map, subset=["label"])
df_labels = df_labels.withColumn("label", df_labels["label"].cast(DoubleType()))
# Join the features and labels on file name; expose the scaled vector as "features"
df = df_features.join(df_labels, df_features.file == df_labels.graph_file).select(df_features.scaledFeatures, df_labels.label).withColumnRenamed("scaledFeatures", "features")
# Split the data into training and testing (fixed seed)
train, test = df.randomSplit([0.8, 0.2], seed = 123456)
# Oversample the training data up to the size of its largest class
labels = [float(x) for x in label_map.values()]
count = {}
for x in labels:
    count[x] = train.filter(train['label'] == x).count()
maxValue = max(count.values())
ratio = {}
for x in labels:
    ratio[x] = maxValue/count[x]
dataframes = []
for x in labels:
    if(count[x] == maxValue):
        dataframes.append(train.filter(train['label'] == x))
    else:
        dataframes.append(train.filter(train['label'] == x).sample(withReplacement=True, fraction=ratio[x], seed=123456))
train = dataframes[0]
for dataframe in dataframes[1:]:
    train = train.union(dataframe)
# Diagnostics: sampling ratios, then the class sizes after oversampling
print(ratio)
for x in labels:
    print(train.filter(train['label'] == x).count())
# LinearSVC classifier
svc = LinearSVC(maxIter = 100, threshold=0.0)

pipeline = Pipeline(stages=[svc])
paramGrid = ParamGridBuilder().addGrid(svc.regParam, [1, 0.1, 0.01,0.001,0.0001,0.00001]).addGrid(svc.maxIter, [100, 500]).build()
# train the model and select the best model using "metricName"(hyperparameter tuning)
crossval = CrossValidator(
    estimator=pipeline, 
    estimatorParamMaps=paramGrid, 
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"), 
    numFolds=10,
    collectSubModels=True)
cvModel = crossval.fit(train)

# predict the labels of test data
res_test = cvModel.bestModel.transform(test)

# convert to an RDD of (prediction, label) pairs and compute the metrics
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# average cross-validation accuracy paired with each parameter map
print(list(zip(cvModel.avgMetrics, paramGrid)))
# Random forest classifier
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

pipeline = Pipeline(stages=[rf])
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [5,10,20,30]).addGrid(rf.maxDepth, [3,4,5,6]).build()
# train the model and select the best model using "metricName"(hyperparameter tuning)
crossval = CrossValidator(
    estimator=pipeline, 
    estimatorParamMaps=paramGrid, 
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"), 
    numFolds=10,
    collectSubModels=True)
cvModel = crossval.fit(train)

# predict the labels of test data
res_test = cvModel.bestModel.transform(test)

# convert to an RDD of (prediction, label) pairs and compute the metrics
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
print(list(zip(cvModel.avgMetrics, paramGrid)))
# Gradient-boosted trees classifier
gbt = GBTClassifier(labelCol="label", featuresCol="features")
# NOTE(review): this default-parameter fit looks redundant - res_test is
# reassigned from the cross-validated model below before it is read. Confirm
# no later cell uses gbt_model before removing it.
gbt_model = gbt.fit(train)
res_test = gbt_model.transform(test)

pipeline = Pipeline(stages=[gbt])
paramGrid = ParamGridBuilder().addGrid(gbt.maxIter, [5,10]).addGrid(gbt.maxDepth, [3,4,5,6]).build()
# train the model and select the best model using "metricName"(hyperparameter tuning)
crossval = CrossValidator(
    estimator=pipeline, 
    estimatorParamMaps=paramGrid, 
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"), 
    numFolds=10,
    collectSubModels=True)
cvModel = crossval.fit(train)

# predict the labels of test data
res_test = cvModel.bestModel.transform(test)

# convert to an RDD of (prediction, label) pairs and compute the metrics
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
print(list(zip(cvModel.avgMetrics, paramGrid)))

In [None]:
# Stand-alone experiment cell: CM-Buildings dataset, level 4, generic edge
# types and generic node types. It resets every configuration global, rebuilds
# the feature matrix, splits with a fixed seed (123456), oversamples the
# training split only (the test split is left as drawn) and then tunes
# LinearSVC, random forest and GBT with 10-fold cross-validation.
# Every name assigned here is a kernel global and stays visible to later cells.
json_folder = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/*.json"
label_csv = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/graphs.csv"
specific_types_edge = False
specific_types_node = False
level = 4
iri = False
forward = False
qualified_names = {
    "xsd:QName",
    "prov:QUALIFIED_NAME"
}
# CSV label text -> numeric class, as strings (replaced first, cast to double later)
label_map = {
    "Trusted": "1.0",
    "Uncertain": "0.0"
}
# Alternative three-class label map, left disabled:
# label_map = {
#     "Valor": 0.0,
#     "Instinct": 1.0,
#     "Mystic": 2.0
# }
# spark.sparkContext.addPyFile("functions.py")
# Load the data into an RDD of (file_path, json_data(string))
file_and_path_rdd = spark.sparkContext.wholeTextFiles(json_folder)
# (file_name, ProvDocument)
# NOTE(review): document_rdd is never read in this cell and, being lazy, never
# evaluated; ProvDocument is presumably provided by `from functions import *`.
document_rdd = file_and_path_rdd.map(lambda x: (x[0].split("/")[-1], (ProvDocument.deserialize(content=x[1]))))
# (file_name, graph encoding built directly from the JSON text)
#encoding_rdd = document_rdd.map(lambda x: (x[0], document_to_encoding(x[1],iri,forward)))
encoding_rdd = file_and_path_rdd.map(lambda x: (x[0].split("/")[-1], json_to_encoding(x[1],iri,forward,qualified_names)))
# (file_name, prov_types of nodes); `forward` selects the type generator
if forward:
    types_rdd = encoding_rdd.map(lambda x: (x[0], type_generate(x[1], level, specific_types_node, specific_types_edge)))
    # types_rdd = encoding_rdd.map(lambda x: (x[0], type_generate_mixed(x[1], level, specific_types_node, specific_types_edge)))
else:
    types_rdd = encoding_rdd.map(lambda x: (x[0], type_generate_R(x[1], level, specific_types_node, specific_types_edge)))
# (file_name, prov_type occurrences in the graph)
types_count_rdd = types_rdd.map(lambda x: (x[0], count_prov_types(level,x[1])))
# All prov_types in this collection of graphs
all_types = types_count_rdd.flatMap(lambda x: x[1].keys()).distinct().collect()
# Number of distinct prov_types
types_count = len(all_types)
# index_map for prov_types, prov_type -> index
index_map = {all_types[i]: i for i in range(types_count)}
# index -> prov_type (not read again in this cell)
reverse_index_map = {i: all_types[i] for i in range(types_count)}
# Construct feature vectors for each graph
sparse_matrix_rdd = types_count_rdd.map(lambda x: (x[0], sparse_matrix(x[1], types_count, index_map)))
feature_vector_rdd = sparse_matrix_rdd.map(lambda x: (x[0],Vectors.dense(x[1])))
df_features = spark.createDataFrame(feature_vector_rdd).withColumnRenamed("_1", "file").withColumnRenamed("_2", "features")
# Standardize features (the scaler is fitted on all rows, before the split)
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(df_features)
df_features = scaler_model.transform(df_features)
# Map the textual labels to numeric classes
df_labels = spark.read.csv(label_csv, header=True)
df_labels = df_labels.replace(label_map, subset=["label"])
df_labels = df_labels.withColumn("label", df_labels["label"].cast(DoubleType()))
# Join the features and labels on file name; expose the scaled vector as "features"
df = df_features.join(df_labels, df_features.file == df_labels.graph_file).select(df_features.scaledFeatures, df_labels.label).withColumnRenamed("scaledFeatures", "features")
# Split the data into training and testing (fixed seed)
train, test = df.randomSplit([0.8, 0.2], seed = 123456)
# Oversample the training data up to the size of its largest class
labels = [float(x) for x in label_map.values()]
count = {}
for x in labels:
    count[x] = train.filter(train['label'] == x).count()
maxValue = max(count.values())
ratio = {}
for x in labels:
    ratio[x] = maxValue/count[x]
dataframes = []
for x in labels:
    if(count[x] == maxValue):
        dataframes.append(train.filter(train['label'] == x))
    else:
        dataframes.append(train.filter(train['label'] == x).sample(withReplacement=True, fraction=ratio[x], seed=123456))
train = dataframes[0]
for dataframe in dataframes[1:]:
    train = train.union(dataframe)
# Diagnostics: sampling ratios, then the class sizes after oversampling
print(ratio)
for x in labels:
    print(train.filter(train['label'] == x).count())
# LinearSVC classifier
svc = LinearSVC(maxIter = 100, threshold=0.0)

pipeline = Pipeline(stages=[svc])
paramGrid = ParamGridBuilder().addGrid(svc.regParam, [1, 0.1, 0.01,0.001,0.0001,0.00001]).addGrid(svc.maxIter, [100, 500]).build()
# train the model and select the best model using "metricName"(hyperparameter tuning)
crossval = CrossValidator(
    estimator=pipeline, 
    estimatorParamMaps=paramGrid, 
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"), 
    numFolds=10,
    collectSubModels=True)
cvModel = crossval.fit(train)

# predict the labels of test data
res_test = cvModel.bestModel.transform(test)

# convert to an RDD of (prediction, label) pairs and compute the metrics
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# average cross-validation accuracy paired with each parameter map
print(list(zip(cvModel.avgMetrics, paramGrid)))
# Random forest classifier
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

pipeline = Pipeline(stages=[rf])
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [5,10,20,30]).addGrid(rf.maxDepth, [3,4,5,6]).build()
# train the model and select the best model using "metricName"(hyperparameter tuning)
crossval = CrossValidator(
    estimator=pipeline, 
    estimatorParamMaps=paramGrid, 
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"), 
    numFolds=10,
    collectSubModels=True)
cvModel = crossval.fit(train)

# predict the labels of test data
res_test = cvModel.bestModel.transform(test)

# convert to an RDD of (prediction, label) pairs and compute the metrics
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
print(list(zip(cvModel.avgMetrics, paramGrid)))
# Gradient-boosted trees classifier
gbt = GBTClassifier(labelCol="label", featuresCol="features")
# NOTE(review): this default-parameter fit looks redundant - res_test is
# reassigned from the cross-validated model below before it is read. Confirm
# no later cell uses gbt_model before removing it.
gbt_model = gbt.fit(train)
res_test = gbt_model.transform(test)

pipeline = Pipeline(stages=[gbt])
paramGrid = ParamGridBuilder().addGrid(gbt.maxIter, [5,10]).addGrid(gbt.maxDepth, [3,4,5,6]).build()
# train the model and select the best model using "metricName"(hyperparameter tuning)
crossval = CrossValidator(
    estimator=pipeline, 
    estimatorParamMaps=paramGrid, 
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"), 
    numFolds=10,
    collectSubModels=True)
cvModel = crossval.fit(train)

# predict the labels of test data
res_test = cvModel.bestModel.transform(test)

# convert to an RDD of (prediction, label) pairs and compute the metrics
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
print(list(zip(cvModel.avgMetrics, paramGrid)))

In [None]:
# ---- Run configuration ----
json_folder = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/*.json"
label_csv = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/graphs.csv"
specific_types_edge = True
specific_types_node = True
level = 5
iri = False
forward = False
qualified_names = {"xsd:QName", "prov:QUALIFIED_NAME"}
# CSV label text -> numeric class, still as text; cast to double further down.
label_map = {"Trusted": "1.0", "Uncertain": "0.0"}

# ---- Provenance JSON -> prov_type counts per graph ----
# One (path, file content) pair per file matched by the glob.
file_and_path_rdd = spark.sparkContext.wholeTextFiles(json_folder)
# (file_name, ProvDocument). Nothing else in this block reads this RDD.
document_rdd = file_and_path_rdd.map(
    lambda path_text: (path_text[0].split("/")[-1], ProvDocument.deserialize(content=path_text[1]))
)
# (file_name, encoding built by json_to_encoding from the raw JSON text)
encoding_rdd = file_and_path_rdd.map(
    lambda path_text: (
        path_text[0].split("/")[-1],
        json_to_encoding(path_text[1], iri, forward, qualified_names),
    )
)
# (file_name, prov_types of nodes); `forward` selects the generator function.
if not forward:
    types_rdd = encoding_rdd.mapValues(
        lambda encoding: type_generate_R(encoding, level, specific_types_node, specific_types_edge)
    )
else:
    types_rdd = encoding_rdd.mapValues(
        lambda encoding: type_generate(encoding, level, specific_types_node, specific_types_edge)
    )
# (file_name, occurrences of each prov_type in that graph)
types_count_rdd = types_rdd.mapValues(lambda prov_types: count_prov_types(level, prov_types))

# ---- Shared vocabulary of prov_types ----
# Every distinct prov_type key found across the collection, pulled to the driver.
all_types = types_count_rdd.flatMap(lambda name_counts: name_counts[1].keys()).distinct().collect()
types_count = len(all_types)
# prov_type -> index, and the inverse lookup index -> prov_type.
index_map = {prov_type: position for position, prov_type in enumerate(all_types)}
reverse_index_map = dict(enumerate(all_types))

# ---- Feature vectors ----
# (file_name, output of sparse_matrix for the graph's counts and the shared index)
sparse_matrix_rdd = types_count_rdd.mapValues(
    lambda counts: sparse_matrix(counts, types_count, index_map)
)
# (file_name, dense ML vector)
feature_vector_rdd = sparse_matrix_rdd.mapValues(lambda values: Vectors.dense(values))
df_features = spark.createDataFrame(feature_vector_rdd).toDF("file", "features")

# Standardize the features with StandardScaler's default settings.
# NOTE(review): the scaler is fitted on the full collection, i.e. before any
# train/test split - confirm this is intended.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(df_features)
df_features = scaler_model.transform(df_features)

# ---- Labels ----
# Read the CSV with a header row, push the `label` column through label_map,
# then cast it to double.
df_labels = spark.read.csv(label_csv, header=True).replace(label_map, subset=["label"])
df_labels = df_labels.withColumn("label", df_labels["label"].cast(DoubleType()))

# ---- Features + labels ----
# Match on file name; keep only the scaled vector (renamed to `features`) and the label.
df = (
    df_features
    .join(df_labels, df_features["file"] == df_labels["graph_file"])
    .select(df_features["scaledFeatures"], df_labels["label"])
    .withColumnRenamed("scaledFeatures", "features")
)
# Split the data into training and testing (seeded, 80/20 weights).
train, test = df.randomSplit([0.8, 0.2], seed = 123456)
# Cache both splits. Spark evaluates lazily, so without this every count(),
# fit() and transform() that follows re-runs the whole pipeline behind `df`.
train = train.cache()
test = test.cache()

# Oversample the data: classes smaller than the largest one are resampled
# with replacement, using largest_count / class_count as the fraction.
labels = [float(x) for x in label_map.values()]
# Training rows per class.
count = {x: train.filter(train['label'] == x).count() for x in labels}
# An empty class cannot be oversampled; fail with a clear message instead of
# the ZeroDivisionError the ratio computation below would otherwise raise.
empty_labels = [x for x in labels if count[x] == 0]
if empty_labels:
    raise ValueError("No training rows for label(s) {}; cannot oversample".format(empty_labels))
maxValue = max(count.values())
# Sampling fraction per class (1.0 for the largest class).
ratio = {x: maxValue / count[x] for x in labels}
dataframes = []
for x in labels:
    subset = train.filter(train['label'] == x)
    if count[x] == maxValue:
        # Already the largest class: keep it unchanged.
        dataframes.append(subset)
    else:
        dataframes.append(subset.sample(withReplacement=True, fraction=ratio[x], seed=123456))
# Stitch the per-class frames back into a single training set.
train = dataframes[0]
for dataframe in dataframes[1:]:
    train = train.union(dataframe)
# Report the fractions used and the resulting class sizes.
print(ratio)
for x in labels:
    print(train.filter(train['label'] == x).count())
# Linear SVM: 10-fold cross-validated grid search over regularisation and
# iteration cap, then evaluation of the selected model on the held-out split.
svc = LinearSVC(threshold=0.0, maxIter=100)
pipeline = Pipeline(stages=[svc])

# Grid: regParam x maxIter (6 x 2 = 12 settings).
paramGrid = (
    ParamGridBuilder()
    .addGrid(svc.regParam, [1, 0.1, 0.01, 0.001, 0.0001, 0.00001])
    .addGrid(svc.maxIter, [100, 500])
    .build()
)

# Model selection is driven by accuracy.
crossval = CrossValidator(
    numFolds=10,
    collectSubModels=True,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    estimatorParamMaps=paramGrid,
    estimator=pipeline,
)
cvModel = crossval.fit(train)

# Score the held-out split with the selected pipeline.
res_test = cvModel.bestModel.transform(test)

# (prediction, label) pairs as an RDD, the input MulticlassMetrics takes.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Average cross-validation accuracy paired with its grid point.
print(list(zip(cvModel.avgMetrics, paramGrid)))
# Random forest: 10-fold cross-validated grid search over forest size and
# tree depth, then evaluation of the selected model on the held-out split.
rf = RandomForestClassifier(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[rf])

# Grid: numTrees x maxDepth (4 x 4 = 16 settings).
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [5, 10, 20, 30])
    .addGrid(rf.maxDepth, [3, 4, 5, 6])
    .build()
)

# Model selection is driven by accuracy.
crossval = CrossValidator(
    numFolds=10,
    collectSubModels=True,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    estimatorParamMaps=paramGrid,
    estimator=pipeline,
)
cvModel = crossval.fit(train)

# Score the held-out split with the selected pipeline.
res_test = cvModel.bestModel.transform(test)

# (prediction, label) pairs as an RDD, the input MulticlassMetrics takes.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Average cross-validation accuracy paired with its grid point.
print(list(zip(cvModel.avgMetrics, paramGrid)))
# Gradient-boosted trees: 10-fold cross-validated grid search, then an
# evaluation of the selected model on the held-out split.
gbt = GBTClassifier(labelCol="label", featuresCol="features")
# No separate default-parameter fit is run here: its predictions would be
# overwritten by the cross-validated ones below before anything read them,
# so it would only add an extra training job.

pipeline = Pipeline(stages=[gbt])
# Grid: maxIter x maxDepth (2 x 4 = 8 settings).
paramGrid = ParamGridBuilder().addGrid(gbt.maxIter, [5,10]).addGrid(gbt.maxDepth, [3,4,5,6]).build()
# train the model and select the best model using "metricName"(hyperparameter tuning)
crossval = CrossValidator(
    estimator=pipeline, 
    estimatorParamMaps=paramGrid, 
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"), 
    numFolds=10,
    collectSubModels=True)
cvModel = crossval.fit(train)

# predict the labels of test data
res_test = cvModel.bestModel.transform(test)

# (prediction, label) pairs as an RDD, the input MulticlassMetrics takes
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# average cross-validation accuracy paired with its grid point
print(list(zip(cvModel.avgMetrics, paramGrid)))

In [None]:
# ---- Run configuration ----
json_folder = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/*.json"
label_csv = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/graphs.csv"
specific_types_edge = True
specific_types_node = False
level = 5
iri = False
forward = False
qualified_names = {"xsd:QName", "prov:QUALIFIED_NAME"}
# CSV label text -> numeric class, still as text; cast to double further down.
label_map = {"Trusted": "1.0", "Uncertain": "0.0"}

# ---- Provenance JSON -> prov_type counts per graph ----
# One (path, file content) pair per file matched by the glob.
file_and_path_rdd = spark.sparkContext.wholeTextFiles(json_folder)
# (file_name, ProvDocument). Nothing else in this block reads this RDD.
document_rdd = file_and_path_rdd.map(
    lambda path_text: (path_text[0].split("/")[-1], ProvDocument.deserialize(content=path_text[1]))
)
# (file_name, encoding built by json_to_encoding from the raw JSON text)
encoding_rdd = file_and_path_rdd.map(
    lambda path_text: (
        path_text[0].split("/")[-1],
        json_to_encoding(path_text[1], iri, forward, qualified_names),
    )
)
# (file_name, prov_types of nodes); `forward` selects the generator function.
if not forward:
    types_rdd = encoding_rdd.mapValues(
        lambda encoding: type_generate_R(encoding, level, specific_types_node, specific_types_edge)
    )
else:
    types_rdd = encoding_rdd.mapValues(
        lambda encoding: type_generate(encoding, level, specific_types_node, specific_types_edge)
    )
# (file_name, occurrences of each prov_type in that graph)
types_count_rdd = types_rdd.mapValues(lambda prov_types: count_prov_types(level, prov_types))

# ---- Shared vocabulary of prov_types ----
# Every distinct prov_type key found across the collection, pulled to the driver.
all_types = types_count_rdd.flatMap(lambda name_counts: name_counts[1].keys()).distinct().collect()
types_count = len(all_types)
# Show the vocabulary size for this configuration.
print(types_count)
# prov_type -> index, and the inverse lookup index -> prov_type.
index_map = {prov_type: position for position, prov_type in enumerate(all_types)}
reverse_index_map = dict(enumerate(all_types))

# ---- Feature vectors ----
# (file_name, output of sparse_matrix for the graph's counts and the shared index)
sparse_matrix_rdd = types_count_rdd.mapValues(
    lambda counts: sparse_matrix(counts, types_count, index_map)
)
# (file_name, dense ML vector)
feature_vector_rdd = sparse_matrix_rdd.mapValues(lambda values: Vectors.dense(values))
df_features = spark.createDataFrame(feature_vector_rdd).toDF("file", "features")

# Standardize the features with StandardScaler's default settings.
# NOTE(review): the scaler is fitted on the full collection, i.e. before any
# train/test split - confirm this is intended.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(df_features)
df_features = scaler_model.transform(df_features)

# ---- Labels ----
# Read the CSV with a header row, push the `label` column through label_map,
# then cast it to double.
df_labels = spark.read.csv(label_csv, header=True).replace(label_map, subset=["label"])
df_labels = df_labels.withColumn("label", df_labels["label"].cast(DoubleType()))

# ---- Features + labels ----
# Match on file name; keep only the scaled vector (renamed to `features`) and the label.
df = (
    df_features
    .join(df_labels, df_features["file"] == df_labels["graph_file"])
    .select(df_features["scaledFeatures"], df_labels["label"])
    .withColumnRenamed("scaledFeatures", "features")
)
# Split the data into training and testing (seeded, 80/20 weights).
train, test = df.randomSplit([0.8, 0.2], seed = 123456)
# Cache both splits. Spark evaluates lazily, so without this every count(),
# fit() and transform() that follows re-runs the whole pipeline behind `df`.
train = train.cache()
test = test.cache()

# Oversample the data: classes smaller than the largest one are resampled
# with replacement, using largest_count / class_count as the fraction.
labels = [float(x) for x in label_map.values()]
# Training rows per class.
count = {x: train.filter(train['label'] == x).count() for x in labels}
# An empty class cannot be oversampled; fail with a clear message instead of
# the ZeroDivisionError the ratio computation below would otherwise raise.
empty_labels = [x for x in labels if count[x] == 0]
if empty_labels:
    raise ValueError("No training rows for label(s) {}; cannot oversample".format(empty_labels))
maxValue = max(count.values())
# Sampling fraction per class (1.0 for the largest class).
ratio = {x: maxValue / count[x] for x in labels}
dataframes = []
for x in labels:
    subset = train.filter(train['label'] == x)
    if count[x] == maxValue:
        # Already the largest class: keep it unchanged.
        dataframes.append(subset)
    else:
        dataframes.append(subset.sample(withReplacement=True, fraction=ratio[x], seed=123456))
# Stitch the per-class frames back into a single training set.
train = dataframes[0]
for dataframe in dataframes[1:]:
    train = train.union(dataframe)
# Report the fractions used and the resulting class sizes.
print(ratio)
for x in labels:
    print(train.filter(train['label'] == x).count())
# Linear SVM: 10-fold cross-validated grid search over regularisation and
# iteration cap, then evaluation of the selected model on the held-out split.
svc = LinearSVC(threshold=0.0, maxIter=100)
pipeline = Pipeline(stages=[svc])

# Grid: regParam x maxIter (6 x 2 = 12 settings).
paramGrid = (
    ParamGridBuilder()
    .addGrid(svc.regParam, [1, 0.1, 0.01, 0.001, 0.0001, 0.00001])
    .addGrid(svc.maxIter, [100, 500])
    .build()
)

# Model selection is driven by accuracy.
crossval = CrossValidator(
    numFolds=10,
    collectSubModels=True,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    estimatorParamMaps=paramGrid,
    estimator=pipeline,
)
cvModel = crossval.fit(train)

# Score the held-out split with the selected pipeline.
res_test = cvModel.bestModel.transform(test)

# (prediction, label) pairs as an RDD, the input MulticlassMetrics takes.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Average cross-validation accuracy paired with its grid point.
print(list(zip(cvModel.avgMetrics, paramGrid)))
# Random forest: 10-fold cross-validated grid search over forest size and
# tree depth, then evaluation of the selected model on the held-out split.
rf = RandomForestClassifier(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[rf])

# Grid: numTrees x maxDepth (4 x 4 = 16 settings).
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [5, 10, 20, 30])
    .addGrid(rf.maxDepth, [3, 4, 5, 6])
    .build()
)

# Model selection is driven by accuracy.
crossval = CrossValidator(
    numFolds=10,
    collectSubModels=True,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    estimatorParamMaps=paramGrid,
    estimator=pipeline,
)
cvModel = crossval.fit(train)

# Score the held-out split with the selected pipeline.
res_test = cvModel.bestModel.transform(test)

# (prediction, label) pairs as an RDD, the input MulticlassMetrics takes.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Average cross-validation accuracy paired with its grid point.
print(list(zip(cvModel.avgMetrics, paramGrid)))
# Gradient-boosted trees: 10-fold cross-validated grid search, then an
# evaluation of the selected model on the held-out split.
gbt = GBTClassifier(labelCol="label", featuresCol="features")
# No separate default-parameter fit is run here: its predictions would be
# overwritten by the cross-validated ones below before anything read them,
# so it would only add an extra training job.

pipeline = Pipeline(stages=[gbt])
# Grid: maxIter x maxDepth (2 x 4 = 8 settings).
paramGrid = ParamGridBuilder().addGrid(gbt.maxIter, [5,10]).addGrid(gbt.maxDepth, [3,4,5,6]).build()
# train the model and select the best model using "metricName"(hyperparameter tuning)
crossval = CrossValidator(
    estimator=pipeline, 
    estimatorParamMaps=paramGrid, 
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"), 
    numFolds=10,
    collectSubModels=True)
cvModel = crossval.fit(train)

# predict the labels of test data
res_test = cvModel.bestModel.transform(test)

# (prediction, label) pairs as an RDD, the input MulticlassMetrics takes
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# average cross-validation accuracy paired with its grid point
print(list(zip(cvModel.avgMetrics, paramGrid)))

In [None]:
# ---- Run configuration ----
json_folder = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/*.json"
label_csv = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/graphs.csv"
specific_types_edge = False
specific_types_node = True
level = 5
iri = False
forward = False
qualified_names = {"xsd:QName", "prov:QUALIFIED_NAME"}
# CSV label text -> numeric class, still as text; cast to double further down.
label_map = {"Trusted": "1.0", "Uncertain": "0.0"}

# ---- Provenance JSON -> prov_type counts per graph ----
# One (path, file content) pair per file matched by the glob.
file_and_path_rdd = spark.sparkContext.wholeTextFiles(json_folder)
# (file_name, ProvDocument). Nothing else in this block reads this RDD.
document_rdd = file_and_path_rdd.map(
    lambda path_text: (path_text[0].split("/")[-1], ProvDocument.deserialize(content=path_text[1]))
)
# (file_name, encoding built by json_to_encoding from the raw JSON text)
encoding_rdd = file_and_path_rdd.map(
    lambda path_text: (
        path_text[0].split("/")[-1],
        json_to_encoding(path_text[1], iri, forward, qualified_names),
    )
)
# (file_name, prov_types of nodes); `forward` selects the generator function.
if not forward:
    types_rdd = encoding_rdd.mapValues(
        lambda encoding: type_generate_R(encoding, level, specific_types_node, specific_types_edge)
    )
else:
    types_rdd = encoding_rdd.mapValues(
        lambda encoding: type_generate(encoding, level, specific_types_node, specific_types_edge)
    )
# (file_name, occurrences of each prov_type in that graph)
types_count_rdd = types_rdd.mapValues(lambda prov_types: count_prov_types(level, prov_types))

# ---- Shared vocabulary of prov_types ----
# Every distinct prov_type key found across the collection, pulled to the driver.
all_types = types_count_rdd.flatMap(lambda name_counts: name_counts[1].keys()).distinct().collect()
types_count = len(all_types)
# prov_type -> index, and the inverse lookup index -> prov_type.
index_map = {prov_type: position for position, prov_type in enumerate(all_types)}
reverse_index_map = dict(enumerate(all_types))

# ---- Feature vectors ----
# (file_name, output of sparse_matrix for the graph's counts and the shared index)
sparse_matrix_rdd = types_count_rdd.mapValues(
    lambda counts: sparse_matrix(counts, types_count, index_map)
)
# (file_name, dense ML vector)
feature_vector_rdd = sparse_matrix_rdd.mapValues(lambda values: Vectors.dense(values))
df_features = spark.createDataFrame(feature_vector_rdd).toDF("file", "features")

# Standardize the features with StandardScaler's default settings.
# NOTE(review): the scaler is fitted on the full collection, i.e. before any
# train/test split - confirm this is intended.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(df_features)
df_features = scaler_model.transform(df_features)

# ---- Labels ----
# Read the CSV with a header row, push the `label` column through label_map,
# then cast it to double.
df_labels = spark.read.csv(label_csv, header=True).replace(label_map, subset=["label"])
df_labels = df_labels.withColumn("label", df_labels["label"].cast(DoubleType()))

# ---- Features + labels ----
# Match on file name; keep only the scaled vector (renamed to `features`) and the label.
df = (
    df_features
    .join(df_labels, df_features["file"] == df_labels["graph_file"])
    .select(df_features["scaledFeatures"], df_labels["label"])
    .withColumnRenamed("scaledFeatures", "features")
)
# Split the data into training and testing (seeded, 80/20 weights).
train, test = df.randomSplit([0.8, 0.2], seed = 123456)
# Cache both splits. Spark evaluates lazily, so without this every count(),
# fit() and transform() that follows re-runs the whole pipeline behind `df`.
train = train.cache()
test = test.cache()

# Oversample the data: classes smaller than the largest one are resampled
# with replacement, using largest_count / class_count as the fraction.
labels = [float(x) for x in label_map.values()]
# Training rows per class.
count = {x: train.filter(train['label'] == x).count() for x in labels}
# An empty class cannot be oversampled; fail with a clear message instead of
# the ZeroDivisionError the ratio computation below would otherwise raise.
empty_labels = [x for x in labels if count[x] == 0]
if empty_labels:
    raise ValueError("No training rows for label(s) {}; cannot oversample".format(empty_labels))
maxValue = max(count.values())
# Sampling fraction per class (1.0 for the largest class).
ratio = {x: maxValue / count[x] for x in labels}
dataframes = []
for x in labels:
    subset = train.filter(train['label'] == x)
    if count[x] == maxValue:
        # Already the largest class: keep it unchanged.
        dataframes.append(subset)
    else:
        dataframes.append(subset.sample(withReplacement=True, fraction=ratio[x], seed=123456))
# Stitch the per-class frames back into a single training set.
train = dataframes[0]
for dataframe in dataframes[1:]:
    train = train.union(dataframe)
# Report the fractions used and the resulting class sizes.
print(ratio)
for x in labels:
    print(train.filter(train['label'] == x).count())
# Linear SVM: 10-fold cross-validated grid search over regularisation and
# iteration cap, then evaluation of the selected model on the held-out split.
svc = LinearSVC(threshold=0.0, maxIter=100)
pipeline = Pipeline(stages=[svc])

# Grid: regParam x maxIter (6 x 2 = 12 settings).
paramGrid = (
    ParamGridBuilder()
    .addGrid(svc.regParam, [1, 0.1, 0.01, 0.001, 0.0001, 0.00001])
    .addGrid(svc.maxIter, [100, 500])
    .build()
)

# Model selection is driven by accuracy.
crossval = CrossValidator(
    numFolds=10,
    collectSubModels=True,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    estimatorParamMaps=paramGrid,
    estimator=pipeline,
)
cvModel = crossval.fit(train)

# Score the held-out split with the selected pipeline.
res_test = cvModel.bestModel.transform(test)

# (prediction, label) pairs as an RDD, the input MulticlassMetrics takes.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Average cross-validation accuracy paired with its grid point.
print(list(zip(cvModel.avgMetrics, paramGrid)))
# Random forest: 10-fold cross-validated grid search over forest size and
# tree depth, then evaluation of the selected model on the held-out split.
rf = RandomForestClassifier(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[rf])

# Grid: numTrees x maxDepth (4 x 4 = 16 settings).
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [5, 10, 20, 30])
    .addGrid(rf.maxDepth, [3, 4, 5, 6])
    .build()
)

# Model selection is driven by accuracy.
crossval = CrossValidator(
    numFolds=10,
    collectSubModels=True,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    estimatorParamMaps=paramGrid,
    estimator=pipeline,
)
cvModel = crossval.fit(train)

# Score the held-out split with the selected pipeline.
res_test = cvModel.bestModel.transform(test)

# (prediction, label) pairs as an RDD, the input MulticlassMetrics takes.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Average cross-validation accuracy paired with its grid point.
print(list(zip(cvModel.avgMetrics, paramGrid)))
# Gradient-boosted trees: 10-fold cross-validated grid search, then an
# evaluation of the selected model on the held-out split.
gbt = GBTClassifier(labelCol="label", featuresCol="features")
# No separate default-parameter fit is run here: its predictions would be
# overwritten by the cross-validated ones below before anything read them,
# so it would only add an extra training job.

pipeline = Pipeline(stages=[gbt])
# Grid: maxIter x maxDepth (2 x 4 = 8 settings).
paramGrid = ParamGridBuilder().addGrid(gbt.maxIter, [5,10]).addGrid(gbt.maxDepth, [3,4,5,6]).build()
# train the model and select the best model using "metricName"(hyperparameter tuning)
crossval = CrossValidator(
    estimator=pipeline, 
    estimatorParamMaps=paramGrid, 
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"), 
    numFolds=10,
    collectSubModels=True)
cvModel = crossval.fit(train)

# predict the labels of test data
res_test = cvModel.bestModel.transform(test)

# (prediction, label) pairs as an RDD, the input MulticlassMetrics takes
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# average cross-validation accuracy paired with its grid point
print(list(zip(cvModel.avgMetrics, paramGrid)))

In [None]:
# ---- Run configuration ----
json_folder = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/*.json"
label_csv = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/graphs.csv"
specific_types_edge = False
specific_types_node = False
level = 5
iri = False
forward = False
qualified_names = {"xsd:QName", "prov:QUALIFIED_NAME"}
# CSV label text -> numeric class, still as text; cast to double further down.
label_map = {"Trusted": "1.0", "Uncertain": "0.0"}

# ---- Provenance JSON -> prov_type counts per graph ----
# One (path, file content) pair per file matched by the glob.
file_and_path_rdd = spark.sparkContext.wholeTextFiles(json_folder)
# (file_name, ProvDocument). Nothing else in this block reads this RDD.
document_rdd = file_and_path_rdd.map(
    lambda path_text: (path_text[0].split("/")[-1], ProvDocument.deserialize(content=path_text[1]))
)
# (file_name, encoding built by json_to_encoding from the raw JSON text)
encoding_rdd = file_and_path_rdd.map(
    lambda path_text: (
        path_text[0].split("/")[-1],
        json_to_encoding(path_text[1], iri, forward, qualified_names),
    )
)
# (file_name, prov_types of nodes); `forward` selects the generator function.
if not forward:
    types_rdd = encoding_rdd.mapValues(
        lambda encoding: type_generate_R(encoding, level, specific_types_node, specific_types_edge)
    )
else:
    types_rdd = encoding_rdd.mapValues(
        lambda encoding: type_generate(encoding, level, specific_types_node, specific_types_edge)
    )
# (file_name, occurrences of each prov_type in that graph)
types_count_rdd = types_rdd.mapValues(lambda prov_types: count_prov_types(level, prov_types))

# ---- Shared vocabulary of prov_types ----
# Every distinct prov_type key found across the collection, pulled to the driver.
all_types = types_count_rdd.flatMap(lambda name_counts: name_counts[1].keys()).distinct().collect()
types_count = len(all_types)
# prov_type -> index, and the inverse lookup index -> prov_type.
index_map = {prov_type: position for position, prov_type in enumerate(all_types)}
reverse_index_map = dict(enumerate(all_types))

# ---- Feature vectors ----
# (file_name, output of sparse_matrix for the graph's counts and the shared index)
sparse_matrix_rdd = types_count_rdd.mapValues(
    lambda counts: sparse_matrix(counts, types_count, index_map)
)
# (file_name, dense ML vector)
feature_vector_rdd = sparse_matrix_rdd.mapValues(lambda values: Vectors.dense(values))
df_features = spark.createDataFrame(feature_vector_rdd).toDF("file", "features")

# Standardize the features with StandardScaler's default settings.
# NOTE(review): the scaler is fitted on the full collection, i.e. before any
# train/test split - confirm this is intended.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(df_features)
df_features = scaler_model.transform(df_features)

# ---- Labels ----
# Read the CSV with a header row, push the `label` column through label_map,
# then cast it to double.
df_labels = spark.read.csv(label_csv, header=True).replace(label_map, subset=["label"])
df_labels = df_labels.withColumn("label", df_labels["label"].cast(DoubleType()))

# ---- Features + labels ----
# Match on file name; keep only the scaled vector (renamed to `features`) and the label.
df = (
    df_features
    .join(df_labels, df_features["file"] == df_labels["graph_file"])
    .select(df_features["scaledFeatures"], df_labels["label"])
    .withColumnRenamed("scaledFeatures", "features")
)
# Split the data into training and testing (seeded, 80/20 weights).
train, test = df.randomSplit([0.8, 0.2], seed = 123456)
# Cache both splits. Spark evaluates lazily, so without this every count(),
# fit() and transform() that follows re-runs the whole pipeline behind `df`.
train = train.cache()
test = test.cache()

# Oversample the data: classes smaller than the largest one are resampled
# with replacement, using largest_count / class_count as the fraction.
labels = [float(x) for x in label_map.values()]
# Training rows per class.
count = {x: train.filter(train['label'] == x).count() for x in labels}
# An empty class cannot be oversampled; fail with a clear message instead of
# the ZeroDivisionError the ratio computation below would otherwise raise.
empty_labels = [x for x in labels if count[x] == 0]
if empty_labels:
    raise ValueError("No training rows for label(s) {}; cannot oversample".format(empty_labels))
maxValue = max(count.values())
# Sampling fraction per class (1.0 for the largest class).
ratio = {x: maxValue / count[x] for x in labels}
dataframes = []
for x in labels:
    subset = train.filter(train['label'] == x)
    if count[x] == maxValue:
        # Already the largest class: keep it unchanged.
        dataframes.append(subset)
    else:
        dataframes.append(subset.sample(withReplacement=True, fraction=ratio[x], seed=123456))
# Stitch the per-class frames back into a single training set.
train = dataframes[0]
for dataframe in dataframes[1:]:
    train = train.union(dataframe)
# Report the fractions used and the resulting class sizes.
print(ratio)
for x in labels:
    print(train.filter(train['label'] == x).count())
# Linear SVM: 10-fold cross-validated grid search over regularisation and
# iteration cap, then evaluation of the selected model on the held-out split.
svc = LinearSVC(threshold=0.0, maxIter=100)
pipeline = Pipeline(stages=[svc])

# Grid: regParam x maxIter (6 x 2 = 12 settings).
paramGrid = (
    ParamGridBuilder()
    .addGrid(svc.regParam, [1, 0.1, 0.01, 0.001, 0.0001, 0.00001])
    .addGrid(svc.maxIter, [100, 500])
    .build()
)

# Model selection is driven by accuracy.
crossval = CrossValidator(
    numFolds=10,
    collectSubModels=True,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    estimatorParamMaps=paramGrid,
    estimator=pipeline,
)
cvModel = crossval.fit(train)

# Score the held-out split with the selected pipeline.
res_test = cvModel.bestModel.transform(test)

# (prediction, label) pairs as an RDD, the input MulticlassMetrics takes.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Average cross-validation accuracy paired with its grid point.
print(list(zip(cvModel.avgMetrics, paramGrid)))
# Random forest: 10-fold cross-validated grid search over forest size and
# tree depth, then evaluation of the selected model on the held-out split.
rf = RandomForestClassifier(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[rf])

# Grid: numTrees x maxDepth (4 x 4 = 16 settings).
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [5, 10, 20, 30])
    .addGrid(rf.maxDepth, [3, 4, 5, 6])
    .build()
)

# Model selection is driven by accuracy.
crossval = CrossValidator(
    numFolds=10,
    collectSubModels=True,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    estimatorParamMaps=paramGrid,
    estimator=pipeline,
)
cvModel = crossval.fit(train)

# Score the held-out split with the selected pipeline.
res_test = cvModel.bestModel.transform(test)

# (prediction, label) pairs as an RDD, the input MulticlassMetrics takes.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Average cross-validation accuracy paired with its grid point.
print(list(zip(cvModel.avgMetrics, paramGrid)))
# Gradient-boosted trees: 10-fold cross-validated grid search, then an
# evaluation of the selected model on the held-out split.
gbt = GBTClassifier(labelCol="label", featuresCol="features")
# No separate default-parameter fit is run here: its predictions would be
# overwritten by the cross-validated ones below before anything read them,
# so it would only add an extra training job.

pipeline = Pipeline(stages=[gbt])
# Grid: maxIter x maxDepth (2 x 4 = 8 settings).
paramGrid = ParamGridBuilder().addGrid(gbt.maxIter, [5,10]).addGrid(gbt.maxDepth, [3,4,5,6]).build()
# train the model and select the best model using "metricName"(hyperparameter tuning)
crossval = CrossValidator(
    estimator=pipeline, 
    estimatorParamMaps=paramGrid, 
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"), 
    numFolds=10,
    collectSubModels=True)
cvModel = crossval.fit(train)

# predict the labels of test data
res_test = cvModel.bestModel.transform(test)

# (prediction, label) pairs as an RDD, the input MulticlassMetrics takes
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# average cross-validation accuracy paired with its grid point
print(list(zip(cvModel.avgMetrics, paramGrid)))

In [None]:
# ---- Run configuration ----
json_folder = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/*.json"
label_csv = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/graphs.csv"
specific_types_edge = True
specific_types_node = True
level = 6
iri = False
forward = False
qualified_names = {"xsd:QName", "prov:QUALIFIED_NAME"}
# CSV label text -> numeric class, still as text; cast to double further down.
label_map = {"Trusted": "1.0", "Uncertain": "0.0"}

# ---- Provenance JSON -> prov_type counts per graph ----
# One (path, file content) pair per file matched by the glob.
file_and_path_rdd = spark.sparkContext.wholeTextFiles(json_folder)
# (file_name, ProvDocument). Nothing else in this block reads this RDD.
document_rdd = file_and_path_rdd.map(
    lambda path_text: (path_text[0].split("/")[-1], ProvDocument.deserialize(content=path_text[1]))
)
# (file_name, encoding built by json_to_encoding from the raw JSON text)
encoding_rdd = file_and_path_rdd.map(
    lambda path_text: (
        path_text[0].split("/")[-1],
        json_to_encoding(path_text[1], iri, forward, qualified_names),
    )
)
# (file_name, prov_types of nodes); `forward` selects the generator function.
if not forward:
    types_rdd = encoding_rdd.mapValues(
        lambda encoding: type_generate_R(encoding, level, specific_types_node, specific_types_edge)
    )
else:
    types_rdd = encoding_rdd.mapValues(
        lambda encoding: type_generate(encoding, level, specific_types_node, specific_types_edge)
    )
# (file_name, occurrences of each prov_type in that graph)
types_count_rdd = types_rdd.mapValues(lambda prov_types: count_prov_types(level, prov_types))

# ---- Shared vocabulary of prov_types ----
# Every distinct prov_type key found across the collection, pulled to the driver.
all_types = types_count_rdd.flatMap(lambda name_counts: name_counts[1].keys()).distinct().collect()
types_count = len(all_types)
# prov_type -> index, and the inverse lookup index -> prov_type.
index_map = {prov_type: position for position, prov_type in enumerate(all_types)}
reverse_index_map = dict(enumerate(all_types))

# ---- Feature vectors ----
# (file_name, output of sparse_matrix for the graph's counts and the shared index)
sparse_matrix_rdd = types_count_rdd.mapValues(
    lambda counts: sparse_matrix(counts, types_count, index_map)
)
# (file_name, dense ML vector)
feature_vector_rdd = sparse_matrix_rdd.mapValues(lambda values: Vectors.dense(values))
df_features = spark.createDataFrame(feature_vector_rdd).toDF("file", "features")

# Standardize the features with StandardScaler's default settings.
# NOTE(review): the scaler is fitted on the full collection, i.e. before any
# train/test split - confirm this is intended.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(df_features)
df_features = scaler_model.transform(df_features)

# ---- Labels ----
# Read the CSV with a header row, push the `label` column through label_map,
# then cast it to double.
df_labels = spark.read.csv(label_csv, header=True).replace(label_map, subset=["label"])
df_labels = df_labels.withColumn("label", df_labels["label"].cast(DoubleType()))

# ---- Features + labels ----
# Match on file name; keep only the scaled vector (renamed to `features`) and the label.
df = (
    df_features
    .join(df_labels, df_features["file"] == df_labels["graph_file"])
    .select(df_features["scaledFeatures"], df_labels["label"])
    .withColumnRenamed("scaledFeatures", "features")
)
# Split the data into training and testing (seeded, 80/20 weights).
train, test = df.randomSplit([0.8, 0.2], seed = 123456)
# Cache both splits. Spark evaluates lazily, so without this every count(),
# fit() and transform() that follows re-runs the whole pipeline behind `df`.
train = train.cache()
test = test.cache()

# Oversample the data: classes smaller than the largest one are resampled
# with replacement, using largest_count / class_count as the fraction.
labels = [float(x) for x in label_map.values()]
# Training rows per class.
count = {x: train.filter(train['label'] == x).count() for x in labels}
# An empty class cannot be oversampled; fail with a clear message instead of
# the ZeroDivisionError the ratio computation below would otherwise raise.
empty_labels = [x for x in labels if count[x] == 0]
if empty_labels:
    raise ValueError("No training rows for label(s) {}; cannot oversample".format(empty_labels))
maxValue = max(count.values())
# Sampling fraction per class (1.0 for the largest class).
ratio = {x: maxValue / count[x] for x in labels}
dataframes = []
for x in labels:
    subset = train.filter(train['label'] == x)
    if count[x] == maxValue:
        # Already the largest class: keep it unchanged.
        dataframes.append(subset)
    else:
        dataframes.append(subset.sample(withReplacement=True, fraction=ratio[x], seed=123456))
# Stitch the per-class frames back into a single training set.
train = dataframes[0]
for dataframe in dataframes[1:]:
    train = train.union(dataframe)
# Report the fractions used and the resulting class sizes.
print(ratio)
for x in labels:
    print(train.filter(train['label'] == x).count())
# Linear SVM: 10-fold cross-validated grid search over regularisation and
# iteration cap, then evaluation of the selected model on the held-out split.
svc = LinearSVC(threshold=0.0, maxIter=100)
pipeline = Pipeline(stages=[svc])

# Grid: regParam x maxIter (6 x 2 = 12 settings).
paramGrid = (
    ParamGridBuilder()
    .addGrid(svc.regParam, [1, 0.1, 0.01, 0.001, 0.0001, 0.00001])
    .addGrid(svc.maxIter, [100, 500])
    .build()
)

# Model selection is driven by accuracy.
crossval = CrossValidator(
    numFolds=10,
    collectSubModels=True,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    estimatorParamMaps=paramGrid,
    estimator=pipeline,
)
cvModel = crossval.fit(train)

# Score the held-out split with the selected pipeline.
res_test = cvModel.bestModel.transform(test)

# (prediction, label) pairs as an RDD, the input MulticlassMetrics takes.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Average cross-validation accuracy paired with its grid point.
print(list(zip(cvModel.avgMetrics, paramGrid)))
# Random forest: 10-fold cross-validated grid search over forest size and
# tree depth, then evaluation of the selected model on the held-out split.
rf = RandomForestClassifier(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[rf])

# Grid: numTrees x maxDepth (4 x 4 = 16 settings).
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [5, 10, 20, 30])
    .addGrid(rf.maxDepth, [3, 4, 5, 6])
    .build()
)

# Model selection is driven by accuracy.
crossval = CrossValidator(
    numFolds=10,
    collectSubModels=True,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    estimatorParamMaps=paramGrid,
    estimator=pipeline,
)
cvModel = crossval.fit(train)

# Score the held-out split with the selected pipeline.
res_test = cvModel.bestModel.transform(test)

# (prediction, label) pairs as an RDD, the input MulticlassMetrics takes.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Average cross-validation accuracy paired with its grid point.
print(list(zip(cvModel.avgMetrics, paramGrid)))
# Gradient-boosted trees: 10-fold cross-validated grid search, then an
# evaluation of the selected model on the held-out split.
gbt = GBTClassifier(labelCol="label", featuresCol="features")
# No separate default-parameter fit is run here: its predictions would be
# overwritten by the cross-validated ones below before anything read them,
# so it would only add an extra training job.

pipeline = Pipeline(stages=[gbt])
# Grid: maxIter x maxDepth (2 x 4 = 8 settings).
paramGrid = ParamGridBuilder().addGrid(gbt.maxIter, [5,10]).addGrid(gbt.maxDepth, [3,4,5,6]).build()
# train the model and select the best model using "metricName"(hyperparameter tuning)
crossval = CrossValidator(
    estimator=pipeline, 
    estimatorParamMaps=paramGrid, 
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"), 
    numFolds=10,
    collectSubModels=True)
cvModel = crossval.fit(train)

# predict the labels of test data
res_test = cvModel.bestModel.transform(test)

# (prediction, label) pairs as an RDD, the input MulticlassMetrics takes
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# average cross-validation accuracy paired with its grid point
print(list(zip(cvModel.avgMetrics, paramGrid)))

In [None]:
json_folder = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/*.json"
label_csv = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/graphs.csv"
specific_types_edge = True
specific_types_node = False
level = 6
iri = False
forward = False
qualified_names = {
    "xsd:QName",
    "prov:QUALIFIED_NAME"
}
label_map = {
    "Trusted": "1.0",
    "Uncertain": "0.0"
}
# label_map = {
#     "Valor": 0.0,
#     "Instinct": 1.0,
#     "Mystic": 2.0
# }
# Optional: spark.sparkContext.addPyFile("functions.py")

# One (file_path, json_text) record per file matched by json_folder.
file_and_path_rdd = spark.sparkContext.wholeTextFiles(json_folder)

# (file_name, ProvDocument). Not consumed by the steps below, which encode the
# raw JSON text directly.
document_rdd = file_and_path_rdd.map(
    lambda path_and_json: (
        path_and_json[0].split("/")[-1],
        ProvDocument.deserialize(content=path_and_json[1]),
    )
)

# (file_name, graph encoding), keyed by the last component of the file path.
# Alternative that goes through ProvDocument:
# encoding_rdd = document_rdd.map(lambda x: (x[0], document_to_encoding(x[1],iri,forward)))
encoding_rdd = file_and_path_rdd.map(
    lambda path_and_json: (
        path_and_json[0].split("/")[-1],
        json_to_encoding(path_and_json[1], iri, forward, qualified_names),
    )
)

# (file_name, prov_types of the nodes); `forward` selects the generator.
if not forward:
    types_rdd = encoding_rdd.map(
        lambda name_and_encoding: (
            name_and_encoding[0],
            type_generate_R(name_and_encoding[1], level, specific_types_node, specific_types_edge),
        )
    )
else:
    # Alternative generator with the same arguments: type_generate_mixed(...)
    types_rdd = encoding_rdd.map(
        lambda name_and_encoding: (
            name_and_encoding[0],
            type_generate(name_and_encoding[1], level, specific_types_node, specific_types_edge),
        )
    )

# (file_name, occurrences of each prov_type in that graph)
types_count_rdd = types_rdd.map(
    lambda name_and_types: (name_and_types[0], count_prov_types(level, name_and_types[1]))
)

# Every distinct prov_type found in this collection of graphs, collected to the driver.
all_types = (
    types_count_rdd
    .flatMap(lambda name_and_counts: name_and_counts[1].keys())
    .distinct()
    .collect()
)
types_count = len(all_types)
# prov_type -> index, and index -> prov_type.
index_map = {prov_type: position for position, prov_type in enumerate(all_types)}
reverse_index_map = dict(enumerate(all_types))

# Build one feature vector per graph and expose it as a (file, features) DataFrame.
sparse_matrix_rdd = types_count_rdd.map(
    lambda name_and_counts: (
        name_and_counts[0],
        sparse_matrix(name_and_counts[1], types_count, index_map),
    )
)
feature_vector_rdd = sparse_matrix_rdd.map(
    lambda name_and_row: (name_and_row[0], Vectors.dense(name_and_row[1]))
)
df_features = spark.createDataFrame(feature_vector_rdd).toDF("file", "features")
# Standardize the features.
# NOTE(review): the scaler is fitted on every graph in df_features, i.e. before
# the train/test split at the end of this section, so test rows contribute to
# the scaling statistics.
scaler = StandardScaler(outputCol="scaledFeatures", inputCol="features")
scaler_model = scaler.fit(df_features)
df_features = scaler_model.transform(df_features)

# Read the label CSV (first row is the header), swap the class names for the
# values in label_map, then make the column numeric.
df_labels = spark.read.csv(label_csv, header=True).replace(label_map, subset=["label"])
df_labels = df_labels.withColumn("label", df_labels["label"].cast("double"))

# Join features and labels on the file name; keep only the scaled vector
# (exposed as "features") and the label.
df = df_features.join(
    df_labels, df_features["file"] == df_labels["graph_file"]
).select(
    df_features["scaledFeatures"].alias("features"),
    df_labels["label"],
)

# 80/20 train/test split with a fixed seed.
train, test = df.randomSplit(weights=[0.8, 0.2], seed=123456)
# Oversample the training split so that every class has roughly as many rows
# as the largest one.
labels = [float(x) for x in label_map.values()]

# Keep the split in memory: without this, every action below and every
# cross-validation fold later on re-runs the whole lineage (JSON read, encoding,
# type generation, scaling) from scratch.
train = train.cache()

# Class sizes in a single aggregation instead of one Spark job per label.
observed = {row["label"]: row["count"] for row in train.groupBy("label").count().collect()}
count = {x: observed.get(x, 0) for x in labels}
missing = [x for x in labels if count[x] == 0]
if missing:
    # Previously this surfaced as a bare ZeroDivisionError in the ratio computation.
    raise ValueError(f"No training rows for label(s) {missing}; cannot oversample.")

maxValue = max(count.values())
# How many times each class has to be replicated (on average) to reach maxValue.
ratio = {x: maxValue / count[x] for x in labels}

dataframes = []
for x in labels:
    class_df = train.filter(train["label"] == x)
    if count[x] != maxValue:
        # Sample with replacement so a fraction above 1.0 can grow the class;
        # the fixed seed keeps the resampling repeatable.
        class_df = class_df.sample(withReplacement=True, fraction=ratio[x], seed=123456)
    dataframes.append(class_df)

train = dataframes[0]
for dataframe in dataframes[1:]:
    train = train.union(dataframe)
# The balanced set is what the cross-validators below iterate over repeatedly.
train = train.cache()

print(ratio)
# Class sizes after oversampling, printed in the order of `labels`.
balanced = {row["label"]: row["count"] for row in train.groupBy("label").count().collect()}
for x in labels:
    print(balanced.get(x, 0))
# LinearSVC: 10-fold cross-validated grid search on the training data, then
# evaluation of the selected model on the test split.
svc = LinearSVC(threshold=0.0, maxIter=100)
pipeline = Pipeline(stages=[svc])

# 6 regularisation strengths x 2 iteration limits = 12 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(svc.regParam, [1, 0.1, 0.01, 0.001, 0.0001, 0.00001])
    .addGrid(svc.maxIter, [100, 500])
    .build()
)

# Hyperparameter tuning: candidates are compared on accuracy.
crossval = CrossValidator(
    numFolds=10,
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    collectSubModels=True,
)
cvModel = crossval.fit(train)

# Predict the labels of the test data with the selected model.
res_test = cvModel.bestModel.transform(test)

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Cross-validation metric of every setting, paired with that setting.
print([(avg_metric, param_map) for avg_metric, param_map in zip(cvModel.avgMetrics, paramGrid)])
# Random forest: 10-fold cross-validated grid search on the training data, then
# evaluation of the selected model on the test split.
rf = RandomForestClassifier(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[rf])

# 4 forest sizes x 4 tree depths = 16 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [5, 10, 20, 30])
    .addGrid(rf.maxDepth, [3, 4, 5, 6])
    .build()
)

# Hyperparameter tuning: candidates are compared on accuracy.
crossval = CrossValidator(
    numFolds=10,
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    collectSubModels=True,
)
cvModel = crossval.fit(train)

# Predict the labels of the test data with the selected model.
res_test = cvModel.bestModel.transform(test)

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Cross-validation metric of every setting, paired with that setting.
print([(avg_metric, param_map) for avg_metric, param_map in zip(cvModel.avgMetrics, paramGrid)])
# Gradient-boosted trees: 10-fold cross-validated grid search, then evaluation
# on the held-out test split.
# The former stand-alone `gbt.fit(train)` / `transform(test)` pair was removed:
# it eagerly trained a default model whose predictions were overwritten by the
# cross-validated result below before anything read them.
gbt = GBTClassifier(labelCol="label", featuresCol="features")

pipeline = Pipeline(stages=[gbt])
# 2 boosting-iteration counts x 4 tree depths = 8 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxIter, [5, 10])
    .addGrid(gbt.maxDepth, [3, 4, 5, 6])
    .build()
)
# Train the model and select the best setting by accuracy (hyperparameter tuning).
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    numFolds=10,
    collectSubModels=True)
cvModel = crossval.fit(train)

# Predict the labels of the test data with the selected model.
res_test = cvModel.bestModel.transform(test)

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Cross-validation metric of every setting, paired with that setting.
print(list(zip(cvModel.avgMetrics, paramGrid)))

In [None]:
# Experiment configuration for this run. The paths, iri, forward, qualified_names
# and label_map repeat the values assigned in the session-setup cell.
label_csv = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/graphs.csv"
json_folder = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/*.json"
# Arguments passed to the type generator and to count_prov_types.
level = 6
specific_types_node = True
specific_types_edge = False
# Arguments passed to json_to_encoding; `forward` also selects
# type_generate (True) versus type_generate_R (False).
forward = False
iri = False
qualified_names = {"xsd:QName", "prov:QUALIFIED_NAME"}
# Class name -> label. The values are strings: they replace the class names in
# the "label" column, which is then cast to double.
label_map = {"Trusted": "1.0", "Uncertain": "0.0"}
# Alternative mapping for a three-class label set:
# label_map = {"Valor": 0.0, "Instinct": 1.0, "Mystic": 2.0}
# Optional: spark.sparkContext.addPyFile("functions.py")

# One (file_path, json_text) record per file matched by json_folder.
file_and_path_rdd = spark.sparkContext.wholeTextFiles(json_folder)

# (file_name, ProvDocument). Not consumed by the steps below, which encode the
# raw JSON text directly.
document_rdd = file_and_path_rdd.map(
    lambda path_and_json: (
        path_and_json[0].split("/")[-1],
        ProvDocument.deserialize(content=path_and_json[1]),
    )
)

# (file_name, graph encoding), keyed by the last component of the file path.
# Alternative that goes through ProvDocument:
# encoding_rdd = document_rdd.map(lambda x: (x[0], document_to_encoding(x[1],iri,forward)))
encoding_rdd = file_and_path_rdd.map(
    lambda path_and_json: (
        path_and_json[0].split("/")[-1],
        json_to_encoding(path_and_json[1], iri, forward, qualified_names),
    )
)

# (file_name, prov_types of the nodes); `forward` selects the generator.
if not forward:
    types_rdd = encoding_rdd.map(
        lambda name_and_encoding: (
            name_and_encoding[0],
            type_generate_R(name_and_encoding[1], level, specific_types_node, specific_types_edge),
        )
    )
else:
    # Alternative generator with the same arguments: type_generate_mixed(...)
    types_rdd = encoding_rdd.map(
        lambda name_and_encoding: (
            name_and_encoding[0],
            type_generate(name_and_encoding[1], level, specific_types_node, specific_types_edge),
        )
    )

# (file_name, occurrences of each prov_type in that graph)
types_count_rdd = types_rdd.map(
    lambda name_and_types: (name_and_types[0], count_prov_types(level, name_and_types[1]))
)

# Every distinct prov_type found in this collection of graphs, collected to the driver.
all_types = (
    types_count_rdd
    .flatMap(lambda name_and_counts: name_and_counts[1].keys())
    .distinct()
    .collect()
)
types_count = len(all_types)
# prov_type -> index, and index -> prov_type.
index_map = {prov_type: position for position, prov_type in enumerate(all_types)}
reverse_index_map = dict(enumerate(all_types))

# Build one feature vector per graph and expose it as a (file, features) DataFrame.
sparse_matrix_rdd = types_count_rdd.map(
    lambda name_and_counts: (
        name_and_counts[0],
        sparse_matrix(name_and_counts[1], types_count, index_map),
    )
)
feature_vector_rdd = sparse_matrix_rdd.map(
    lambda name_and_row: (name_and_row[0], Vectors.dense(name_and_row[1]))
)
df_features = spark.createDataFrame(feature_vector_rdd).toDF("file", "features")
# Standardize the features.
# NOTE(review): the scaler is fitted on every graph in df_features, i.e. before
# the train/test split at the end of this section, so test rows contribute to
# the scaling statistics.
scaler = StandardScaler(outputCol="scaledFeatures", inputCol="features")
scaler_model = scaler.fit(df_features)
df_features = scaler_model.transform(df_features)

# Read the label CSV (first row is the header), swap the class names for the
# values in label_map, then make the column numeric.
df_labels = spark.read.csv(label_csv, header=True).replace(label_map, subset=["label"])
df_labels = df_labels.withColumn("label", df_labels["label"].cast("double"))

# Join features and labels on the file name; keep only the scaled vector
# (exposed as "features") and the label.
df = df_features.join(
    df_labels, df_features["file"] == df_labels["graph_file"]
).select(
    df_features["scaledFeatures"].alias("features"),
    df_labels["label"],
)

# 80/20 train/test split with a fixed seed.
train, test = df.randomSplit(weights=[0.8, 0.2], seed=123456)
# Oversample the training split so that every class has roughly as many rows
# as the largest one.
labels = [float(x) for x in label_map.values()]

# Keep the split in memory: without this, every action below and every
# cross-validation fold later on re-runs the whole lineage (JSON read, encoding,
# type generation, scaling) from scratch.
train = train.cache()

# Class sizes in a single aggregation instead of one Spark job per label.
observed = {row["label"]: row["count"] for row in train.groupBy("label").count().collect()}
count = {x: observed.get(x, 0) for x in labels}
missing = [x for x in labels if count[x] == 0]
if missing:
    # Previously this surfaced as a bare ZeroDivisionError in the ratio computation.
    raise ValueError(f"No training rows for label(s) {missing}; cannot oversample.")

maxValue = max(count.values())
# How many times each class has to be replicated (on average) to reach maxValue.
ratio = {x: maxValue / count[x] for x in labels}

dataframes = []
for x in labels:
    class_df = train.filter(train["label"] == x)
    if count[x] != maxValue:
        # Sample with replacement so a fraction above 1.0 can grow the class;
        # the fixed seed keeps the resampling repeatable.
        class_df = class_df.sample(withReplacement=True, fraction=ratio[x], seed=123456)
    dataframes.append(class_df)

train = dataframes[0]
for dataframe in dataframes[1:]:
    train = train.union(dataframe)
# The balanced set is what the cross-validators below iterate over repeatedly.
train = train.cache()

print(ratio)
# Class sizes after oversampling, printed in the order of `labels`.
balanced = {row["label"]: row["count"] for row in train.groupBy("label").count().collect()}
for x in labels:
    print(balanced.get(x, 0))
# LinearSVC: 10-fold cross-validated grid search on the training data, then
# evaluation of the selected model on the test split.
svc = LinearSVC(threshold=0.0, maxIter=100)
pipeline = Pipeline(stages=[svc])

# 6 regularisation strengths x 2 iteration limits = 12 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(svc.regParam, [1, 0.1, 0.01, 0.001, 0.0001, 0.00001])
    .addGrid(svc.maxIter, [100, 500])
    .build()
)

# Hyperparameter tuning: candidates are compared on accuracy.
crossval = CrossValidator(
    numFolds=10,
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    collectSubModels=True,
)
cvModel = crossval.fit(train)

# Predict the labels of the test data with the selected model.
res_test = cvModel.bestModel.transform(test)

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Cross-validation metric of every setting, paired with that setting.
print([(avg_metric, param_map) for avg_metric, param_map in zip(cvModel.avgMetrics, paramGrid)])
# Random forest: 10-fold cross-validated grid search on the training data, then
# evaluation of the selected model on the test split.
rf = RandomForestClassifier(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[rf])

# 4 forest sizes x 4 tree depths = 16 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [5, 10, 20, 30])
    .addGrid(rf.maxDepth, [3, 4, 5, 6])
    .build()
)

# Hyperparameter tuning: candidates are compared on accuracy.
crossval = CrossValidator(
    numFolds=10,
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    collectSubModels=True,
)
cvModel = crossval.fit(train)

# Predict the labels of the test data with the selected model.
res_test = cvModel.bestModel.transform(test)

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Cross-validation metric of every setting, paired with that setting.
print([(avg_metric, param_map) for avg_metric, param_map in zip(cvModel.avgMetrics, paramGrid)])
# Gradient-boosted trees: 10-fold cross-validated grid search, then evaluation
# on the held-out test split.
# The former stand-alone `gbt.fit(train)` / `transform(test)` pair was removed:
# it eagerly trained a default model whose predictions were overwritten by the
# cross-validated result below before anything read them.
gbt = GBTClassifier(labelCol="label", featuresCol="features")

pipeline = Pipeline(stages=[gbt])
# 2 boosting-iteration counts x 4 tree depths = 8 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxIter, [5, 10])
    .addGrid(gbt.maxDepth, [3, 4, 5, 6])
    .build()
)
# Train the model and select the best setting by accuracy (hyperparameter tuning).
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    numFolds=10,
    collectSubModels=True)
cvModel = crossval.fit(train)

# Predict the labels of the test data with the selected model.
res_test = cvModel.bestModel.transform(test)

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Cross-validation metric of every setting, paired with that setting.
print(list(zip(cvModel.avgMetrics, paramGrid)))

In [None]:
# Experiment configuration for this run. The paths, iri, forward, qualified_names
# and label_map repeat the values assigned in the session-setup cell.
label_csv = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/graphs.csv"
json_folder = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/*.json"
# Arguments passed to the type generator and to count_prov_types.
level = 6
specific_types_node = False
specific_types_edge = False
# Arguments passed to json_to_encoding; `forward` also selects
# type_generate (True) versus type_generate_R (False).
forward = False
iri = False
qualified_names = {"xsd:QName", "prov:QUALIFIED_NAME"}
# Class name -> label. The values are strings: they replace the class names in
# the "label" column, which is then cast to double.
label_map = {"Trusted": "1.0", "Uncertain": "0.0"}
# Alternative mapping for a three-class label set:
# label_map = {"Valor": 0.0, "Instinct": 1.0, "Mystic": 2.0}
# Optional: spark.sparkContext.addPyFile("functions.py")

# One (file_path, json_text) record per file matched by json_folder.
file_and_path_rdd = spark.sparkContext.wholeTextFiles(json_folder)

# (file_name, ProvDocument). Not consumed by the steps below, which encode the
# raw JSON text directly.
document_rdd = file_and_path_rdd.map(
    lambda path_and_json: (
        path_and_json[0].split("/")[-1],
        ProvDocument.deserialize(content=path_and_json[1]),
    )
)

# (file_name, graph encoding), keyed by the last component of the file path.
# Alternative that goes through ProvDocument:
# encoding_rdd = document_rdd.map(lambda x: (x[0], document_to_encoding(x[1],iri,forward)))
encoding_rdd = file_and_path_rdd.map(
    lambda path_and_json: (
        path_and_json[0].split("/")[-1],
        json_to_encoding(path_and_json[1], iri, forward, qualified_names),
    )
)

# (file_name, prov_types of the nodes); `forward` selects the generator.
if not forward:
    types_rdd = encoding_rdd.map(
        lambda name_and_encoding: (
            name_and_encoding[0],
            type_generate_R(name_and_encoding[1], level, specific_types_node, specific_types_edge),
        )
    )
else:
    # Alternative generator with the same arguments: type_generate_mixed(...)
    types_rdd = encoding_rdd.map(
        lambda name_and_encoding: (
            name_and_encoding[0],
            type_generate(name_and_encoding[1], level, specific_types_node, specific_types_edge),
        )
    )

# (file_name, occurrences of each prov_type in that graph)
types_count_rdd = types_rdd.map(
    lambda name_and_types: (name_and_types[0], count_prov_types(level, name_and_types[1]))
)

# Every distinct prov_type found in this collection of graphs, collected to the driver.
all_types = (
    types_count_rdd
    .flatMap(lambda name_and_counts: name_and_counts[1].keys())
    .distinct()
    .collect()
)
types_count = len(all_types)
# prov_type -> index, and index -> prov_type.
index_map = {prov_type: position for position, prov_type in enumerate(all_types)}
reverse_index_map = dict(enumerate(all_types))

# Build one feature vector per graph and expose it as a (file, features) DataFrame.
sparse_matrix_rdd = types_count_rdd.map(
    lambda name_and_counts: (
        name_and_counts[0],
        sparse_matrix(name_and_counts[1], types_count, index_map),
    )
)
feature_vector_rdd = sparse_matrix_rdd.map(
    lambda name_and_row: (name_and_row[0], Vectors.dense(name_and_row[1]))
)
df_features = spark.createDataFrame(feature_vector_rdd).toDF("file", "features")
# Standardize the features.
# NOTE(review): the scaler is fitted on every graph in df_features, i.e. before
# the train/test split at the end of this section, so test rows contribute to
# the scaling statistics.
scaler = StandardScaler(outputCol="scaledFeatures", inputCol="features")
scaler_model = scaler.fit(df_features)
df_features = scaler_model.transform(df_features)

# Read the label CSV (first row is the header), swap the class names for the
# values in label_map, then make the column numeric.
df_labels = spark.read.csv(label_csv, header=True).replace(label_map, subset=["label"])
df_labels = df_labels.withColumn("label", df_labels["label"].cast("double"))

# Join features and labels on the file name; keep only the scaled vector
# (exposed as "features") and the label.
df = df_features.join(
    df_labels, df_features["file"] == df_labels["graph_file"]
).select(
    df_features["scaledFeatures"].alias("features"),
    df_labels["label"],
)

# 80/20 train/test split with a fixed seed.
train, test = df.randomSplit(weights=[0.8, 0.2], seed=123456)
# Oversample the training split so that every class has roughly as many rows
# as the largest one.
labels = [float(x) for x in label_map.values()]

# Keep the split in memory: without this, every action below and every
# cross-validation fold later on re-runs the whole lineage (JSON read, encoding,
# type generation, scaling) from scratch.
train = train.cache()

# Class sizes in a single aggregation instead of one Spark job per label.
observed = {row["label"]: row["count"] for row in train.groupBy("label").count().collect()}
count = {x: observed.get(x, 0) for x in labels}
missing = [x for x in labels if count[x] == 0]
if missing:
    # Previously this surfaced as a bare ZeroDivisionError in the ratio computation.
    raise ValueError(f"No training rows for label(s) {missing}; cannot oversample.")

maxValue = max(count.values())
# How many times each class has to be replicated (on average) to reach maxValue.
ratio = {x: maxValue / count[x] for x in labels}

dataframes = []
for x in labels:
    class_df = train.filter(train["label"] == x)
    if count[x] != maxValue:
        # Sample with replacement so a fraction above 1.0 can grow the class;
        # the fixed seed keeps the resampling repeatable.
        class_df = class_df.sample(withReplacement=True, fraction=ratio[x], seed=123456)
    dataframes.append(class_df)

train = dataframes[0]
for dataframe in dataframes[1:]:
    train = train.union(dataframe)
# The balanced set is what the cross-validators below iterate over repeatedly.
train = train.cache()

print(ratio)
# Class sizes after oversampling, printed in the order of `labels`.
balanced = {row["label"]: row["count"] for row in train.groupBy("label").count().collect()}
for x in labels:
    print(balanced.get(x, 0))
# LinearSVC: 10-fold cross-validated grid search on the training data, then
# evaluation of the selected model on the test split.
svc = LinearSVC(threshold=0.0, maxIter=100)
pipeline = Pipeline(stages=[svc])

# 6 regularisation strengths x 2 iteration limits = 12 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(svc.regParam, [1, 0.1, 0.01, 0.001, 0.0001, 0.00001])
    .addGrid(svc.maxIter, [100, 500])
    .build()
)

# Hyperparameter tuning: candidates are compared on accuracy.
crossval = CrossValidator(
    numFolds=10,
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    collectSubModels=True,
)
cvModel = crossval.fit(train)

# Predict the labels of the test data with the selected model.
res_test = cvModel.bestModel.transform(test)

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Cross-validation metric of every setting, paired with that setting.
print([(avg_metric, param_map) for avg_metric, param_map in zip(cvModel.avgMetrics, paramGrid)])
# Random forest: 10-fold cross-validated grid search on the training data, then
# evaluation of the selected model on the test split.
rf = RandomForestClassifier(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[rf])

# 4 forest sizes x 4 tree depths = 16 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [5, 10, 20, 30])
    .addGrid(rf.maxDepth, [3, 4, 5, 6])
    .build()
)

# Hyperparameter tuning: candidates are compared on accuracy.
crossval = CrossValidator(
    numFolds=10,
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    collectSubModels=True,
)
cvModel = crossval.fit(train)

# Predict the labels of the test data with the selected model.
res_test = cvModel.bestModel.transform(test)

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Cross-validation metric of every setting, paired with that setting.
print([(avg_metric, param_map) for avg_metric, param_map in zip(cvModel.avgMetrics, paramGrid)])
# Gradient-boosted trees: 10-fold cross-validated grid search, then evaluation
# on the held-out test split.
# The former stand-alone `gbt.fit(train)` / `transform(test)` pair was removed:
# it eagerly trained a default model whose predictions were overwritten by the
# cross-validated result below before anything read them.
gbt = GBTClassifier(labelCol="label", featuresCol="features")

pipeline = Pipeline(stages=[gbt])
# 2 boosting-iteration counts x 4 tree depths = 8 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxIter, [5, 10])
    .addGrid(gbt.maxDepth, [3, 4, 5, 6])
    .build()
)
# Train the model and select the best setting by accuracy (hyperparameter tuning).
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    numFolds=10,
    collectSubModels=True)
cvModel = crossval.fit(train)

# Predict the labels of the test data with the selected model.
res_test = cvModel.bestModel.transform(test)

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Cross-validation metric of every setting, paired with that setting.
print(list(zip(cvModel.avgMetrics, paramGrid)))

In [None]:
# Experiment configuration for this run. The paths, iri, forward, qualified_names
# and label_map repeat the values assigned in the session-setup cell.
label_csv = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/graphs.csv"
json_folder = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/*.json"
# Arguments passed to the type generator and to count_prov_types.
level = 7
specific_types_node = True
specific_types_edge = True
# Arguments passed to json_to_encoding; `forward` also selects
# type_generate (True) versus type_generate_R (False).
forward = False
iri = False
qualified_names = {"xsd:QName", "prov:QUALIFIED_NAME"}
# Class name -> label. The values are strings: they replace the class names in
# the "label" column, which is then cast to double.
label_map = {"Trusted": "1.0", "Uncertain": "0.0"}
# Alternative mapping for a three-class label set:
# label_map = {"Valor": 0.0, "Instinct": 1.0, "Mystic": 2.0}
# Optional: spark.sparkContext.addPyFile("functions.py")

# One (file_path, json_text) record per file matched by json_folder.
file_and_path_rdd = spark.sparkContext.wholeTextFiles(json_folder)

# (file_name, ProvDocument). Not consumed by the steps below, which encode the
# raw JSON text directly.
document_rdd = file_and_path_rdd.map(
    lambda path_and_json: (
        path_and_json[0].split("/")[-1],
        ProvDocument.deserialize(content=path_and_json[1]),
    )
)

# (file_name, graph encoding), keyed by the last component of the file path.
# Alternative that goes through ProvDocument:
# encoding_rdd = document_rdd.map(lambda x: (x[0], document_to_encoding(x[1],iri,forward)))
encoding_rdd = file_and_path_rdd.map(
    lambda path_and_json: (
        path_and_json[0].split("/")[-1],
        json_to_encoding(path_and_json[1], iri, forward, qualified_names),
    )
)

# (file_name, prov_types of the nodes); `forward` selects the generator.
if not forward:
    types_rdd = encoding_rdd.map(
        lambda name_and_encoding: (
            name_and_encoding[0],
            type_generate_R(name_and_encoding[1], level, specific_types_node, specific_types_edge),
        )
    )
else:
    # Alternative generator with the same arguments: type_generate_mixed(...)
    types_rdd = encoding_rdd.map(
        lambda name_and_encoding: (
            name_and_encoding[0],
            type_generate(name_and_encoding[1], level, specific_types_node, specific_types_edge),
        )
    )

# (file_name, occurrences of each prov_type in that graph)
types_count_rdd = types_rdd.map(
    lambda name_and_types: (name_and_types[0], count_prov_types(level, name_and_types[1]))
)

# Every distinct prov_type found in this collection of graphs, collected to the driver.
all_types = (
    types_count_rdd
    .flatMap(lambda name_and_counts: name_and_counts[1].keys())
    .distinct()
    .collect()
)
types_count = len(all_types)
# prov_type -> index, and index -> prov_type.
index_map = {prov_type: position for position, prov_type in enumerate(all_types)}
reverse_index_map = dict(enumerate(all_types))

# Build one feature vector per graph and expose it as a (file, features) DataFrame.
sparse_matrix_rdd = types_count_rdd.map(
    lambda name_and_counts: (
        name_and_counts[0],
        sparse_matrix(name_and_counts[1], types_count, index_map),
    )
)
feature_vector_rdd = sparse_matrix_rdd.map(
    lambda name_and_row: (name_and_row[0], Vectors.dense(name_and_row[1]))
)
df_features = spark.createDataFrame(feature_vector_rdd).toDF("file", "features")
# Standardize the features.
# NOTE(review): the scaler is fitted on every graph in df_features, i.e. before
# the train/test split at the end of this section, so test rows contribute to
# the scaling statistics.
scaler = StandardScaler(outputCol="scaledFeatures", inputCol="features")
scaler_model = scaler.fit(df_features)
df_features = scaler_model.transform(df_features)

# Read the label CSV (first row is the header), swap the class names for the
# values in label_map, then make the column numeric.
df_labels = spark.read.csv(label_csv, header=True).replace(label_map, subset=["label"])
df_labels = df_labels.withColumn("label", df_labels["label"].cast("double"))

# Join features and labels on the file name; keep only the scaled vector
# (exposed as "features") and the label.
df = df_features.join(
    df_labels, df_features["file"] == df_labels["graph_file"]
).select(
    df_features["scaledFeatures"].alias("features"),
    df_labels["label"],
)

# 80/20 train/test split with a fixed seed.
train, test = df.randomSplit(weights=[0.8, 0.2], seed=123456)
# Oversample the training split so that every class has roughly as many rows
# as the largest one.
labels = [float(x) for x in label_map.values()]

# Keep the split in memory: without this, every action below and every
# cross-validation fold later on re-runs the whole lineage (JSON read, encoding,
# type generation, scaling) from scratch.
train = train.cache()

# Class sizes in a single aggregation instead of one Spark job per label.
observed = {row["label"]: row["count"] for row in train.groupBy("label").count().collect()}
count = {x: observed.get(x, 0) for x in labels}
missing = [x for x in labels if count[x] == 0]
if missing:
    # Previously this surfaced as a bare ZeroDivisionError in the ratio computation.
    raise ValueError(f"No training rows for label(s) {missing}; cannot oversample.")

maxValue = max(count.values())
# How many times each class has to be replicated (on average) to reach maxValue.
ratio = {x: maxValue / count[x] for x in labels}

dataframes = []
for x in labels:
    class_df = train.filter(train["label"] == x)
    if count[x] != maxValue:
        # Sample with replacement so a fraction above 1.0 can grow the class;
        # the fixed seed keeps the resampling repeatable.
        class_df = class_df.sample(withReplacement=True, fraction=ratio[x], seed=123456)
    dataframes.append(class_df)

train = dataframes[0]
for dataframe in dataframes[1:]:
    train = train.union(dataframe)
# The balanced set is what the cross-validators below iterate over repeatedly.
train = train.cache()

print(ratio)
# Class sizes after oversampling, printed in the order of `labels`.
balanced = {row["label"]: row["count"] for row in train.groupBy("label").count().collect()}
for x in labels:
    print(balanced.get(x, 0))
# LinearSVC: 10-fold cross-validated grid search on the training data, then
# evaluation of the selected model on the test split.
svc = LinearSVC(threshold=0.0, maxIter=100)
pipeline = Pipeline(stages=[svc])

# 6 regularisation strengths x 2 iteration limits = 12 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(svc.regParam, [1, 0.1, 0.01, 0.001, 0.0001, 0.00001])
    .addGrid(svc.maxIter, [100, 500])
    .build()
)

# Hyperparameter tuning: candidates are compared on accuracy.
crossval = CrossValidator(
    numFolds=10,
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    collectSubModels=True,
)
cvModel = crossval.fit(train)

# Predict the labels of the test data with the selected model.
res_test = cvModel.bestModel.transform(test)

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Cross-validation metric of every setting, paired with that setting.
print([(avg_metric, param_map) for avg_metric, param_map in zip(cvModel.avgMetrics, paramGrid)])
# Random forest: 10-fold cross-validated grid search on the training data, then
# evaluation of the selected model on the test split.
rf = RandomForestClassifier(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[rf])

# 4 forest sizes x 4 tree depths = 16 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [5, 10, 20, 30])
    .addGrid(rf.maxDepth, [3, 4, 5, 6])
    .build()
)

# Hyperparameter tuning: candidates are compared on accuracy.
crossval = CrossValidator(
    numFolds=10,
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    collectSubModels=True,
)
cvModel = crossval.fit(train)

# Predict the labels of the test data with the selected model.
res_test = cvModel.bestModel.transform(test)

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Cross-validation metric of every setting, paired with that setting.
print([(avg_metric, param_map) for avg_metric, param_map in zip(cvModel.avgMetrics, paramGrid)])
# Gradient-boosted trees: 10-fold cross-validated grid search, then evaluation
# on the held-out test split.
# The former stand-alone `gbt.fit(train)` / `transform(test)` pair was removed:
# it eagerly trained a default model whose predictions were overwritten by the
# cross-validated result below before anything read them.
gbt = GBTClassifier(labelCol="label", featuresCol="features")

pipeline = Pipeline(stages=[gbt])
# 2 boosting-iteration counts x 4 tree depths = 8 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxIter, [5, 10])
    .addGrid(gbt.maxDepth, [3, 4, 5, 6])
    .build()
)
# Train the model and select the best setting by accuracy (hyperparameter tuning).
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    numFolds=10,
    collectSubModels=True)
cvModel = crossval.fit(train)

# Predict the labels of the test data with the selected model.
res_test = cvModel.bestModel.transform(test)

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Cross-validation metric of every setting, paired with that setting.
print(list(zip(cvModel.avgMetrics, paramGrid)))

In [None]:
# Experiment configuration for this run. The paths, iri, forward, qualified_names
# and label_map repeat the values assigned in the session-setup cell.
label_csv = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/graphs.csv"
json_folder = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/*.json"
# Arguments passed to the type generator and to count_prov_types.
level = 7
specific_types_node = False
specific_types_edge = True
# Arguments passed to json_to_encoding; `forward` also selects
# type_generate (True) versus type_generate_R (False).
forward = False
iri = False
qualified_names = {"xsd:QName", "prov:QUALIFIED_NAME"}
# Class name -> label. The values are strings: they replace the class names in
# the "label" column, which is then cast to double.
label_map = {"Trusted": "1.0", "Uncertain": "0.0"}
# Alternative mapping for a three-class label set:
# label_map = {"Valor": 0.0, "Instinct": 1.0, "Mystic": 2.0}
# Optional: spark.sparkContext.addPyFile("functions.py")

# One (file_path, json_text) record per file matched by json_folder.
file_and_path_rdd = spark.sparkContext.wholeTextFiles(json_folder)

# (file_name, ProvDocument). Not consumed by the steps below, which encode the
# raw JSON text directly.
document_rdd = file_and_path_rdd.map(
    lambda path_and_json: (
        path_and_json[0].split("/")[-1],
        ProvDocument.deserialize(content=path_and_json[1]),
    )
)

# (file_name, graph encoding), keyed by the last component of the file path.
# Alternative that goes through ProvDocument:
# encoding_rdd = document_rdd.map(lambda x: (x[0], document_to_encoding(x[1],iri,forward)))
encoding_rdd = file_and_path_rdd.map(
    lambda path_and_json: (
        path_and_json[0].split("/")[-1],
        json_to_encoding(path_and_json[1], iri, forward, qualified_names),
    )
)

# (file_name, prov_types of the nodes); `forward` selects the generator.
if not forward:
    types_rdd = encoding_rdd.map(
        lambda name_and_encoding: (
            name_and_encoding[0],
            type_generate_R(name_and_encoding[1], level, specific_types_node, specific_types_edge),
        )
    )
else:
    # Alternative generator with the same arguments: type_generate_mixed(...)
    types_rdd = encoding_rdd.map(
        lambda name_and_encoding: (
            name_and_encoding[0],
            type_generate(name_and_encoding[1], level, specific_types_node, specific_types_edge),
        )
    )

# (file_name, occurrences of each prov_type in that graph)
types_count_rdd = types_rdd.map(
    lambda name_and_types: (name_and_types[0], count_prov_types(level, name_and_types[1]))
)

# Every distinct prov_type found in this collection of graphs, collected to the driver.
all_types = (
    types_count_rdd
    .flatMap(lambda name_and_counts: name_and_counts[1].keys())
    .distinct()
    .collect()
)
types_count = len(all_types)
# prov_type -> index, and index -> prov_type.
index_map = {prov_type: position for position, prov_type in enumerate(all_types)}
reverse_index_map = dict(enumerate(all_types))

# Build one feature vector per graph and expose it as a (file, features) DataFrame.
sparse_matrix_rdd = types_count_rdd.map(
    lambda name_and_counts: (
        name_and_counts[0],
        sparse_matrix(name_and_counts[1], types_count, index_map),
    )
)
feature_vector_rdd = sparse_matrix_rdd.map(
    lambda name_and_row: (name_and_row[0], Vectors.dense(name_and_row[1]))
)
df_features = spark.createDataFrame(feature_vector_rdd).toDF("file", "features")
# Standardize the features.
# NOTE(review): the scaler is fitted on every graph in df_features, i.e. before
# the train/test split at the end of this section, so test rows contribute to
# the scaling statistics.
scaler = StandardScaler(outputCol="scaledFeatures", inputCol="features")
scaler_model = scaler.fit(df_features)
df_features = scaler_model.transform(df_features)

# Read the label CSV (first row is the header), swap the class names for the
# values in label_map, then make the column numeric.
df_labels = spark.read.csv(label_csv, header=True).replace(label_map, subset=["label"])
df_labels = df_labels.withColumn("label", df_labels["label"].cast("double"))

# Join features and labels on the file name; keep only the scaled vector
# (exposed as "features") and the label.
df = df_features.join(
    df_labels, df_features["file"] == df_labels["graph_file"]
).select(
    df_features["scaledFeatures"].alias("features"),
    df_labels["label"],
)

# 80/20 train/test split with a fixed seed.
train, test = df.randomSplit(weights=[0.8, 0.2], seed=123456)
# Oversample the training split so that every class has roughly as many rows
# as the largest one.
labels = [float(x) for x in label_map.values()]

# Keep the split in memory: without this, every action below and every
# cross-validation fold later on re-runs the whole lineage (JSON read, encoding,
# type generation, scaling) from scratch.
train = train.cache()

# Class sizes in a single aggregation instead of one Spark job per label.
observed = {row["label"]: row["count"] for row in train.groupBy("label").count().collect()}
count = {x: observed.get(x, 0) for x in labels}
missing = [x for x in labels if count[x] == 0]
if missing:
    # Previously this surfaced as a bare ZeroDivisionError in the ratio computation.
    raise ValueError(f"No training rows for label(s) {missing}; cannot oversample.")

maxValue = max(count.values())
# How many times each class has to be replicated (on average) to reach maxValue.
ratio = {x: maxValue / count[x] for x in labels}

dataframes = []
for x in labels:
    class_df = train.filter(train["label"] == x)
    if count[x] != maxValue:
        # Sample with replacement so a fraction above 1.0 can grow the class;
        # the fixed seed keeps the resampling repeatable.
        class_df = class_df.sample(withReplacement=True, fraction=ratio[x], seed=123456)
    dataframes.append(class_df)

train = dataframes[0]
for dataframe in dataframes[1:]:
    train = train.union(dataframe)
# The balanced set is what the cross-validators below iterate over repeatedly.
train = train.cache()

print(ratio)
# Class sizes after oversampling, printed in the order of `labels`.
balanced = {row["label"]: row["count"] for row in train.groupBy("label").count().collect()}
for x in labels:
    print(balanced.get(x, 0))
# LinearSVC: 10-fold cross-validated grid search on the training data, then
# evaluation of the selected model on the test split.
svc = LinearSVC(threshold=0.0, maxIter=100)
pipeline = Pipeline(stages=[svc])

# 6 regularisation strengths x 2 iteration limits = 12 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(svc.regParam, [1, 0.1, 0.01, 0.001, 0.0001, 0.00001])
    .addGrid(svc.maxIter, [100, 500])
    .build()
)

# Hyperparameter tuning: candidates are compared on accuracy.
crossval = CrossValidator(
    numFolds=10,
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    collectSubModels=True,
)
cvModel = crossval.fit(train)

# Predict the labels of the test data with the selected model.
res_test = cvModel.bestModel.transform(test)

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Cross-validation metric of every setting, paired with that setting.
print([(avg_metric, param_map) for avg_metric, param_map in zip(cvModel.avgMetrics, paramGrid)])
# Random forest: 10-fold cross-validated grid search on the training data, then
# evaluation of the selected model on the test split.
rf = RandomForestClassifier(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[rf])

# 4 forest sizes x 4 tree depths = 16 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [5, 10, 20, 30])
    .addGrid(rf.maxDepth, [3, 4, 5, 6])
    .build()
)

# Hyperparameter tuning: candidates are compared on accuracy.
crossval = CrossValidator(
    numFolds=10,
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    collectSubModels=True,
)
cvModel = crossval.fit(train)

# Predict the labels of the test data with the selected model.
res_test = cvModel.bestModel.transform(test)

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
preds_and_labels = res_test.select("prediction", "label").rdd.map(
    lambda row: (row["prediction"], row["label"])
)
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Cross-validation metric of every setting, paired with that setting.
print([(avg_metric, param_map) for avg_metric, param_map in zip(cvModel.avgMetrics, paramGrid)])
# Gradient-boosted trees: 10-fold cross-validated grid search, then evaluation
# on the held-out test split.
# The former stand-alone `gbt.fit(train)` / `transform(test)` pair was removed:
# it eagerly trained a default model whose predictions were overwritten by the
# cross-validated result below before anything read them.
gbt = GBTClassifier(labelCol="label", featuresCol="features")

pipeline = Pipeline(stages=[gbt])
# 2 boosting-iteration counts x 4 tree depths = 8 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxIter, [5, 10])
    .addGrid(gbt.maxDepth, [3, 4, 5, 6])
    .build()
)
# Train the model and select the best setting by accuracy (hyperparameter tuning).
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
    numFolds=10,
    collectSubModels=True)
cvModel = crossval.fit(train)

# Predict the labels of the test data with the selected model.
res_test = cvModel.bestModel.transform(test)

# MulticlassMetrics consumes an RDD of (prediction, label) pairs.
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Cross-validation metric of every setting, paired with that setting.
print(list(zip(cvModel.avgMetrics, paramGrid)))

In [None]:
# Experiment configuration for this run. The paths, iri, forward, qualified_names
# and label_map repeat the values assigned in the session-setup cell.
label_csv = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/graphs.csv"
json_folder = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/*.json"
# Arguments passed to the type generator and to count_prov_types.
level = 7
specific_types_node = True
specific_types_edge = False
# Arguments passed to json_to_encoding; `forward` also selects
# type_generate (True) versus type_generate_R (False).
forward = False
iri = False
qualified_names = {"xsd:QName", "prov:QUALIFIED_NAME"}
# Class name -> label. The values are strings: they replace the class names in
# the "label" column, which is then cast to double.
label_map = {"Trusted": "1.0", "Uncertain": "0.0"}
# Alternative mapping for a three-class label set:
# label_map = {"Valor": 0.0, "Instinct": 1.0, "Mystic": 2.0}
# Optional: spark.sparkContext.addPyFile("functions.py")

# One (file_path, json_text) record per file matched by json_folder.
file_and_path_rdd = spark.sparkContext.wholeTextFiles(json_folder)

# (file_name, ProvDocument). Not consumed by the steps below, which encode the
# raw JSON text directly.
document_rdd = file_and_path_rdd.map(
    lambda path_and_json: (
        path_and_json[0].split("/")[-1],
        ProvDocument.deserialize(content=path_and_json[1]),
    )
)

# (file_name, graph encoding), keyed by the last component of the file path.
# Alternative that goes through ProvDocument:
# encoding_rdd = document_rdd.map(lambda x: (x[0], document_to_encoding(x[1],iri,forward)))
encoding_rdd = file_and_path_rdd.map(
    lambda path_and_json: (
        path_and_json[0].split("/")[-1],
        json_to_encoding(path_and_json[1], iri, forward, qualified_names),
    )
)

# (file_name, prov_types of the nodes); `forward` selects the generator.
if not forward:
    types_rdd = encoding_rdd.map(
        lambda name_and_encoding: (
            name_and_encoding[0],
            type_generate_R(name_and_encoding[1], level, specific_types_node, specific_types_edge),
        )
    )
else:
    # Alternative generator with the same arguments: type_generate_mixed(...)
    types_rdd = encoding_rdd.map(
        lambda name_and_encoding: (
            name_and_encoding[0],
            type_generate(name_and_encoding[1], level, specific_types_node, specific_types_edge),
        )
    )

# (file_name, occurrences of each prov_type in that graph)
types_count_rdd = types_rdd.map(
    lambda name_and_types: (name_and_types[0], count_prov_types(level, name_and_types[1]))
)

# Every distinct prov_type found in this collection of graphs, collected to the driver.
all_types = (
    types_count_rdd
    .flatMap(lambda name_and_counts: name_and_counts[1].keys())
    .distinct()
    .collect()
)
types_count = len(all_types)
# prov_type -> index, and index -> prov_type.
index_map = {prov_type: position for position, prov_type in enumerate(all_types)}
reverse_index_map = dict(enumerate(all_types))

# Build one feature vector per graph and expose it as a (file, features) DataFrame.
sparse_matrix_rdd = types_count_rdd.map(
    lambda name_and_counts: (
        name_and_counts[0],
        sparse_matrix(name_and_counts[1], types_count, index_map),
    )
)
feature_vector_rdd = sparse_matrix_rdd.map(
    lambda name_and_row: (name_and_row[0], Vectors.dense(name_and_row[1]))
)
df_features = spark.createDataFrame(feature_vector_rdd).toDF("file", "features")
# Change the labels: map the textual classes through label_map, then cast to double
df_labels = spark.read.csv(label_csv, header=True)
df_labels = df_labels.replace(label_map, subset=["label"])
df_labels = df_labels.withColumn("label", df_labels["label"].cast(DoubleType()))
# Join the (unscaled) features and labels.
# Cached so that both halves of the random split are drawn from the same materialised
# rows; each split otherwise re-evaluates the whole lineage independently.
df_joined = df_features.join(df_labels, df_features.file == df_labels.graph_file).select(df_features.features, df_labels.label).cache()
# Split the data into training and testing BEFORE fitting the scaler
train_raw, test_raw = df_joined.randomSplit([0.8, 0.2], seed = 123456)
# Standardize features using statistics of the training split only, so that no
# information about the test rows leaks into the model inputs.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(train_raw)
# Apply the same fitted scaler to every frame; each result has columns (features, label)
# where `features` holds the scaled vector. `df` is the full scaled dataset.
train, test, df = [
    scaler_model.transform(part).select("scaledFeatures", "label").withColumnRenamed("scaledFeatures", "features")
    for part in (train_raw, test_raw, df_joined)
]
# Oversample the training data so every class has roughly as many rows as the largest one
labels = [float(x) for x in label_map.values()]
# One aggregation instead of one count job per label
observed_counts = {row["label"]: row["count"] for row in train.groupBy("label").count().collect()}
count = {x: observed_counts.get(x, 0) for x in labels}
empty_labels = [x for x in labels if count[x] == 0]
if empty_labels:
    # The ratio below would otherwise fail with an uninformative ZeroDivisionError
    raise ValueError("no training rows for label(s) %s; cannot oversample" % empty_labels)
maxValue = max(count.values())
# Sampling fraction per class: how many times larger the biggest class is
ratio = {x: maxValue/count[x] for x in labels}
dataframes = []
for x in labels:
    class_rows = train.filter(train['label'] == x)
    if count[x] != maxValue:
        # Sampling with replacement at fraction > 1 grows the class; the size is approximate
        class_rows = class_rows.sample(withReplacement=True, fraction=ratio[x], seed=123456)
    dataframes.append(class_rows)
train = dataframes[0]
for dataframe in dataframes[1:]:
    train = train.union(dataframe)
# Cached because the checks below and the cross-validation runs that follow all
# re-evaluate `train`; without it each evaluation replays the full lineage.
train = train.cache()
print(ratio)
# Class sizes after oversampling, one per line in the order of `labels`
resampled_counts = {row["label"]: row["count"] for row in train.groupBy("label").count().collect()}
for x in labels:
    print(resampled_counts.get(x, 0))
# Linear SVM: grid-search regParam and maxIter with 10-fold cross-validation
svc = LinearSVC(maxIter = 100, threshold=0.0)
pipeline = Pipeline(stages=[svc])

# Candidate hyperparameters (every regParam is combined with every maxIter)
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(svc.regParam, [1, 0.1, 0.01,0.001,0.0001,0.00001])
grid_builder = grid_builder.addGrid(svc.maxIter, [100, 500])
paramGrid = grid_builder.build()

# Model selection is driven by accuracy
accuracy_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=accuracy_evaluator,
    numFolds=10,
    collectSubModels=True,
)
cvModel = crossval.fit(train)

# Score the held-out test rows with the best model found
best_pipeline = cvModel.bestModel
res_test = best_pipeline.transform(test)

# (prediction, label) pairs -> accuracy and confusion matrix
prediction_label_df = res_test.select("prediction", "label")
preds_and_labels = prediction_label_df.rdd.map(lambda row: (row["prediction"], row["label"]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Mean cross-validation accuracy next to the parameter map that produced it
cv_summary = list(zip(cvModel.avgMetrics, paramGrid))
print(cv_summary)
# Random forest: grid-search numTrees and maxDepth with 10-fold cross-validation
rf = RandomForestClassifier(labelCol="label", featuresCol="features")
pipeline = Pipeline(stages=[rf])

# Candidate hyperparameters (every numTrees is combined with every maxDepth)
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(rf.numTrees, [5,10,20,30])
grid_builder = grid_builder.addGrid(rf.maxDepth, [3,4,5,6])
paramGrid = grid_builder.build()

# Model selection is driven by accuracy
accuracy_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=accuracy_evaluator,
    numFolds=10,
    collectSubModels=True,
)
cvModel = crossval.fit(train)

# Score the held-out test rows with the best model found
best_pipeline = cvModel.bestModel
res_test = best_pipeline.transform(test)

# (prediction, label) pairs -> accuracy and confusion matrix
prediction_label_df = res_test.select("prediction", "label")
preds_and_labels = prediction_label_df.rdd.map(lambda row: (row["prediction"], row["label"]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Mean cross-validation accuracy next to the parameter map that produced it
cv_summary = list(zip(cvModel.avgMetrics, paramGrid))
print(cv_summary)
# Gradient-boosted trees: tune maxIter and maxDepth with 10-fold cross-validation.
# The previous stand-alone `gbt.fit(train)` / `transform(test)` pair was dropped: its
# predictions were overwritten by the cross-validated ones below before being read,
# so it only cost an extra full training run.
gbt = GBTClassifier(labelCol="label", featuresCol="features")

pipeline = Pipeline(stages=[gbt])
paramGrid = ParamGridBuilder().addGrid(gbt.maxIter, [5,10]).addGrid(gbt.maxDepth, [3,4,5,6]).build()
# train the model and select the best model using "metricName"(hyperparameter tuning)
crossval = CrossValidator(
    estimator=pipeline, 
    estimatorParamMaps=paramGrid, 
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"), 
    numFolds=10,
    collectSubModels=True)
cvModel = crossval.fit(train)
# Keep `gbt_model` bound for interactive use: it is the fitted GBT stage of the best pipeline.
gbt_model = cvModel.bestModel.stages[-1]

# predict the labels of test data
res_test = cvModel.bestModel.transform(test)

# build an RDD of (prediction, label) pairs and compute the metrics
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# average cross-validation accuracy paired with the parameter map that produced it
print(list(zip(cvModel.avgMetrics, paramGrid)))

In [None]:
# --- Experiment configuration for this cell ---
# Glob of provenance documents (one JSON file per graph) and the CSV holding their labels.
json_folder = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/*.json"
label_csv = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/CM-Buildings/graphs.csv"

# Arguments forwarded to the prov-type generator (type_generate / type_generate_R).
level = 7
specific_types_node = False
specific_types_edge = False

# Arguments forwarded to json_to_encoding; `forward` also selects which type generator runs.
forward = False
iri = False
qualified_names = {"xsd:QName", "prov:QUALIFIED_NAME"}

# Textual labels found in the CSV -> numeric strings (cast to double further down).
label_map = dict(Trusted="1.0", Uncertain="0.0")
# Alternative three-entry mapping, kept for reference:
# label_map = {"Valor": 0.0, "Instinct": 1.0, "Mystic": 2.0}
# spark.sparkContext.addPyFile("functions.py")
# Load the data into an RDD of (file_path, json_data(string))
file_and_path_rdd = spark.sparkContext.wholeTextFiles(json_folder)
# (file_name, ProvDocument) -- lazy, and not consumed anywhere in this cell; only the
# commented-out document_to_encoding variant below would read it.
document_rdd = file_and_path_rdd.map(lambda x: (x[0].split("/")[-1], (ProvDocument.deserialize(content=x[1]))))
# (file_name, graph encoding), built straight from the JSON text
#encoding_rdd = document_rdd.map(lambda x: (x[0], document_to_encoding(x[1],iri,forward)))
encoding_rdd = file_and_path_rdd.map(lambda x: (x[0].split("/")[-1], json_to_encoding(x[1],iri,forward,qualified_names)))
# (file_name, prov_types of nodes)
if forward:
    types_rdd = encoding_rdd.map(lambda x: (x[0], type_generate(x[1], level, specific_types_node, specific_types_edge)))
    # types_rdd = encoding_rdd.map(lambda x: (x[0], type_generate_mixed(x[1], level, specific_types_node, specific_types_edge)))
else:
    types_rdd = encoding_rdd.map(lambda x: (x[0], type_generate_R(x[1], level, specific_types_node, specific_types_edge)))
# (file_name, prov_types occurrence in the graph).
# Cached: this RDD is evaluated once by the collect() below and again when the feature
# vectors are built, and without caching each evaluation re-reads and re-encodes every file.
types_count_rdd = types_rdd.map(lambda x: (x[0], count_prov_types(level,x[1]))).cache()
# All prov_types in this collection of graphs
all_types = types_count_rdd.flatMap(lambda x: x[1].keys()).distinct().collect()
# Number of distinct prov_types
types_count = len(all_types)
# index_map for prov_types, prov_type -> index (position in all_types)
index_map = {prov_type: position for position, prov_type in enumerate(all_types)}
# index -> prov_type
reverse_index_map = dict(enumerate(all_types))
# Construct feature vectors for each graph
sparse_matrix_rdd = types_count_rdd.map(lambda x: (x[0], sparse_matrix(x[1], types_count, index_map)))
feature_vector_rdd = sparse_matrix_rdd.map(lambda x: (x[0],Vectors.dense(x[1])))
# DataFrame with columns (file, features)
df_features = spark.createDataFrame(feature_vector_rdd).withColumnRenamed("_1", "file").withColumnRenamed("_2", "features")
# Change the labels: map the textual classes through label_map, then cast to double
df_labels = spark.read.csv(label_csv, header=True)
df_labels = df_labels.replace(label_map, subset=["label"])
df_labels = df_labels.withColumn("label", df_labels["label"].cast(DoubleType()))
# Join the (unscaled) features and labels.
# Cached so that both halves of the random split are drawn from the same materialised
# rows; each split otherwise re-evaluates the whole lineage independently.
df_joined = df_features.join(df_labels, df_features.file == df_labels.graph_file).select(df_features.features, df_labels.label).cache()
# Split the data into training and testing BEFORE fitting the scaler
train_raw, test_raw = df_joined.randomSplit([0.8, 0.2], seed = 123456)
# Standardize features using statistics of the training split only, so that no
# information about the test rows leaks into the model inputs.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(train_raw)
# Apply the same fitted scaler to every frame; each result has columns (features, label)
# where `features` holds the scaled vector. `df` is the full scaled dataset.
train, test, df = [
    scaler_model.transform(part).select("scaledFeatures", "label").withColumnRenamed("scaledFeatures", "features")
    for part in (train_raw, test_raw, df_joined)
]
# Oversample the training data so every class has roughly as many rows as the largest one
labels = [float(x) for x in label_map.values()]
# One aggregation instead of one count job per label
observed_counts = {row["label"]: row["count"] for row in train.groupBy("label").count().collect()}
count = {x: observed_counts.get(x, 0) for x in labels}
empty_labels = [x for x in labels if count[x] == 0]
if empty_labels:
    # The ratio below would otherwise fail with an uninformative ZeroDivisionError
    raise ValueError("no training rows for label(s) %s; cannot oversample" % empty_labels)
maxValue = max(count.values())
# Sampling fraction per class: how many times larger the biggest class is
ratio = {x: maxValue/count[x] for x in labels}
dataframes = []
for x in labels:
    class_rows = train.filter(train['label'] == x)
    if count[x] != maxValue:
        # Sampling with replacement at fraction > 1 grows the class; the size is approximate
        class_rows = class_rows.sample(withReplacement=True, fraction=ratio[x], seed=123456)
    dataframes.append(class_rows)
train = dataframes[0]
for dataframe in dataframes[1:]:
    train = train.union(dataframe)
# Cached because the checks below and the cross-validation runs that follow all
# re-evaluate `train`; without it each evaluation replays the full lineage.
train = train.cache()
print(ratio)
# Class sizes after oversampling, one per line in the order of `labels`
resampled_counts = {row["label"]: row["count"] for row in train.groupBy("label").count().collect()}
for x in labels:
    print(resampled_counts.get(x, 0))
# Linear SVM: grid-search regParam and maxIter with 10-fold cross-validation
svc = LinearSVC(maxIter = 100, threshold=0.0)
pipeline = Pipeline(stages=[svc])

# Candidate hyperparameters (every regParam is combined with every maxIter)
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(svc.regParam, [1, 0.1, 0.01,0.001,0.0001,0.00001])
grid_builder = grid_builder.addGrid(svc.maxIter, [100, 500])
paramGrid = grid_builder.build()

# Model selection is driven by accuracy
accuracy_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=accuracy_evaluator,
    numFolds=10,
    collectSubModels=True,
)
cvModel = crossval.fit(train)

# Score the held-out test rows with the best model found
best_pipeline = cvModel.bestModel
res_test = best_pipeline.transform(test)

# (prediction, label) pairs -> accuracy and confusion matrix
prediction_label_df = res_test.select("prediction", "label")
preds_and_labels = prediction_label_df.rdd.map(lambda row: (row["prediction"], row["label"]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Mean cross-validation accuracy next to the parameter map that produced it
cv_summary = list(zip(cvModel.avgMetrics, paramGrid))
print(cv_summary)
# Random forest: grid-search numTrees and maxDepth with 10-fold cross-validation
rf = RandomForestClassifier(labelCol="label", featuresCol="features")
pipeline = Pipeline(stages=[rf])

# Candidate hyperparameters (every numTrees is combined with every maxDepth)
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(rf.numTrees, [5,10,20,30])
grid_builder = grid_builder.addGrid(rf.maxDepth, [3,4,5,6])
paramGrid = grid_builder.build()

# Model selection is driven by accuracy
accuracy_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=accuracy_evaluator,
    numFolds=10,
    collectSubModels=True,
)
cvModel = crossval.fit(train)

# Score the held-out test rows with the best model found
best_pipeline = cvModel.bestModel
res_test = best_pipeline.transform(test)

# (prediction, label) pairs -> accuracy and confusion matrix
prediction_label_df = res_test.select("prediction", "label")
preds_and_labels = prediction_label_df.rdd.map(lambda row: (row["prediction"], row["label"]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Mean cross-validation accuracy next to the parameter map that produced it
cv_summary = list(zip(cvModel.avgMetrics, paramGrid))
print(cv_summary)
# Gradient-boosted trees: tune maxIter and maxDepth with 10-fold cross-validation.
# The previous stand-alone `gbt.fit(train)` / `transform(test)` pair was dropped: its
# predictions were overwritten by the cross-validated ones below before being read,
# so it only cost an extra full training run.
gbt = GBTClassifier(labelCol="label", featuresCol="features")

pipeline = Pipeline(stages=[gbt])
paramGrid = ParamGridBuilder().addGrid(gbt.maxIter, [5,10]).addGrid(gbt.maxDepth, [3,4,5,6]).build()
# train the model and select the best model using "metricName"(hyperparameter tuning)
crossval = CrossValidator(
    estimator=pipeline, 
    estimatorParamMaps=paramGrid, 
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"), 
    numFolds=10,
    collectSubModels=True)
cvModel = crossval.fit(train)
# Keep `gbt_model` bound for interactive use: it is the fitted GBT stage of the best pipeline.
gbt_model = cvModel.bestModel.stages[-1]

# predict the labels of test data
res_test = cvModel.bestModel.transform(test)

# build an RDD of (prediction, label) pairs and compute the metrics
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# average cross-validation accuracy paired with the parameter map that produced it
print(list(zip(cvModel.avgMetrics, paramGrid)))