In [1]:
# PySpark 
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
# functions
from functions import *
import os
import time
# ML
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC, RandomForestClassifier, GBTClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.feature import StandardScaler


In [2]:
# Spark setup. `getOrCreate` makes this cell safe to re-run: constructing a
# second SparkContext in the same kernel raises, whereas getOrCreate returns
# the one that is already running.
conf = SparkConf().setAppName("spark").setMaster("local[*,20]").set("spark.driver.memory", "10g")
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Options forwarded to json_to_encoding(); `forward` also selects between
# type_generate (True) and type_generate_R (False) inside giao().
iri = False
forward = False
qualified_names = {
    "xsd:QName",
    "prov:QUALIFIED_NAME"
}

# Class name -> numeric label. The values are strings on purpose:
# DataFrame.replace() rejects a mapping from strings to floats ("mixed type
# replacements"); the label column is cast to double after the replacement.
label_map = {
    "Valor": "0.0",
    "Instinct": "1.0",
    "Mystic": "2.0"
}


22/04/06 21:58:22 WARN Utils: Your hostname, cuiyeshuaideMacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.31.122 instead (on interface en0)
22/04/06 21:58:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/06 21:58:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/04/06 21:58:23 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
def _oversample_to_balance(df, labels, seed):
    """Oversample minority classes of ``df`` to roughly the size of the largest class.

    Args:
        df: DataFrame with a double-typed ``label`` column.
        labels: iterable of label values (floats) to keep.
        seed: seed passed to ``DataFrame.sample`` so repeated runs draw the same rows.

    Returns:
        The union of one DataFrame per label that actually occurs in ``df``;
        every class smaller than the largest one is sampled with replacement
        at ``fraction = largest_count / class_count``.

    Raises:
        ValueError: if none of ``labels`` occurs in ``df``.
    """
    # A single aggregation replaces one Spark count job per label.
    counts = {row["label"]: row["count"] for row in df.groupBy("label").count().collect()}
    # Labels without rows are skipped; they would otherwise divide by zero below.
    present = [x for x in labels if counts.get(x, 0) > 0]
    if not present:
        raise ValueError("No labelled graphs left after joining features with labels")
    max_count = max(counts[x] for x in present)
    print(max_count)

    balanced = None
    for x in present:
        part = df.filter(df["label"] == x)
        if counts[x] != max_count:
            part = part.sample(withReplacement=True, fraction=max_count / counts[x], seed=seed)
        balanced = part if balanced is None else balanced.union(part)
    return balanced


def _cross_validate(estimator, param_grid, train, num_folds=10):
    """Grid-search ``estimator`` with k-fold cross-validation on ``train``.

    Prints the (average accuracy, param map) pairs followed by the elapsed
    seconds, and returns ``(best_average_accuracy, elapsed_seconds)``.
    """
    started = time.time()
    # collectSubModels is left at its default (False): the per-fold models
    # were never read and only consumed driver memory.
    crossval = CrossValidator(
        estimator=Pipeline(stages=[estimator]),
        estimatorParamMaps=param_grid,
        evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
        numFolds=num_folds)
    cv_model = crossval.fit(train)
    elapsed = time.time() - started
    print(list(zip(cv_model.avgMetrics, param_grid)))
    print(elapsed)
    return max(cv_model.avgMetrics), elapsed


def giao(seed=42):
    """Build prov-type feature vectors for a folder of graphs and tune two classifiers.

    Reads these module-level names at call time: ``spark``, ``json_folder``,
    ``label_csv``, ``label_map``, ``iri``, ``forward``, ``qualified_names``,
    ``level``, ``specific_types_node`` and ``specific_types_edge``. The helpers
    ``json_to_encoding``, ``type_generate``, ``type_generate_R``,
    ``count_prov_types`` and ``sparse_matrix`` are expected to be in scope
    (presumably via ``from functions import *`` -- confirm).

    Steps: parse every JSON file, count prov-types per graph, turn the counts
    into standardized feature vectors, join them with the labels from
    ``label_csv``, oversample minority classes, then run 10-fold
    cross-validated grid searches for a random forest and a logistic
    regression.

    NOTE(review): the scaler is fitted and the oversampling is done before
    the cross-validation split, so validation folds can contain copies of
    training rows. The reported accuracies may therefore be optimistic.

    Args:
        seed: seed for the oversampling step (default 42).

    Returns:
        ``[rf_best_accuracy, rf_seconds, lr_best_accuracy, lr_seconds]``;
        the same list is also printed.

    Raises:
        ValueError: if no prov-types are found, or no labelled graph survives
            the join with the label file.
    """
    file_and_path_rdd = spark.sparkContext.wholeTextFiles(json_folder)

    # (file_name, encoding of the graph)
    encoding_rdd = file_and_path_rdd.map(
        lambda x: (x[0].split("/")[-1], json_to_encoding(x[1], iri, forward, qualified_names)))
    # (file_name, prov_types of nodes)
    # (a `type_generate_mixed` variant was previously tried for the forward case)
    type_generator = type_generate if forward else type_generate_R
    types_rdd = encoding_rdd.map(
        lambda x: (x[0], type_generator(x[1], level, specific_types_node, specific_types_edge)))
    # (file_name, prov_types occurrence in the graph)
    # Cached because it feeds both the vocabulary scan and the feature vectors;
    # without it every job re-reads and re-parses all JSON files.
    types_count_rdd = types_rdd.map(lambda x: (x[0], count_prov_types(level, x[1]))).cache()

    df = None
    try:
        # All prov_types in this collection of graphs
        all_types = types_count_rdd.flatMap(lambda x: x[1].keys()).distinct().collect()
        # Number of distinct prov_types
        types_count = len(all_types)
        print(types_count)
        if types_count == 0:
            raise ValueError("No prov-types found; check json_folder: %r" % (json_folder,))
        # index_map for prov_types, prov_type -> index
        index_map = {prov_type: i for i, prov_type in enumerate(all_types)}

        # Construct feature vectors for each graph
        sparse_matrix_rdd = types_count_rdd.map(
            lambda x: (x[0], sparse_matrix(x[1], types_count, index_map)))
        feature_vector_rdd = sparse_matrix_rdd.map(lambda x: (x[0], Vectors.dense(x[1])))
        df_features = (spark.createDataFrame(feature_vector_rdd)
                       .withColumnRenamed("_1", "file")
                       .withColumnRenamed("_2", "features"))

        # Standardize features
        scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
        df_features = scaler.fit(df_features).transform(df_features)

        # Change the labels. DataFrame.replace only accepts same-typed
        # mappings, so the values are normalised to strings first; this makes
        # both a float-valued and a string-valued label_map work.
        label_names = {name: str(value) for name, value in label_map.items()}
        df_labels = spark.read.csv(label_csv, header=True)
        df_labels = df_labels.replace(label_names, subset=["label"])
        df_labels = df_labels.where(df_labels.label.isin(list(label_names.values())))
        df_labels = df_labels.withColumn("label", df_labels["label"].cast(DoubleType()))

        # Join the features and labels
        df = (df_features
              .join(df_labels, df_features.file == df_labels.graph_file)
              .select(df_features.scaledFeatures, df_labels.label)
              .withColumnRenamed("scaledFeatures", "features"))
        # Reused by the class counts, the per-class filters and every CV fold.
        df.cache()

        # Oversample the training data
        labels = sorted({float(value) for value in label_names.values()})
        train = _oversample_to_balance(df, labels, seed)

        result = []

        rf = RandomForestClassifier(labelCol="label", featuresCol="features")
        rf_grid = (ParamGridBuilder()
                   .addGrid(rf.numTrees, [5, 10, 20, 30])
                   .addGrid(rf.maxDepth, [4, 5, 6])
                   .build())
        result.extend(_cross_validate(rf, rf_grid, train))

        lr = LogisticRegression(labelCol="label", featuresCol="features")
        lr_grid = (ParamGridBuilder()
                   .addGrid(lr.maxIter, [5, 10])
                   .addGrid(lr.regParam, [0.2, 0.3, 0.4])
                   .addGrid(lr.elasticNetParam, [0.6, 0.7, 0.8])
                   .build())
        result.extend(_cross_validate(lr, lr_grid, train))
    finally:
        # Release cached blocks so repeated calls do not accumulate memory.
        if df is not None:
            df.unpersist()
        types_count_rdd.unpersist()

    print(result)
    return result
    

In [4]:
# Location of the PG-D dataset. Set the PGD_DATA_DIR environment variable to
# run the notebook on another machine; the default is the original location.
DATA_DIR = os.environ.get(
    "PGD_DATA_DIR",
    "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/PG-D",
).rstrip("/")
# Glob over every graph file, and the CSV holding the graph_file -> label table.
json_folder = f"{DATA_DIR}/*.json"
label_csv = f"{DATA_DIR}/graphs.csv"
# Class name -> numeric label. Values are strings because DataFrame.replace()
# rejects a str -> float mapping; giao() casts the column to double afterwards.
label_map = {
    "Valor": "0.0",
    "Instinct": "1.0",
    "Mystic": "2.0"
}

In [5]:
# Experiment: level 0, specific node types ON, specific edge types ON.
# giao() reads these three names from module scope when it runs.
level = 0
specific_types_node, specific_types_edge = True, True
giao()

                                                                                

8


                                                                                

409


                                                                                

[(0.6120854244870912, {Param(parent='RandomForestClassifier_3578133998ad', name='numTrees', doc='Number of trees to train (>= 1).'): 5, Param(parent='RandomForestClassifier_3578133998ad', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 4}), (0.6172265290429932, {Param(parent='RandomForestClassifier_3578133998ad', name='numTrees', doc='Number of trees to train (>= 1).'): 5, Param(parent='RandomForestClassifier_3578133998ad', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 5}), (0.6358142562825951, {Param(parent='RandomForestClassifier_3578133998ad', name='numTrees', doc='Number of trees to train (>= 1).'): 5, Param(parent='RandomForestClassifier_3578133998ad', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 mea

22/04/06 21:59:28 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/04/06 21:59:28 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


81.15801095962524
[(0.5945858852653709, {Param(parent='LogisticRegression_c16b3cc232e7', name='maxIter', doc='max number of iterations (>= 0).'): 5, Param(parent='LogisticRegression_c16b3cc232e7', name='regParam', doc='regularization parameter (>= 0).'): 0.2, Param(parent='LogisticRegression_c16b3cc232e7', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.6}), (0.5945858852653709, {Param(parent='LogisticRegression_c16b3cc232e7', name='maxIter', doc='max number of iterations (>= 0).'): 5, Param(parent='LogisticRegression_c16b3cc232e7', name='regParam', doc='regularization parameter (>= 0).'): 0.2, Param(parent='LogisticRegression_c16b3cc232e7', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.7}), (0.5945858852653709, {Param(parent='LogisticRegression_c16b3cc2

In [6]:
# Experiment: level 0, specific node types ON, specific edge types OFF.
# giao() reads these three names from module scope when it runs.
level = 0
specific_types_node, specific_types_edge = True, False
giao()

8
409


                                                                                

[(0.6060440028537333, {Param(parent='RandomForestClassifier_4a63d1f787a3', name='numTrees', doc='Number of trees to train (>= 1).'): 5, Param(parent='RandomForestClassifier_4a63d1f787a3', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 4}), (0.6167977561845926, {Param(parent='RandomForestClassifier_4a63d1f787a3', name='numTrees', doc='Number of trees to train (>= 1).'): 5, Param(parent='RandomForestClassifier_4a63d1f787a3', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 5}), (0.6141445481095009, {Param(parent='RandomForestClassifier_4a63d1f787a3', name='numTrees', doc='Number of trees to train (>= 1).'): 5, Param(parent='RandomForestClassifier_4a63d1f787a3', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 mea

In [7]:
# Experiment: level 0, specific node types OFF, specific edge types ON.
# giao() reads these three names from module scope when it runs.
level = 0
specific_types_node, specific_types_edge = False, True
giao()

2
409


                                                                                

[(0.4571844861923083, {Param(parent='RandomForestClassifier_b03cc83bf228', name='numTrees', doc='Number of trees to train (>= 1).'): 5, Param(parent='RandomForestClassifier_b03cc83bf228', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 4}), (0.49585280499720197, {Param(parent='RandomForestClassifier_b03cc83bf228', name='numTrees', doc='Number of trees to train (>= 1).'): 5, Param(parent='RandomForestClassifier_b03cc83bf228', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 5}), (0.5051995141789384, {Param(parent='RandomForestClassifier_b03cc83bf228', name='numTrees', doc='Number of trees to train (>= 1).'): 5, Param(parent='RandomForestClassifier_b03cc83bf228', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 me

In [8]:
# Experiment: level 0, specific node types OFF, specific edge types OFF.
# giao() reads these three names from module scope when it runs.
level = 0
specific_types_node, specific_types_edge = False, False
giao()

2
409


                                                                                

[(0.4560399480234429, {Param(parent='RandomForestClassifier_5ac023920b02', name='numTrees', doc='Number of trees to train (>= 1).'): 5, Param(parent='RandomForestClassifier_5ac023920b02', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 4}), (0.4918434132208586, {Param(parent='RandomForestClassifier_5ac023920b02', name='numTrees', doc='Number of trees to train (>= 1).'): 5, Param(parent='RandomForestClassifier_5ac023920b02', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 5}), (0.4962124766197132, {Param(parent='RandomForestClassifier_5ac023920b02', name='numTrees', doc='Number of trees to train (>= 1).'): 5, Param(parent='RandomForestClassifier_5ac023920b02', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 mea

                                                                                

81.94633722305298
[(0.3504391002629231, {Param(parent='LogisticRegression_f5f0119ffc57', name='maxIter', doc='max number of iterations (>= 0).'): 5, Param(parent='LogisticRegression_f5f0119ffc57', name='regParam', doc='regularization parameter (>= 0).'): 0.2, Param(parent='LogisticRegression_f5f0119ffc57', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.6}), (0.3504391002629231, {Param(parent='LogisticRegression_f5f0119ffc57', name='maxIter', doc='max number of iterations (>= 0).'): 5, Param(parent='LogisticRegression_f5f0119ffc57', name='regParam', doc='regularization parameter (>= 0).'): 0.2, Param(parent='LogisticRegression_f5f0119ffc57', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.7}), (0.3504391002629231, {Param(parent='LogisticRegression_f5f0119f