In [None]:
# PySpark 
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, MapType, ArrayType, DoubleType
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.functions import when
# Prov
import json
from collections import Counter
from prov.model import ProvDocument
# ML
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.feature import StandardScaler


In [1]:
# ---------------------------------------------------------------------------
# PROV record-type constants. Each constant holds the name under which the
# record type appears in a serialised document (e.g. "wasGeneratedBy").
# ---------------------------------------------------------------------------
PROV_ENTITY = "entity"
PROV_ACTIVITY = "activity"
PROV_GENERATION = "wasGeneratedBy"
PROV_USAGE = "used"
PROV_COMMUNICATION = "wasInformedBy"
PROV_START = "wasStartedBy"
PROV_END = "wasEndedBy"
PROV_INVALIDATION = "wasInvalidatedBy"
PROV_DERIVATION = "wasDerivedFrom"
PROV_AGENT = "agent"
PROV_ATTRIBUTION = "wasAttributedTo"
PROV_ASSOCIATION = "wasAssociatedWith"
PROV_DELEGATION = "actedOnBehalfOf"
PROV_INFLUENCE = "wasInfluencedBy"
PROV_ALTERNATE = "alternateOf"
PROV_SPECIALIZATION = "specializationOf"
# NOTE: PROV_MENTION is declared but deliberately or accidentally absent from
# every lookup table below, so "mentionOf" records are not supported.
PROV_MENTION = "mentionOf"
PROV_MEMBERSHIP = "hadMember"
PROV_BUNDLE = "bundle"

# Constant -> serialised name. Because the constants already hold the
# serialised names, this is an identity mapping over the supported types.
PROV_MAP = {
    PROV_ENTITY: "entity",
    PROV_ACTIVITY: "activity",
    PROV_GENERATION: "wasGeneratedBy",
    PROV_USAGE: "used",
    PROV_COMMUNICATION: "wasInformedBy",
    PROV_START: "wasStartedBy",
    PROV_END: "wasEndedBy",
    PROV_INVALIDATION: "wasInvalidatedBy",
    PROV_DERIVATION: "wasDerivedFrom",
    PROV_AGENT: "agent",
    PROV_ATTRIBUTION: "wasAttributedTo",
    PROV_ASSOCIATION: "wasAssociatedWith",
    PROV_DELEGATION: "actedOnBehalfOf",
    PROV_INFLUENCE: "wasInfluencedBy",
    PROV_ALTERNATE: "alternateOf",
    PROV_SPECIALIZATION: "specializationOf",
    PROV_MEMBERSHIP: "hadMember",
    PROV_BUNDLE: "bundle",
}

# Record class name (the `localpart` of a record's type) -> constant.
PROV_N_MAP = {
    "Entity": PROV_ENTITY,
    "Activity": PROV_ACTIVITY,
    "Generation": PROV_GENERATION,
    "Usage": PROV_USAGE,
    "Communication": PROV_COMMUNICATION,
    "Start": PROV_START,
    "End": PROV_END,
    "Invalidation": PROV_INVALIDATION,
    "Derivation": PROV_DERIVATION,
    "Agent": PROV_AGENT,
    "Attribution": PROV_ATTRIBUTION,
    "Association": PROV_ASSOCIATION,
    "Delegation": PROV_DELEGATION,
    "Influence": PROV_INFLUENCE,
    "Alternate": PROV_ALTERNATE,
    "Specialization": PROV_SPECIALIZATION,
    "Membership": PROV_MEMBERSHIP,
    "Bundle": PROV_BUNDLE,
}

# Record types that become graph nodes.
PROV_NODE = [
    PROV_ENTITY,
    PROV_ACTIVITY,
    PROV_AGENT
]

# Record types that become graph edges.
PROV_EDGE = [
    PROV_GENERATION,
    PROV_USAGE,
    PROV_COMMUNICATION,
    PROV_START,
    PROV_END,
    PROV_INVALIDATION,
    PROV_DERIVATION,
    PROV_ATTRIBUTION,
    PROV_ASSOCIATION,
    PROV_DELEGATION,
    PROV_INFLUENCE,
    PROV_ALTERNATE,
    PROV_SPECIALIZATION,
    PROV_MEMBERSHIP
]

# Edge type -> (first endpoint attribute, second endpoint attribute).
# The names are stored WITHOUT the "prov:" prefix: the record parsers build the
# lookup key as "prov:" + name.
RELATION_MAP = {
    PROV_GENERATION: ("entity", "activity"),
    # NOTE(review): for Start/End the second endpoint is looked up as
    # "prov:entity"; confirm this matches the attribute name used by the
    # serialisation (it may be "prov:trigger"), otherwise these edges are skipped.
    PROV_USAGE: ("activity", "entity"),
    PROV_COMMUNICATION: ("informed", "informant"),
    PROV_START: ("activity", "entity"),
    PROV_END: ("activity", "entity"),
    PROV_INVALIDATION: ("entity", "activity"),
    PROV_DERIVATION: ("generatedEntity", "usedEntity"),  # also used for Revision
    PROV_ATTRIBUTION: ("entity", "agent"),
    PROV_ASSOCIATION: ("activity", "agent"),  # parsers fall back to "plan" when "agent" is missing
    PROV_DELEGATION: ("delegate", "responsible"),
    PROV_INFLUENCE: ("influencee", "influencer"),
    # Fixed: these used to carry a "prov:" prefix of their own, which produced
    # the key "prov:prov:alternate1" and made every alternateOf edge unmatchable.
    PROV_ALTERNATE: ("alternate1", "alternate2"),
    PROV_SPECIALIZATION: ("specificEntity", "generalEntity"),
    PROV_MEMBERSHIP: ("collection", "entity")
}
# Serialised record name -> PROV_* constant (PROV_MAP with keys and values swapped).
PROV_RECORD_IDS_MAP = {label: constant for constant, label in PROV_MAP.items()}
# Record class name -> PROV_* constant (a plain copy of PROV_N_MAP).
PROV_RECORD_IDS_MAP_1 = {class_name: constant for class_name, constant in PROV_N_MAP.items()}
# Show both lookup tables.
for lookup_table in (PROV_RECORD_IDS_MAP, PROV_RECORD_IDS_MAP_1):
    print(lookup_table)

{'entity': 'entity', 'activity': 'activity', 'wasGeneratedBy': 'wasGeneratedBy', 'used': 'used', 'wasInformedBy': 'wasInformedBy', 'wasStartedBy': 'wasStartedBy', 'wasEndedBy': 'wasEndedBy', 'wasInvalidatedBy': 'wasInvalidatedBy', 'wasDerivedFrom': 'wasDerivedFrom', 'agent': 'agent', 'wasAttributedTo': 'wasAttributedTo', 'wasAssociatedWith': 'wasAssociatedWith', 'actedOnBehalfOf': 'actedOnBehalfOf', 'wasInfluencedBy': 'wasInfluencedBy', 'alternateOf': 'alternateOf', 'specializationOf': 'specializationOf', 'hadMember': 'hadMember', 'bundle': 'bundle'}
{'Entity': 'entity', 'Activity': 'activity', 'Generation': 'wasGeneratedBy', 'Usage': 'used', 'Communication': 'wasInformedBy', 'Start': 'wasStartedBy', 'End': 'wasEndedBy', 'Invalidation': 'wasInvalidatedBy', 'Derivation': 'wasDerivedFrom', 'Agent': 'agent', 'Attribution': 'wasAttributedTo', 'Association': 'wasAssociatedWith', 'Delegation': 'actedOnBehalfOf', 'Influence': 'wasInfluencedBy', 'Alternate': 'alternateOf', 'Specialization': 's

In [None]:
# --- Input locations -------------------------------------------------------
# Glob of the PROV-JSON graphs and the CSV holding one label per graph file.
json_folder = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/PG-D/*.json"
label_csv = "/Users/cuiyeshuai/Documents/UG modules/Individual Project/provenance-kernel-evaluation-master/datasets/PG-D/graphs.csv"

# --- Feature-extraction options --------------------------------------------
# Maximum depth of the provenance types that are generated.
level = 3
# Keep the asserted (application-specific) types of edges / nodes in addition
# to the generic PROV type.
additional_types_edge = additional_types_node = True
# uri: expand prefixed type names into full URIs.
# forward: index edges by their first endpoint and use type_generate instead
# of type_generate_R.
uri = forward = False
# "type" tags of prov:type values that are treated as qualified names.
qualified_name = {"xsd:QName", "prov:QUALIFIED_NAME"}

In [None]:
def document_to_records(x: ProvDocument, uri = uri) -> tuple:
    """Encode a ProvDocument as two plain dictionaries ``(nodes, edges)``.

    * ``nodes`` maps an element identifier (str) to
      ``(generic_type, asserted_types)``.
    * ``edges`` maps an anchor node id to a list of
      ``(other_node, (relation_type, asserted_types))``. The anchor is the
      relation's first endpoint when the module-level ``forward`` flag is
      true, otherwise its second endpoint.

    Parameters
    ----------
    x : ProvDocument
        Document whose records are encoded.
    uri : bool
        When true, asserted types are stored through their ``.uri`` attribute
        instead of their string form. The default is the value the
        module-level ``uri`` had when this function was defined.

    Returns
    -------
    tuple
        ``(nodes, edges)`` as described above.

    Raises
    ------
    KeyError
        If a record type is missing from ``PROV_N_MAP`` or ``RELATION_MAP``.
    """
    nodes, edges = {}, {}
    for record in x.get_records():
        rec_type = PROV_N_MAP[record.get_type().localpart]
        asserted_types = frozenset(
            assert_type.uri if uri else str(assert_type)
            for assert_type in record.get_asserted_types()
        )

        if record.is_element():
            nodes[str(record.identifier)] = (rec_type, asserted_types)
            continue

        # A formal attribute that is not set comes back with the value None.
        # It must be left out rather than stringified: str(None) == "None"
        # would otherwise create an edge to a phantom node called "None" and
        # stop the "missing endpoint" checks below from ever firing.
        attributes = {
            str(name): str(value)
            for name, value in record.formal_attributes
            if value is not None
        }

        first_role, second_role = RELATION_MAP[rec_type]
        edge = attributes.get("prov:" + first_role), attributes.get("prov:" + second_role)
        # An association without an agent may still name a plan. (Derivations,
        # including revisions, have no alternative attributes: their endpoints
        # are exactly the ones RELATION_MAP already names.)
        if None in edge and rec_type == PROV_ASSOCIATION:
            edge = attributes.get("prov:activity"), attributes.get("prov:plan")
        if None in edge:
            continue  # an endpoint is missing: this record cannot form an edge

        if forward:
            anchor, other = edge
        else:
            other, anchor = edge
        edges.setdefault(anchor, []).append((other, (rec_type, asserted_types)))
    return nodes, edges


In [None]:
def json_to_records(x: str) -> tuple:
    """Encode a PROV-JSON string as two plain dictionaries ``(nodes, edges)``.

    * ``nodes`` maps a record id to ``(generic_type, asserted_types)``.
    * ``edges`` maps an anchor node id to a list of
      ``(other_node, (relation_type, asserted_types))``. The anchor is the
      relation's first endpoint when the module-level ``forward`` flag is
      true, otherwise its second endpoint.

    ``asserted_types`` is a frozenset holding the ``"$"`` value of every
    ``prov:type`` entry whose ``"type"`` tag is in ``qualified_name``; any
    other entry contributes ``None``. When the module-level ``uri`` flag is
    true, prefixed names are expanded using the document's ``prefix`` table.

    Parameters
    ----------
    x : str
        The JSON text of one document.

    Returns
    -------
    tuple
        ``(nodes, edges)`` as described above.

    Raises
    ------
    KeyError
        If the document contains a record type that is missing from
        ``PROV_RECORD_IDS_MAP`` or ``RELATION_MAP``.
    """
    document = json.loads(x)
    # "prefix" may be absent; pop() avoids the KeyError a bare `del` raises.
    prefix = dict(document.pop("prefix", None) or {})
    prefix["prov"] = "http://www.w3.org/ns/prov#"
    document.pop("bundle", None)  # nested bundles are ignored

    def read_types(element):
        """Collect the asserted types of one record."""
        if "prov:type" not in element:
            return frozenset()
        declared = element["prov:type"]
        if not isinstance(declared, list):
            declared = [declared]  # a single value: treat it as a one-item list
        # Only typed values tagged as qualified names keep their name; plain
        # strings and other literals are recorded as None.
        return frozenset(
            item.get("$") if isinstance(item, dict) and item.get("type") in qualified_name else None
            for item in declared
        )

    def expand(types):
        """Replace known prefixes by their namespace URI when `uri` is enabled."""
        if not uri:
            return types
        expanded = []
        for name in types:
            # None (a non-qualified type) and names without a known prefix
            # pass through unchanged.
            if isinstance(name, str):
                head, separator, local = name.partition(":")
                if separator and head in prefix:
                    name = prefix[head] + local
            expanded.append(name)
        return frozenset(expanded)

    nodes, edges = {}, {}
    for rec_type_str, records in document.items():
        rec_type = PROV_RECORD_IDS_MAP[rec_type_str]
        for rec_id, content in records.items():
            # A record id maps either to one dict or to a list of dicts.
            elements = [content] if isinstance(content, dict) else content
            for element in elements:
                types = read_types(element)
                if rec_type in PROV_NODE:
                    nodes[rec_id] = (rec_type, expand(types))
                    continue

                first_role, second_role = RELATION_MAP[rec_type]
                edge = element.get("prov:" + first_role), element.get("prov:" + second_role)
                # An association without an agent may still name a plan.
                # (Derivations, including revisions, have no alternative
                # attributes beyond the ones RELATION_MAP already names.)
                if None in edge and rec_type == PROV_ASSOCIATION:
                    edge = element.get("prov:activity"), element.get("prov:plan")
                if None in edge:
                    continue  # an endpoint is missing: no edge can be formed

                if forward:
                    anchor, other = edge
                else:
                    other, anchor = edge
                edges.setdefault(anchor, []).append((other, (rec_type, expand(types))))
    return nodes, edges

In [None]:
def type_generate_mixed(x, level, additional_types_node, additional_types_edge):
    """Build provenance types up to depth ``level`` with "mixed" labels.

    ``x`` is ``(nodes, edges)`` where ``nodes[node] = (generic, asserted)`` and
    ``edges[source]`` is a list of ``(destination, (generic, asserted))``.
    With the ``additional_types_*`` flags on, a label is the frozenset of the
    members of the ``(generic, asserted)`` pair itself; otherwise it only
    contains the generic type.

    Returns ``{depth: {node: tuple_of_frozensets}}``; the tuple for depth ``d``
    has ``d + 1`` entries (edge labels first, node label last).
    """
    nodes, edges = x[0], x[1]

    def node_label(info):
        return frozenset(info) if additional_types_node else frozenset((info[0],))

    def edge_label(info):
        return frozenset(info) if additional_types_edge else frozenset((info[0],))

    h_types = {depth: {} for depth in range(level + 1)}
    h_types[0] = {node: (node_label(info),) for node, info in nodes.items()}

    for depth in range(1, level + 1):
        shallower, current = h_types[depth - 1], h_types[depth]
        for source, outgoing in edges.items():
            for destination, edge_type in outgoing:
                if destination not in shallower:
                    continue
                candidate = (edge_label(edge_type),) + shallower[destination]
                known = current.get(source)
                if known is None:
                    current[source] = candidate
                else:
                    # merge position by position with what is already known
                    current[source] = tuple(m | n for m, n in zip(known, candidate))
    return h_types

In [None]:
def type_generate(x, level, additional_types_node, additional_types_edge):
    """Build provenance types up to depth ``level`` from source-indexed edges.

    ``x`` is ``(nodes, edges)`` where ``nodes[node] = (generic, asserted)`` and
    ``edges[source]`` is a list of ``(destination, (generic, asserted))``.
    With the ``additional_types_*`` flags on, a label is a one-element
    frozenset holding the whole ``(generic, asserted)`` pair; otherwise it
    holds the generic type only.

    Returns ``{depth: {node: tuple_of_frozensets}}``; the tuple for depth ``d``
    has ``d + 1`` entries (edge labels first, node label last).
    """
    nodes, edges = x[0], x[1]

    h_types = {depth: {} for depth in range(level + 1)}
    h_types[0] = {
        node: (frozenset((info if additional_types_node else info[0],)),)
        for node, info in nodes.items()
    }

    for depth in range(1, level + 1):
        shallower, current = h_types[depth - 1], h_types[depth]
        for source, outgoing in edges.items():
            for destination, edge_type in outgoing:
                if destination not in shallower:
                    continue  # the destination has no type of depth - 1
                label = edge_type if additional_types_edge else edge_type[0]
                candidate = (frozenset((label,)),) + shallower[destination]
                known = current.get(source)
                if known is None:
                    current[source] = candidate
                else:
                    # merge position by position with what is already known
                    current[source] = tuple(m | n for m, n in zip(known, candidate))
    return h_types

In [None]:
def type_generate_R(x, level, additional_types_node, additional_types_edge):
    """Build provenance types up to depth ``level`` from destination-indexed edges.

    ``x`` is ``(nodes, edges)`` where ``nodes[node] = (generic, asserted)`` and
    ``edges[destination]`` is a list of ``(source, (generic, asserted))``.
    Only destinations that own a type of the previous depth are visited, so
    each level touches just the edges that can contribute.

    Returns ``{depth: {node: tuple_of_frozensets}}``; the tuple for depth ``d``
    has ``d + 1`` entries (edge labels first, node label last).
    """
    nodes, incoming = x[0], x[1]

    h_types = {depth: {} for depth in range(level + 1)}
    h_types[0] = {
        node: (frozenset((info if additional_types_node else info[0],)),)
        for node, info in nodes.items()
    }

    for depth in range(1, level + 1):
        current = h_types[depth]
        for destination, tail in h_types[depth - 1].items():
            if destination not in incoming:
                continue  # no edge ends at this node
            for source, edge_type in incoming[destination]:
                label = edge_type if additional_types_edge else edge_type[0]
                candidate = (frozenset((label,)),) + tail
                known = current.get(source)
                if known is None:
                    current[source] = candidate
                else:
                    # merge position by position with what is already known
                    current[source] = tuple(m | n for m, n in zip(known, candidate))
    return h_types

In [None]:
def count_prov_types(level, h_types):
    """Count how many nodes carry each provenance type.

    ``h_types[depth]`` maps node -> provenance type for every depth in
    ``0..level``. Returns one dictionary ``{prov_type: occurrences}`` covering
    all depths; if the same key appeared at two depths the later depth's count
    would replace the earlier one.
    """
    occurrences = {}
    for depth in range(level + 1):
        per_depth = Counter(h_types[depth].values())
        for prov_type, count in per_depth.items():
            occurrences[prov_type] = count
    return occurrences

In [None]:
def sparse_matrix(x, len_types, index_map):
    """Turn a ``{prov_type: count}`` dictionary into a fixed-length count list.

    ``index_map`` gives the position of every provenance type; positions whose
    type does not occur in ``x`` stay 0. Despite the name, the result is a
    dense Python list of length ``len_types``.
    """
    row = [0 for _ in range(len_types)]
    for prov_type, count in x.items():
        row[index_map[prov_type]] = count
    return row

In [None]:
# if Reverse is true then output will be features with most positive contribution
def most_important_features(x, reverse_index_map, reverse=True):
    """Rank features by weight.

    ``x`` is an indexable sequence of weights and ``reverse_index_map`` maps a
    position in ``x`` to its feature. Returns ``[(feature, weight), ...]``
    ordered by ``(weight, position)``: descending when ``reverse`` is true
    (largest weights first), ascending otherwise.
    """
    ranked = sorted(((weight, position) for position, weight in enumerate(x)), reverse=reverse)
    return [(reverse_index_map[position], weight) for weight, position in ranked]

In [None]:
# Local Spark with 8 worker threads and 2g of driver / executor memory.
conf = (
    SparkConf()
    .setAppName("spark")
    .setMaster("local[8]")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.memory", "2g")
)
# getOrCreate() reuses a running session, so this cell can be executed again
# without the "cannot run multiple SparkContexts" error that constructing
# SparkContext directly raises on a second run.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

In [None]:
# Read every file matched by `json_folder` as one record:
# RDD of (file_path, file_content_as_string).
file_and_path_rdd = spark.sparkContext.wholeTextFiles(json_folder)

In [None]:
# RDD of (file_name, ProvDocument): the path is reduced to its last "/"-separated
# component and the file content is deserialised into a ProvDocument.
document_rdd = file_and_path_rdd.map(lambda x: (x[0].split("/")[-1], (ProvDocument.deserialize(content=x[1]))))

In [None]:
# RDD of (file_name, (nodes, edges)): graph encoding of every document, parsed
# straight from the JSON text by json_to_records.
# The earlier assignment built from document_rdd / document_to_records was
# overwritten immediately and never evaluated, so it has been removed; map
# document_to_records over document_rdd instead to use the ProvDocument parser.
records_rdd = file_and_path_rdd.map(lambda x: (x[0].split("/")[-1], json_to_records(x[1])))

In [None]:
# (file_name, prov_types of nodes)
# Source-indexed edges (forward) need type_generate; destination-indexed edges
# need type_generate_R.
type_generator = type_generate if forward else type_generate_R
types_rdd = records_rdd.map(
    lambda x: (x[0], type_generator(x[1], level, additional_types_node, additional_types_edge))
)

In [None]:
# RDD of (file_name, {prov_type: number of nodes carrying it}), counted over
# every depth from 0 to `level`.
types_count_rdd = types_rdd.map(lambda x: (x[0], count_prov_types(level,x[1])))

In [None]:
for x in types_count_rdd.take(1):
    print(x)

In [None]:
# Every distinct prov_type found in the whole collection of graphs.
all_types = types_count_rdd.flatMap(lambda x: x[1].keys()).distinct().collect()
# Number of distinct prov_types, i.e. the length of a feature vector.
types_count = len(all_types)
# prov_type -> column index
index_map = {prov_type: position for position, prov_type in enumerate(all_types)}
# column index -> prov_type
reverse_index_map = dict(enumerate(all_types))

In [None]:
types_count

In [None]:
# Construct one feature vector per graph.
# sparse_matrix_rdd: (file_name, list of counts of length types_count).
sparse_matrix_rdd = types_count_rdd.map(lambda x: (x[0], sparse_matrix(x[1], types_count, index_map)))
# feature_vector_rdd: the same counts wrapped as a dense ML vector.
feature_vector_rdd = sparse_matrix_rdd.map(lambda x: (x[0],Vectors.dense(x[1])))

In [None]:
sparse_matrix_rdd.take(20)

In [None]:
# Convert the RDD into a DataFrame with columns "file" and "features"
# (the tuple positions arrive as "_1" and "_2" and are renamed).
df_features = spark.createDataFrame(feature_vector_rdd).withColumnRenamed("_1", "file").withColumnRenamed("_2", "features")

In [None]:
df_features.show()

In [None]:
# Standardize features: adds a "scaledFeatures" column next to "features".
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split made further down, so test rows influence the scaling
# statistics.
# NOTE(review): df_features is rebound to the transformed frame, so running
# this cell a second time starts from a frame that already contains
# "scaledFeatures" — presumably that fails; re-create df_features first.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(df_features)
df_features = scaler_model.transform(df_features)

In [None]:
df_features.select("scaledFeatures").show(1,truncate=False)

In [None]:
# Read the labels (CSV with a header row).
df_labels = spark.read.csv(label_csv, header=True)
# Binarise the label: "Trusted" -> 1.0, every other value -> 0.0.
df_labels = df_labels.withColumn("label", when(df_labels.label == "Trusted", 1.0).otherwise(0.0))

In [None]:
# Join features and labels on the file name (features.file == labels.graph_file)
# and keep only the scaled vector, renamed to "features", plus the label.
# Files without a matching label row (and vice versa) are dropped by the join.
df = df_features.join(df_labels, df_features.file == df_labels.graph_file).select(df_features.scaledFeatures, df_labels.label).withColumnRenamed("scaledFeatures", "features")

In [None]:
# Split the data 80% / 20% into training and testing sets; the seed is fixed
# so the split is requested reproducibly.
train, test = df.randomSplit([0.8, 0.2], seed = 123456)

In [None]:
# LinearSVC classifier, tuned by cross-validation.
svc = LinearSVC(maxIter = 100, threshold=0.0)

pipeline = Pipeline(stages=[svc])
# Grid: 3 values of regParam x 2 values of maxIter = 6 candidate settings.
paramGrid = ParamGridBuilder().addGrid(svc.regParam, [1, 0.1, 0.01]).addGrid(svc.maxIter, [100, 500]).build()
# 4-fold cross-validation; candidates are ranked with a
# BinaryClassificationEvaluator left at its default settings.
# collectSubModels=True keeps every fitted sub-model on the resulting model.
crossval = CrossValidator(
    estimator=pipeline, 
    estimatorParamMaps=paramGrid, 
    evaluator=BinaryClassificationEvaluator(), 
    numFolds=4,
    collectSubModels=True)
cvModel = crossval.fit(train)

# Predict the labels of the held-out test data with the best model.
res_test = cvModel.bestModel.transform(test)

# Convert the predictions to an RDD of (prediction, label) pairs and compute
# accuracy and the confusion matrix.
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())

In [None]:
# LinearSVC classifier, tuned by cross-validation on accuracy.
# NOTE(review): apart from the evaluator this cell repeats the previous
# LinearSVC cell and rebinds svc, pipeline, paramGrid, crossval, cvModel and
# res_test.
svc = LinearSVC(maxIter = 100, threshold=0.0)

pipeline = Pipeline(stages=[svc])
# Grid: 3 values of regParam x 2 values of maxIter = 6 candidate settings.
paramGrid = ParamGridBuilder().addGrid(svc.regParam, [1, 0.1, 0.01]).addGrid(svc.maxIter, [100, 500]).build()
# 4-fold cross-validation; candidates are ranked by the "accuracy" metric.
# collectSubModels=True keeps every fitted sub-model on the resulting model.
crossval = CrossValidator(
    estimator=pipeline, 
    estimatorParamMaps=paramGrid, 
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"), 
    numFolds=4,
    collectSubModels=True)
cvModel = crossval.fit(train)

# Predict the labels of the held-out test data with the best model.
res_test = cvModel.bestModel.transform(test)

# Convert the predictions to an RDD of (prediction, label) pairs and compute
# accuracy and the confusion matrix.
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())

In [None]:
# Baseline: LinearSVC with its default parameters and no tuning. svc_model is
# kept because later cells inspect its coefficients.
svc = LinearSVC()
svc_model = svc.fit(train)
res_test = svc_model.transform(test)

# Convert the predictions to an RDD of (prediction, label) pairs and compute
# accuracy and the confusion matrix.
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())

In [None]:
svc_model.coefficients

In [None]:
# NOTE(review): this is a verbatim redefinition of the most_important_features
# function defined in an earlier cell; it silently replaces that definition
# with identical code, so one of the two cells can be deleted.
def most_important_features(x, reverse_index_map, reverse=True):
    """Return ``[(feature, weight), ...]`` ordered by weight.

    ``x`` is an indexable sequence of weights and ``reverse_index_map`` maps a
    position in ``x`` to its feature. With ``reverse`` true the largest
    weights come first; ties are ordered by position.
    """
    feature_weight = [(x[i],i) for i in range(len(x))]
    feature_weight.sort(reverse=reverse)
    return [(reverse_index_map[i[1]], i[0]) for i in feature_weight]

In [None]:
most_important_features(svc_model.coefficients.toArray(), reverse_index_map, True)

In [None]:
print(list(zip(cvModel.avgMetrics, paramGrid)))

In [None]:
# Random forest classifier, tuned by cross-validation on accuracy.
# NOTE(review): rebinds pipeline, paramGrid, crossval, cvModel and res_test
# from the LinearSVC cells.
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

pipeline = Pipeline(stages=[rf])
# Grid: three forest sizes.
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [10,20,30]).build()
# 4-fold cross-validation; candidates are ranked by the "accuracy" metric.
# collectSubModels=True keeps every fitted sub-model on the resulting model.
crossval = CrossValidator(
    estimator=pipeline, 
    estimatorParamMaps=paramGrid, 
    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"), 
    numFolds=4,
    collectSubModels=True)
cvModel = crossval.fit(train)

# Predict the labels of the held-out test data with the best model.
res_test = cvModel.bestModel.transform(test)

# Convert the predictions to an RDD of (prediction, label) pairs and compute
# accuracy and the confusion matrix.
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())
# Average cross-validation metric paired with each parameter setting.
print(list(zip(cvModel.avgMetrics, paramGrid)))

In [None]:
# Gradient-boosted trees with default parameters and no tuning.
gbt = GBTClassifier(labelCol="label", featuresCol="features")
gbt_model = gbt.fit(train)
res_test = gbt_model.transform(test)

# Convert the predictions to an RDD of (prediction, label) pairs and compute
# accuracy and the confusion matrix.
preds_and_labels = res_test.select("prediction", "label").rdd.map(lambda x: (x[0], x[1]))
metrics = MulticlassMetrics(preds_and_labels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())

In [None]:
# Rebuild the feature table from the raw (unscaled) count lists and join it
# with the labels, then pull every row to the driver.
# NOTE(review): this rebinds df_features and df, which earlier cells defined
# with scaled features; `train` / `test` were created before and are not
# affected. df_list is only referenced by commented-out code in the
# scikit-learn cells below.
df_features = spark.createDataFrame(sparse_matrix_rdd).withColumnRenamed("_1", "file").withColumnRenamed("_2", "features")
df = df_features.join(df_labels, df_features.file == df_labels.graph_file).select(df_features.features, df_labels.label)
df_list = df.collect()


In [None]:
# Bring the Spark train / test splits to the driver and separate each row into
# its feature vector (column 0) and label (column 1) for scikit-learn.
# The feature column of `train` / `test` is the one already scaled by Spark.
train_list = train.collect()
test_list = test.collect()
X_train = [x[0] for x in train_list]
y_train = [x[1] for x in train_list]
X_test = [x[0] for x in test_list]
y_test = [x[1] for x in test_list]

In [None]:
import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
# NOTE: this import rebinds the name StandardScaler, which the first cell
# imported from pyspark.ml.feature. Earlier Spark cells re-run after this point
# would pick up the scikit-learn class.
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

# scikit-learn cross-check on the same split as the Spark models:
# standardise, then fit an SVM with a linear kernel.
# (To use an independent split instead, build X / y from df_list and call
# train_test_split(X, y, test_size=0.2, random_state=42).)
clf = make_pipeline(StandardScaler(),
    SVC(kernel="linear"))
clf.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
y_pred = clf.predict(X_test)
# confusion_matrix expects (y_true, y_pred): rows are the true labels, columns
# the predicted ones. The arguments used to be swapped, which transposed the
# printed matrix.
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

In [None]:
# NOTE: this import rebinds the name LinearSVC, which the first cell imported
# from pyspark.ml.classification. Earlier Spark cells re-run after this point
# would pick up the scikit-learn class.
from sklearn.svm import LinearSVC

# scikit-learn cross-check on the same split as the Spark models:
# standardise, then fit a LinearSVC with default parameters.
# (To use an independent split instead, build X / y from df_list and call
# train_test_split(X, y, test_size=0.2, random_state=42).)
clf = make_pipeline(StandardScaler(),
    LinearSVC())
clf.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
y_pred = clf.predict(X_test)
# confusion_matrix expects (y_true, y_pred): rows are the true labels, columns
# the predicted ones. The arguments used to be swapped, which transposed the
# printed matrix.
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))