In [1]:
import math
import csv
from genex.parse import generate_source
from genex.Gcluster_utils import _isOverlap

import genex.database.genex_database as gxdb
from genex.preprocess import min_max_normalize
from genex.utils import normalize_sequence

import heapq
import time
from genex.cluster import sim_between_seq
import matplotlib.pyplot as plt

fn = 'SART2018_HbO_40.csv'

input_list = generate_source(fn, feature_num=5)
input_list = input_list[:24]
normalized_input_list, global_max, global_min = min_max_normalize(input_list)

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import *

num_cores = 8

conf = SparkConf(). \
    setMaster("local[" + str(num_cores) + "]"). \
    setAppName("Genex").set('spark.driver.memory', '15G'). \
    set('spark.driver.maxResultSize', '15G')
sc = SparkContext(conf=conf)
spark = SQLContext.getOrCreate(sc)
input_rdd = sc.parallelize(normalized_input_list, numSlices=num_cores)
partition_input = input_rdd.glom().collect()

from genex.preprocess import all_sublists_with_id_length

gstart_time = time.time()
group_rdd = input_rdd.flatMap(
    lambda x: all_sublists_with_id_length(x, [260]))
partition_group = group_rdd.glom().collect()
gend_time = time.time()
gtime = gend_time - gstart_time
gtime = time.strftime("%H:%M:%S", time.gmtime(gtime))
print("Group time:------------------------------ " + str(gtime))
file = open('time_log.txt', 'a')
file.write('Group time is :' + gtime)
file.close()

from genex.cluster import filter_cluster

start_time = time.time()
cluster_rdd = group_rdd.mapPartitions(lambda x: filter_cluster(groups=x, st=0.05, log_level=1),
                                      preservesPartitioning=False).cache()
cluster_partition = cluster_rdd.glom().collect()
testEle = next(iter(cluster_partition[0][0][1].items()))

Group time:------------------------------ 00:00:01


In [2]:
idStruct = StructType([
        (StructField("Subject Name", StringType())),
        (StructField("Event Name", StringType())),
        (StructField("Channel Name", StringType())),
        (StructField("Start time", StringType())),
        (StructField("End Time", StringType()))
    ])

schema = StructType([
 StructField("id", idStruct, True),
 StructField("start", IntegerType(), True),
 StructField("end", IntegerType(), True)])
# 
# schema_arr = StructType(
#     [StructField("mem", ArrayType())]
# )
schema_nested = StructType(
    [
        StructField("repres", schema),
        StructField("mem", ArrayType(schema))
    ]
)

In [3]:
def formSchema(genex_sequence):
    id = [{'Subject Name': genex_sequence.id[0]}, 
      {'Event Name': genex_sequence.id[1]}, 
      {'Channel Name': genex_sequence.id[2]}, 
      {'Start time': genex_sequence.id[3]}, 
      {'End Time': genex_sequence.id[4]}]
    d = {'id': id, 'start':  genex_sequence.start, 'end':  genex_sequence.end}
    return d



In [6]:
data_list = list(map(lambda x: formSchema(x), testEle[1]))
single_data = formSchema(testEle[0])

 
# df = spark.createDataFrame(vals, columns)
# 
# newRow = spark.createDataFrame([(4,5,7)], columns)
# appended = df.union(newRow)

In [6]:
data_frame = spark.createDataFrame(sc.emptyRDD(), schema_nested)
mem_ls = []
for cluster in cluster_partition:
    for clu in cluster:
        for repre, mem_list in clu[1].items():
            represnetative = formSchema(repre)
            for mem in mem_list:
                mem_ls.append(formSchema(mem))
            newRow = spark.createDataFrame([(represnetative, mem_ls)],schema_nested)
            data_frame = data_frame.union(newRow)
            mem_ls = []

In [7]:
data_frame.show()

+--------------------+--------------------+
|              repres|                 mem|
+--------------------+--------------------+
|[[{Subject Name=1...|[[[{Subject Name=...|
|[[{Subject Name=1...|[[[{Subject Name=...|
|[[{Subject Name=1...|[[[{Subject Name=...|
|[[{Subject Name=1...|[[[{Subject Name=...|
|[[{Subject Name=1...|[[[{Subject Name=...|
|[[{Subject Name=1...|[[[{Subject Name=...|
|[[{Subject Name=1...|[[[{Subject Name=...|
|[[{Subject Name=1...|[[[{Subject Name=...|
|[[{Subject Name=1...|[[[{Subject Name=...|
|[[{Subject Name=1...|[[[{Subject Name=...|
|[[{Subject Name=1...|[[[{Subject Name=...|
|[[{Subject Name=1...|[[[{Subject Name=...|
|[[{Subject Name=1...|[[[{Subject Name=...|
|[[{Subject Name=1...|[[[{Subject Name=...|
|[[{Subject Name=1...|[[[{Subject Name=...|
|[[{Subject Name=1...|[[[{Subject Name=...|
|[[{Subject Name=1...|[[[{Subject Name=...|
|[[{Subject Name=1...|[[[{Subject Name=...|
|[[{Subject Name=1...|[[[{Subject Name=...|
|[[{Subject Name=1...|[[[{Subjec

In [11]:
single_data_frame = spark.createDataFrame([single_data],schema)
# temp = spark.createDataFrame(Row(**x) for x in data_list)
# temp.show(truncate=False)
single_data_frame.printSchema()

root
 |-- id: struct (nullable = true)
 |    |-- Subject Name: string (nullable = true)
 |    |-- Event Name: string (nullable = true)
 |    |-- Channel Name: string (nullable = true)
 |    |-- Start time: string (nullable = true)
 |    |-- End Time: string (nullable = true)
 |-- start: integer (nullable = true)
 |-- end: integer (nullable = true)



In [7]:
data_frame = spark.createDataFrame([(single_data,data_list)], schema_nested).show()

+--------------------+--------------------+
|              repres|                 mem|
+--------------------+--------------------+
|[[{Subject Name=1...|[[[{Subject Name=...|
+--------------------+--------------------+



In [43]:
single_data_frame = spark.createDataFrame(data_list,schema)
single_data_frame.show()

+--------------------+-----+---+
|                  id|start|end|
+--------------------+-----+---+
|[{Subject Name=10...|    7|266|
|[{Subject Name=10...|    2|261|
|[{Subject Name=10...|    6|265|
|[{Subject Name=10...|   11|270|
|[{Subject Name=10...|    8|267|
|[{Subject Name=10...|    3|262|
|[{Subject Name=10...|    0|259|
|[{Subject Name=10...|    9|268|
|[{Subject Name=10...|    5|264|
|[{Subject Name=10...|   13|272|
|[{Subject Name=10...|   12|271|
|[{Subject Name=10...|   10|269|
|[{Subject Name=10...|    1|260|
|[{Subject Name=10...|    4|263|
+--------------------+-----+---+



In [36]:
# from pyspark.sql import Row
# Row(**cluster_partition[0][0][1])
# readjson? 
temp.select("id").show()

+--------------------+
|                  id|
+--------------------+
|[[Subject Name ->...|
|[[Subject Name ->...|
|[[Subject Name ->...|
|[[Subject Name ->...|
|[[Subject Name ->...|
|[[Subject Name ->...|
|[[Subject Name ->...|
|[[Subject Name ->...|
|[[Subject Name ->...|
|[[Subject Name ->...|
|[[Subject Name ->...|
|[[Subject Name ->...|
|[[Subject Name ->...|
|[[Subject Name ->...|
+--------------------+



In [21]:
 data_list = data_schema_list = list(map(lambda x: formSchema(x), testEle[1]))

In [22]:
# schema_dict = {'fields': [
#     {'metadata': {}, 'name': 'id', 'nullable': True, 'type': 'integer'},
#     {'metadata': {}, 'name': 'created_at', 'nullable': True, 'type': 'timestamp'}
# ], 'type': 'struct'}
# StructType.fromJson(schema_dict)

<map at 0x11664f080>

In [35]:
cluster_partition

[[(260,
   {<genex.classes.Sequence.Sequence at 0x116907a20>: [<genex.classes.Sequence.Sequence at 0x116907a20>],
    <genex.classes.Sequence.Sequence at 0x1169076d8>: [<genex.classes.Sequence.Sequence at 0x1169076d8>,
     <genex.classes.Sequence.Sequence at 0x116907780>,
     <genex.classes.Sequence.Sequence at 0x1169077b8>,
     <genex.classes.Sequence.Sequence at 0x1169077f0>,
     <genex.classes.Sequence.Sequence at 0x116907828>,
     <genex.classes.Sequence.Sequence at 0x116907860>,
     <genex.classes.Sequence.Sequence at 0x116907898>,
     <genex.classes.Sequence.Sequence at 0x1169078d0>,
     <genex.classes.Sequence.Sequence at 0x116907908>,
     <genex.classes.Sequence.Sequence at 0x116907940>,
     <genex.classes.Sequence.Sequence at 0x116907978>,
     <genex.classes.Sequence.Sequence at 0x1169079b0>,
     <genex.classes.Sequence.Sequence at 0x1169079e8>],
    <genex.classes.Sequence.Sequence at 0x116907550>: [<genex.classes.Sequence.Sequence at 0x116907550>,
     <genex.cla