# Python wrapper for the Java VMSP implementation (SPMF)

## Built-in I/O assumptions

* Input must be a file from the filesystem with integers (not word tokens)
* Output must be a file written to disk with integers

## Method

1. Read in input file that contains string data
1. Create dictionary with mapping from unique strings to unique integers
1. Use dictionary to convert token strings to integers and store as *input.txt*
1. Run VMSP, which creates *output.txt* with integer data
1. Read in *output.txt* and convert integers back to word tokens

## Data samples

1. cat1.txt, cat2.txt, cat3.txt

In [1]:
import pprint
# Pretty-printer configured with a two-space indent, kept as a notebook-level helper.
pp = pprint.PrettyPrinter(indent=2)

In [2]:
# NO INTERNAL REFERENCE
# from https://github.com/fandu/maximal-sequential-patterns-mining
import subprocess


class Vmsp:
    """Minimal wrapper that runs the VMSP algorithm from an SPMF jar.

    The wrapper shells out to ``java -jar <jar> run VMSP <input> <output>
    <min_supp>`` and then parses the text file the tool writes. All three
    paths default to file names relative to the current working directory.
    """

    def __init__(self):
        # Jar, input and output locations; relative to the working directory.
        self._executable = "spmf.jar"
        self._input = "input.txt"
        self._output = "output.txt"

    def run(self, min_supp=1): # originally min_supp=0.5; change to 1 means must be in all witnesses
        """Invoke VMSP on the input file and let it write the output file.

        Args:
            min_supp: Minimum support threshold; converted with ``str()`` and
                passed as the last command-line argument.

        Returns:
            The exit status of the ``java`` process as reported by
            ``subprocess.call``. Earlier versions discarded this value, so
            callers that ignore the result are unaffected.
        """
        # java -jar spmf.jar run VMSP contextPrefixSpan.txt output.txt 50%
        command = [
            "java", "-jar", self._executable,
            "run", "VMSP",
            self._input, self._output,
            str(min_supp),
        ]
        return subprocess.call(command)

    def encode_input(self, data):
        """Placeholder for writing ``data`` to the input file.

        Intentionally does nothing: the input file is expected to be
        prepared by other means before ``run`` is called.
        """
        pass

    def decode_output(self):
        """Parse the output file into a list of patterns.

        Each non-blank line is stripped and split on the ``" -1 "``
        separator, so every pattern is a list of strings. Whatever follows
        the last separator on a line (for example a support annotation, if
        the tool writes one) is kept as the final element.

        Returns:
            A list of lists of strings; an empty list when the output file
            cannot be read.
        """
        # read
        try:
            with open(self._output, "r") as f: # modified to remove deprecated U mode
                lines = f.readlines()
        except (OSError, UnicodeError):
            # Best effort: a missing or unreadable file yields no patterns.
            # Only I/O and decoding failures are handled here, so interrupts
            # and programming errors are no longer silently swallowed.
            print("read_output error") # modified to add parentheses
            return []

        # decode
        patterns = []
        for line in lines:
            line = line.strip()
            if not line:
                # A blank line would otherwise become the bogus pattern [''].
                continue
            patterns.append(line.split(" -1 "))

        return patterns

In [3]:
# Create list of lists with word tokens
# text_data = []
#
# for i in range(1,4): # Small example with cats
#     with open('cat' + str(i) + '.txt', 'r') as f:
#         text_data.append([token for token in f.read().split()])
#
# Find all six Darwin witnesses; filename is darwin18\d\d.txt
# Each paragraph is one line
#
# import glob
# filenames = glob.glob('darwin18??.txt')
# for filename in filenames[:2]: # each of six files
#     file_tokens = [] # all tokens for single file
#     with open(filename, 'r') as f:
#         for paragraph in range(2): # read two paragraphs
#             file_tokens.extend([token for token in f.readline().rstrip().split()])
#         text_data.append(file_tokens)
# print(len(text_data))
# print(text_data)

In [4]:
# Create integer data
# token_to_integer = {}
# integer_data = []
# for witness_data in text_data:
#     witness_integers = []
#     for token in witness_data:
#         if token not in token_to_integer:
#             token_to_integer[token] = len(token_to_integer) # add value to dictionary, use len() for unique value
#         witness_integers.append(str(token_to_integer[token]))
#     integer_data.append(witness_integers)
# print(integer_data)
# print(token_to_integer)

In [5]:
# Write integer data to disk as 'input.txt'
# Each witness is a line
# with open('input.txt', 'w') as f:
#     for witness in integer_data:
#         f.write(" -1 ".join(witness))
#         f.write(' -1 -2\n')

In [6]:
# Check new integer file
# with open('input.txt', 'r') as f:
#     lines = f.readlines()
#     print(lines)

In [7]:
# Do the work
if __name__ == "__main__":
    # Wrapper with its default file names (spmf.jar, input.txt, output.txt).
    vmsp = Vmsp()
    # encode_input is a stub whose body is `pass`, so this call writes nothing;
    # presumably input.txt is already on disk from an earlier step — verify.
    vmsp.encode_input([])
    # Launch the Java VMSP run with the default min_supp.
    vmsp.run()
    # Show the parsed patterns; decode_output gives [] if output.txt is unreadable.
    print(vmsp.decode_output()) # modified to add parentheses

Picked up _JAVA_OPTIONS: -Djava.io.tmpdir=/home/user/tmp -Xms64m


>/home/user/our_experiment/spmf.jar


An error while trying to run the algorithm. 
 ERROR MESSAGE = java.lang.OutOfMemoryError: Java heap space
[]


java.lang.OutOfMemoryError: Java heap space
	at java.base/java.util.BitSet.initWords(BitSet.java:167)
	at java.base/java.util.BitSet.<init>(BitSet.java:162)
	at ca.pfv.spmf.algorithms.sequentialpatterns.spam.Bitmap.<init>(Bitmap.java:63)
	at ca.pfv.spmf.algorithms.sequentialpatterns.spam.AlgoVMSP.vmsp(AlgoVMSP.java:270)
	at ca.pfv.spmf.algorithms.sequentialpatterns.spam.AlgoVMSP.runAlgorithm(AlgoVMSP.java:134)
	at ca.pfv.spmf.algorithmmanager.descriptions.DescriptionAlgoVMSP.runAlgorithm(DescriptionAlgoVMSP.java:70)
	at ca.pfv.spmf.gui.CommandProcessor.runAlgorithm(CommandProcessor.java:385)
	at ca.pfv.spmf.gui.Main.processCommandLineArguments(Main.java:128)
	at ca.pfv.spmf.gui.Main.main(Main.java:54)


In [8]:
# Convert integer output back to strings
# integer_to_token = {v:k for k, v in token_to_integer.items()} # invert dictionary to decode
# results = [] # hold results as list of lists
# with open('output.txt', 'r') as f:
#     for line in f:
#         results.append([integer_to_token[int(token)] for token in line.split()[:-2] if int(token) != -1])
# print(results)