# Python wrapper for the Java SPMF implementation

## Built-in I/O assumptions

* Input must be a file from the filesystem with integers (not word tokens)
* Output must be a file written to disk with integers

## Method

1. Read in input file that contains string data
1. Create dictionary with mapping from unique strings to unique integers
1. Use dictionary to convert token strings to integers and store as *input.txt*
1. Run SPMF (through the `Vmsp` wrapper class), which creates *output.txt* with integer data
1. Read in *output.txt* and convert integers back to word tokens

## Data samples

1. cat1.txt, cat2.txt, cat3.txt

In [1]:
import pprint
# Shared pretty-printer instance configured with a two-space indent.
pp = pprint.PrettyPrinter(indent=2)

In [2]:
# NO INTERNAL REFERENCE
# from https://github.com/fandu/maximal-sequential-patterns-mining
import subprocess

# TODO: Rename class
class Vmsp:
    """Thin wrapper that runs the SPMF jar as a subprocess and reads its output.

    The jar name and the input/output file names are fixed in ``__init__``.
    They are relative paths, so they resolve against the current working
    directory.
    """

    def __init__(self):
        self._executable = "spmf.jar"
        self._input = "input.txt"
        self._output = "output.txt"

    def run(self, min_supp=0.5):
        """Run the BIDE+ algorithm from the SPMF jar on the input file.

        Executes ``java -jar spmf.jar run BIDE+ input.txt output.txt 1`` and
        waits for it to finish. The exit status is not inspected.

        Args:
            min_supp: Accepted for interface compatibility only.

        NOTE(review): ``min_supp`` is NOT forwarded to SPMF. The final
        command-line argument is hard-coded to ``"1"``; per the original
        author's note, 1 means a pattern must occur in all witnesses.
        Wire the parameter through once the intended default is agreed.
        """
        # Alternatives tried earlier with the same jar and files:
        #   "TSP_nonClosed" with final argument "5", and "TKS" with "1".
        # SPMF's own example: java -jar spmf.jar run PrefixSpan contextPrefixSpan.txt output.txt 50%
        subprocess.call(
            ["java", "-jar", self._executable, "run", "BIDE+", self._input, self._output, "1"]
        )

    def encode_input(self, data):
        """Placeholder; currently does nothing with ``data`` and returns None."""
        pass

    def decode_output(self):
        """Read the output file and split every line into its fields.

        Returns:
            A list with one entry per non-blank line of the output file.
            Each entry is the list of strings obtained by stripping the line
            and splitting it on ``" -1 "``. An empty list is returned when
            the file cannot be read.
        """
        try:
            with open(self._output, "r") as f:
                lines = f.readlines()
        except (OSError, UnicodeDecodeError):
            # Best effort: report the problem and fall back to "no patterns".
            # Only I/O and decoding failures are caught, so KeyboardInterrupt
            # and programming errors still propagate.
            print("read_output error")
            return []

        patterns = []
        for line in lines:
            line = line.strip()
            if not line:
                # A blank line carries no pattern; skipping it avoids a
                # spurious [''] entry in the result.
                continue
            patterns.append(line.split(" -1 "))

        return patterns

In [3]:
# Create list of lists with word tokens: one inner list per witness file
text_data = []

# Small example with cats (disabled):
# for i in range(1,4):
#     with open('cat' + str(i) + '.txt', 'r') as f:
#         text_data.append([token for token in f.read().split()])
#
# Find the Darwin witnesses; filenames match darwin18??.txt
# Each paragraph is one line
#
import glob
# glob.glob() returns matches in arbitrary, filesystem-dependent order.
# Sort so that the slice below selects the same files on every run.
filenames = sorted(glob.glob('darwin18??.txt'))
for filename in filenames[:2]:  # only the first two witnesses are used
    file_tokens = []  # all tokens for single file
    with open(filename, 'r') as f:
        for _ in range(2):  # read two paragraphs (one line each)
            # split() with no argument discards surrounding whitespace,
            # including the trailing newline.
            file_tokens.extend(f.readline().split())
    text_data.append(file_tokens)
print(len(text_data))
print(text_data)

2
[['Causes', 'of', 'Variability.', 'WHEN', 'we', 'look', 'to', 'the', 'individuals', 'of', 'the', 'same', 'variety', 'or', 'sub-variety', 'of', 'our', 'older', 'cultivated', 'plants', 'and', 'animals,', 'one', 'of', 'the', 'first', 'points', 'which', 'strikes', 'us,', 'is,', 'that', 'they', 'generally', 'differ', 'more', 'from', 'each', 'other', 'than', 'do', 'the', 'individuals', 'of', 'any', 'one', 'species', 'or', 'variety', 'in', 'a', 'state', 'of', 'nature.', 'When', 'we', 'reflect', 'on', 'the', 'vast', 'diversity', 'of', 'the', 'plants', 'and', 'animals', 'which', 'have', 'been', 'cultivated,', 'and', 'which', 'have', 'varied', 'during', 'all', 'ages', 'under', 'the', 'most', 'different', 'climates', 'and', 'treatment,', 'I', 'think', 'we', 'are', 'driven', 'to', 'conclude', 'that', 'this', 'great', 'variability', 'is', 'simply', 'due', 'to', 'our', 'domestic', 'productions', 'having', 'been', 'raised', 'under', 'conditions', 'of', 'life', 'not', 'so', 'uniform', 'as,', 'and', 

In [4]:
# Map every distinct token to an integer ID and re-express each witness
# as a list of those IDs (kept as strings, ready for joining).
token_to_integer = {}
integer_data = []
for witness_data in text_data:
    # setdefault() stores the current dictionary size as the ID the first
    # time a token is seen, and returns the stored ID on every later sighting.
    witness_integers = [
        str(token_to_integer.setdefault(token, len(token_to_integer)))
        for token in witness_data
    ]
    integer_data.append(witness_integers)
print(integer_data)
print(token_to_integer)

[['0', '1', '2', '3', '4', '5', '6', '7', '8', '1', '7', '9', '10', '11', '12', '1', '13', '14', '15', '16', '17', '18', '19', '1', '7', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '7', '8', '1', '36', '19', '37', '11', '10', '38', '39', '40', '1', '41', '42', '4', '43', '44', '7', '45', '46', '1', '7', '16', '17', '47', '22', '48', '49', '50', '17', '22', '48', '51', '52', '53', '54', '55', '7', '56', '57', '58', '17', '59', '60', '61', '4', '62', '63', '6', '64', '26', '65', '66', '67', '68', '69', '70', '6', '13', '71', '72', '73', '49', '74', '55', '75', '1', '76', '77', '78', '79', '80', '17', '81', '57', '82', '83', '6', '22', '7', '84', '48', '49', '85', '55', '41', '86', '68', '87', '60', '88', '89', '90', '38', '7', '91', '92', '93', '94', '95', '26', '65', '67', '96', '97', '98', '99', '100', '101', '1', '102', '103', '104', '105', '106', '26', '107', '108', '109', '97', '85', '52', '110', '111', '6', '7', '112', '75', '1', 

In [5]:
# Serialise the integer data to 'input.txt', one witness per line.
# Items are separated by " -1 " and every line is closed with " -1 -2".
with open('input.txt', 'w') as f:
    f.writelines(" -1 ".join(witness) + ' -1 -2\n' for witness in integer_data)

In [6]:
# Read 'input.txt' back and display its lines to verify what was written
with open('input.txt', 'r') as f:
    lines = list(f)
print(lines)

['0 -1 1 -1 2 -1 3 -1 4 -1 5 -1 6 -1 7 -1 8 -1 1 -1 7 -1 9 -1 10 -1 11 -1 12 -1 1 -1 13 -1 14 -1 15 -1 16 -1 17 -1 18 -1 19 -1 1 -1 7 -1 20 -1 21 -1 22 -1 23 -1 24 -1 25 -1 26 -1 27 -1 28 -1 29 -1 30 -1 31 -1 32 -1 33 -1 34 -1 35 -1 7 -1 8 -1 1 -1 36 -1 19 -1 37 -1 11 -1 10 -1 38 -1 39 -1 40 -1 1 -1 41 -1 42 -1 4 -1 43 -1 44 -1 7 -1 45 -1 46 -1 1 -1 7 -1 16 -1 17 -1 47 -1 22 -1 48 -1 49 -1 50 -1 17 -1 22 -1 48 -1 51 -1 52 -1 53 -1 54 -1 55 -1 7 -1 56 -1 57 -1 58 -1 17 -1 59 -1 60 -1 61 -1 4 -1 62 -1 63 -1 6 -1 64 -1 26 -1 65 -1 66 -1 67 -1 68 -1 69 -1 70 -1 6 -1 13 -1 71 -1 72 -1 73 -1 49 -1 74 -1 55 -1 75 -1 1 -1 76 -1 77 -1 78 -1 79 -1 80 -1 17 -1 81 -1 57 -1 82 -1 83 -1 6 -1 22 -1 7 -1 84 -1 48 -1 49 -1 85 -1 55 -1 41 -1 86 -1 68 -1 87 -1 60 -1 88 -1 89 -1 90 -1 38 -1 7 -1 91 -1 92 -1 93 -1 94 -1 95 -1 26 -1 65 -1 67 -1 96 -1 97 -1 98 -1 99 -1 100 -1 101 -1 1 -1 102 -1 103 -1 104 -1 105 -1 106 -1 26 -1 107 -1 108 -1 109 -1 97 -1 85 -1 52 -1 110 -1 111 -1 6 -1 7 -1 112 -1 75 -1 1 -1 

In [7]:
%env _JAVA_OPTIONS=-Djava.io.tmpdir=/home/user/tmp -Xms700m


# Run the miner end to end: (no-op) encode, execute the jar, then show
# the patterns read back from the output file.
if __name__ == "__main__":
    vmsp = Vmsp()
    vmsp.encode_input([])
    vmsp.run()
    patterns = vmsp.decode_output()
    print(patterns)

env: _JAVA_OPTIONS=-Djava.io.tmpdir=/home/user/tmp -Xms700m


Picked up _JAVA_OPTIONS: -Djava.io.tmpdir=/home/user/tmp -Xms700m


>/home/user/our_experiment/spmf.jar


In [0]:
# Translate the integer patterns in 'output.txt' back into word tokens
integer_to_token = {number: word for word, number in token_to_integer.items()}  # inverse lookup table
results = []  # one list of tokens per output line
with open('output.txt', 'r') as f:
    for line in f:
        # The last two whitespace-separated fields are dropped
        # (presumably SPMF's "#SUP: n" suffix -- confirm against the output file).
        item_ids = (int(field) for field in line.split()[:-2])
        # -1 is a separator, not an item, so it is filtered out before lookup.
        results.append([integer_to_token[item_id] for item_id in item_ids if item_id != -1])
print(results)