In [None]:
# -*- coding: utf-8 -*-
from hyperwords import bible2pairs as b2p
from hyperwords import counts2vocab as c2v
from hyperwords import text2numpy_nonewline as t2np
import sys
import codecs
import os
import subprocess
import collections

In [None]:
def create_pairs(path):
    """Run ``bible2pairs`` over ``path`` and store its output in ``<path>/pairs``.

    Python equivalent of the shell step:
        python2 hyperwords/bible2pairs.py 2 ${1} > ./${1}/pairs

    Args:
        path: Directory handed to ``bible2pairs``; it is created when it
            does not exist yet. The output file is placed inside it.

    Returns:
        A two-element list ``[path, pairs_path]`` where ``pairs_path`` is the
        file that received the output.
    """
    if not os.path.exists(path):
        os.makedirs(path)

    # Identical to plain concatenation when `path` already ends with a
    # separator, and still lands inside the directory when it does not.
    pairs_path = os.path.join(path, 'pairs')

    # Mode 'w' discards any output left by a previous run, and the context
    # manager closes the handle even if bible2pairs raises midway.
    with open(pairs_path, 'w') as f_pairs:
        # The leading 2 mirrors the first argument of the shell invocation.
        b2p.bible2pairs(2, path, f_output=f_pairs)

    return [path, pairs_path]

In [None]:
def create_counts(path, pairs_path):
    """Count identical lines of the pairs file and write them to ``<path>/counts``.

    Python re-implementation of ``scripts/pairs2counts.sh``:
        sort -T ~/. $1 | uniq -c
    i.e. sort the lines of the file, then prefix every distinct line with the
    number of times it occurred.

    Args:
        path: Directory in which the ``counts`` file is written.
        pairs_path: File whose lines are counted.

    Returns:
        The path of the ``counts`` file that was written.
    """
    vocab_counter = collections.Counter()
    print('Starting reading pairs into vocab counter')
    with open(pairs_path, 'r') as f_pairs:
        # Iterate the handle directly so the file is streamed instead of
        # being loaded into memory in one go.
        for line in f_pairs:
            # A final line without a terminator would otherwise be counted
            # separately from its terminated twin and be written without a
            # newline, gluing it to the next record in the output.
            if not line.endswith('\n'):
                line += '\n'
            vocab_counter[line] += 1
    print('Finished reading pairs into vocab counter')

    counts_path = os.path.join(path, 'counts')

    print('Starting outputting vocab counter to file')
    # Mode 'w' replaces any counts file left over from an earlier run.
    with open(counts_path, 'w') as f_counts:
        for item, count in sorted(vocab_counter.items()):
            # `item` still carries its own line terminator.
            f_counts.write('      ' + str(count) + ' ' + item)
    print('Finished outputting vocab counter to file')

    return counts_path

In [None]:
def make_sgns_dir(path, sgns_dir, indx):
    """Ensure the indexed output directory exists and return its path.

    Shell equivalent: mkdir ./${1}/sgns

    The directory name is ``<path><sgns_dir>_<indx>/``; nothing is created
    when that path already exists.
    """
    target = ''.join([path, sgns_dir, '_', str(indx), '/'])
    if os.path.exists(target):
        return target
    os.makedirs(target)
    return target

In [None]:
def run_word2vecf(path, sgns_path, pairs_path=None):
    """Invoke the ``word2vecf/word2vecf`` binary and log its stdout.

    Equivalent shell command:
        word2vecf/word2vecf
            -train ./${1}/pairs
            -pow 0.75
            -cvocab ./${1}/counts.contexts.vocab
            -wvocab ./${1}/counts.words.vocab
            -dumpcv ./${1}/sgns/contexts
            -output ./${1}/sgns/words
            -threads 4
            -negative 1
            -iters 100
            -size 500

    Args:
        path: Directory holding ``counts.contexts.vocab`` and
            ``counts.words.vocab``.
        sgns_path: Directory prefix for the ``contexts``, ``words`` and
            ``log`` files.
        pairs_path: File passed to ``-train``. Defaults to ``<path>/pairs``.
            Previously this name was read from a notebook global, which
            raised NameError on a fresh kernel.

    Returns:
        A two-element list ``[sgns_contexts_path, sgns_words_path]``.

    Raises:
        subprocess.CalledProcessError: if the binary exits with a non-zero
            status.
    """
    if pairs_path is None:
        pairs_path = os.path.join(path, 'pairs')

    counts_contexts_path = os.path.join(path, 'counts.contexts.vocab')
    counts_words_path = os.path.join(path, 'counts.words.vocab')
    # sgns_path is used as a raw prefix, so it is expected to end with a
    # path separator.
    sgns_contexts_path = sgns_path + 'contexts'
    sgns_words_path = sgns_path + 'words'
    sgns_log_path = sgns_path + 'log'

    args = ['word2vecf/word2vecf', '-train', pairs_path, '-pow', '0.75',
            '-cvocab', counts_contexts_path, '-wvocab', counts_words_path,
            '-dumpcv', sgns_contexts_path, '-output', sgns_words_path,
            '-threads', '4', '-negative', '1', '-iters', '100', '-size', '500']

    print('Calling word2vecf with args: ' + str(args))
    # The log is opened in append mode, so output from repeated runs
    # accumulates in the same file.
    with open(sgns_log_path, 'a') as log_out:
        subprocess.check_call(args, stdout=log_out, shell=False)

    return [sgns_contexts_path, sgns_words_path]

In [None]:
def vecs_to_numpy(sgns_path):
    """Run ``text2numpy_nonewline`` on every ``*.vecs`` file in ``sgns_path``.

    Shell equivalent:
        for v in `ls ./${1}/sgns/*.vecs`; do
            python2 hyperwords/text2numpy_nonewline.py ${v};
        done

    Args:
        sgns_path: Directory to scan. Only its direct entries are examined.

    Returns:
        None. The pair returned by each conversion is printed.
    """
    for filename in os.listdir(sgns_path):
        if not filename.endswith(".vecs"):
            continue
        # os.path.join avoids the doubled separator that manual
        # concatenation produced when sgns_path already ends with '/'.
        vecs_file = os.path.join(sgns_path, filename)
        npy_file, vocab_file = t2np.text2numpy_nonewline(vecs_file)
        # Call form of print: same output on Python 2, valid on Python 3.
        print([npy_file, vocab_file])