In [1]:
import os
import platform
import sys

def get_project_root() -> str:
    """
    Resolves the directory which contains the project's data files.

    If the environment variable ``TANGRAMS_DATA_DIR`` is set to a non-empty value, that value is returned
    so that the notebook can be run on machines other than the author's. Otherwise, the original
    platform-specific default location is used.

    :return: The path of the data directory as a string.
    """
    override = os.environ.get("TANGRAMS_DATA_DIR")
    if override:
        return override
    # Fall back to the locations used on the original development machines
    if platform.system() == "Windows":
        return r"D:\Users\tcshore\Documents\Projects\Tangrams\Data"
    return "/home/tshore/Projects/tangrams-restricted/Data"

# Location of the tab-separated results file, resolved relative to the project data directory
infile_path = os.path.join(
    get_project_root(),
    "wordprobseqs-inflected-small.tsv",
)
# Report the resolved path on stderr so that it is visible before any reading is attempted
print('Will read file "{}".'.format(infile_path), file=sys.stderr)

Will read file "D:\Users\tcshore\Documents\Projects\Tangrams\Data\wordprobseqs-inflected-small.tsv".


In [2]:
import csv
import pandas as pd

# The CSV dialect used for parsing the results file: the stdlib tab-delimited "excel-tab" dialect
RESULTS_FILE_CSV_DIALECT = csv.excel_tab
# Explicit column dtypes passed to "pandas.read_csv(..)" when reading the results file
# NOTE: "category" dtype doesn't work with pandas-0.21.0 but does with pandas-0.21.1
__RESULTS_FILE_DTYPES = {
	"DYAD": "category",
	"WORD": "category",
	"IS_TARGET": bool,
	"IS_OOV": bool,
	"IS_INSTRUCTOR": bool,
	"SHAPE": "category",
	"ONLY_INSTRUCTOR": bool,
	"WEIGHT_BY_FREQ": bool,
}

def read_results_file(inpath: str, encoding: str) -> pd.DataFrame:
	"""
	Reads a results file into a DataFrame, logging the path and encoding to stderr beforehand.

	:param inpath: The path of the file to read.
	:param encoding: The name of the character encoding to decode the file with.
	:return: A DataFrame of the file contents, with the column dtypes declared in the module-level dtype mapping.
	"""
	print("Reading \"{}\" using encoding \"{}\".".format(inpath, encoding), file=sys.stderr)
	read_options = dict(
		dialect=RESULTS_FILE_CSV_DIALECT,
		sep=RESULTS_FILE_CSV_DIALECT.delimiter,
		# Parse floating-point values with the "round_trip" converter
		float_precision="round_trip",
		encoding=encoding,
		memory_map=True,
		dtype=__RESULTS_FILE_DTYPES,
	)
	return pd.read_csv(inpath, **read_options)

# Load the cross-validation results; the file is decoded as Windows-1252
cv_results = read_results_file(infile_path, "windows-1252")
# Summarize how many rows were read and how many distinct dyads they cover
print(
	"Read {} cross-validation results for {} dyad(s).".format(len(cv_results.index), cv_results["DYAD"].nunique()),
	file=sys.stderr,
)

Reading "D:\Users\tcshore\Documents\Projects\Tangrams\Data\wordprobseqs-inflected-small.tsv" using encoding "windows-1252".
Read 180880 cross-validation results for 2 dyad(s).


In [3]:
def find_target_ref_rows(df: pd.DataFrame) -> pd.DataFrame:
	"""
	Selects the rows of a DataFrame for which the "IS_TARGET" column is true and logs the number of
	target and non-target rows, along with their ratio, to stderr.

	Rows are counted positionally using the boolean mask itself, so the counts are correct even if the
	index contains duplicate labels. If there are no target rows at all, the ratio is reported as
	infinity (or NaN for an empty frame) rather than raising a ZeroDivisionError.

	:param df: The DataFrame to filter; it must have an "IS_TARGET" column.
	:return: The rows of the given DataFrame for which "IS_TARGET" is true.
	"""
	is_target = df["IS_TARGET"].eq(True)
	result = df.loc[is_target]
	result_row_count = int(is_target.sum())
	complement_row_count = df.shape[0] - result_row_count
	if result_row_count > 0:
		ratio = complement_row_count / float(result_row_count)
	else:
		# No target rows: avoid dividing by zero
		ratio = float("inf") if complement_row_count > 0 else float("nan")
	print("Found {} nontarget rows and {} target rows. Ratio: {}".format(complement_row_count, result_row_count,
																		 ratio), file=sys.stderr)
	return result

# Keep only the rows for the target referent. The filter returns a selection of the original frame, so an
# explicit copy is taken: a column is assigned onto "cv_results" further down, which should not be done on a
# selection of another frame.
cv_results = find_target_ref_rows(cv_results).copy()

Found 171836 nontarget rows and 9044 target rows. Ratio: 19.0


In [72]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import numpy as np

# Create vocab before splitting training and testing DFs so that the word feature set is stable
vocab_words = tuple(sorted(cv_results["WORD"].unique()))
print("Fitting one-hot encoder for vocabulary of size {}.".format(len(vocab_words)), file=sys.stderr)

# https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/
# Integer-encode each word
label_encoder = LabelEncoder()
vocab_labels = label_encoder.fit_transform(cv_results["WORD"])
# "assign(..)" returns a new frame rather than writing into the filtered one, which also makes this cell
# safe to re-run
cv_results = cv_results.assign(WORD_LABEL=vocab_labels)
# Binary (one-hot) encode the integer labels; the encoder expects a 2D column vector
onehot_encoder = OneHotEncoder(sparse=False)
vocab_labels = vocab_labels.reshape(len(vocab_labels), 1)
onehot_encoder.fit(vocab_labels)
# NOTE(review): "n_values_" only exists in older scikit-learn releases; confirm the installed version
# before upgrading
assert onehot_encoder.n_values_ == len(vocab_words)
# The word for a given one-hot row can be recovered using
# "label_encoder.inverse_transform([np.argmax(row)])"

Fitting one-hot encoder for vocabulary of size 299.


In [73]:
from typing import Tuple

import numpy as np

def split_training_testing(df: pd.DataFrame, test_set_size : int, random_state=None) -> Tuple[pd.DataFrame, pd.DataFrame]:
	"""
	Randomly splits a DataFrame into training and test sets by dyad, so that all rows of a given dyad end
	up in exactly one of the two sets.

	The training dyads are sampled *without* replacement, so exactly ``test_set_size`` dyads are held
	out for testing.

	:param df: The DataFrame to split; it must have a "DYAD" column.
	:param test_set_size: The number of dyads to hold out for the test set.
	:param random_state: An optional seed for a reproducible split; if None, the global NumPy random state is used.
	:return: A pair of the training-set rows and the test-set rows.
	:raises ValueError: If the test set size is negative or leaves no dyads for training.
	"""
	if test_set_size < 0:
		raise ValueError("Test set size must be non-negative but was {}.".format(test_set_size))
	dyad_ids = df["DYAD"].unique()
	training_set_size = len(dyad_ids) - test_set_size
	if training_set_size < 1:
		raise ValueError("Desired test set size is {} but only {} dyads found.".format(test_set_size, len(dyad_ids)))

	rng = np.random if random_state is None else np.random.RandomState(random_state)
	# "replace=False" is essential: the default samples with replacement and can pick the same dyad twice
	training_set_dyads = frozenset(rng.choice(np.asarray(dyad_ids), training_set_size, replace=False))
	print("Training set dyads: {}".format(sorted(training_set_dyads)), file=sys.stderr)
	training_set_idxs = df["DYAD"].isin(training_set_dyads)
	training_set = df.loc[training_set_idxs]
	test_set = df.loc[~training_set_idxs]
	test_set_dyads = frozenset(test_set["DYAD"].unique())
	print("Test set dyads: {}".format(sorted(test_set_dyads)), file=sys.stderr)

	# Sanity check: no dyad may occur in both sets
	assert not frozenset(training_set["DYAD"].unique()).intersection(test_set_dyads)
	return training_set, test_set
    
# Hold out a single dyad for testing and take a shallow copy of each of the two resulting frames
training_df, test_df = (
	subset.copy(deep=False) for subset in split_training_testing(cv_results, 1)
)

Training set dyads: ['1']
Test set dyads: ['17']


In [89]:
from typing import Iterator, List, Mapping

class SequenceMatrixFactory(object):
	"""
	Creates feature matrices from DataFrames of token rows: the rows are grouped into token sequences and
	each row is turned into a feature vector consisting of the one-hot encoding of its word label followed
	by three additional features (is-instructor flag, is-OOV flag and probability score).
	"""

	def __init__(self, label_encoder, onehot_encoder):
		"""
		:param label_encoder: The encoder used for creating the word labels; it is only stored, not used by the methods of this class.
		:param onehot_encoder: A fitted one-hot encoder used for transforming the "WORD_LABEL" value of each row.
		"""
		self.label_encoder = label_encoder
		self.onehot_encoder = onehot_encoder

	@property
	def feature_count(self) -> int:
		"""
		:return: The length of each feature vector, i.e. the number of one-hot word features plus the three additional features.
		"""
		# Newer scikit-learn releases expose "categories_"; older ones only have "n_values_"
		categories = getattr(self.onehot_encoder, "categories_", None)
		if categories is not None:
			word_features = sum(len(feature_categories) for feature_categories in categories)
		else:
			word_features = int(np.sum(self.onehot_encoder.n_values_))
		return word_features + 3

	def create_datapoint_feature_array(self, row: Mapping) -> np.ndarray:
		"""
		Creates the feature vector for a single token row.

		:param row: A mapping of column names to values; "WORD_LABEL", "IS_INSTRUCTOR", "IS_OOV" and "PROBABILITY" are read.
		:return: A 1D array with the one-hot word features first, followed by the is-instructor flag, the is-OOV flag and the probability.
		"""
		word_label = row["WORD_LABEL"]
		# The encoder expects a 2D (samples x features) array, so wrap the single label accordingly;
		# "OneHotEncoder.transform(..)" returns a matrix even if only a single value is passed to it, so get just the first (and only) row
		word_features = self.onehot_encoder.transform(np.array([[word_label]]))[0]
		is_instructor = 1.0 if row["IS_INSTRUCTOR"] else 0.0
		is_oov = 1.0 if row["IS_OOV"] else 0.0
		score = row["PROBABILITY"]
		other_features = np.array((is_instructor, is_oov, score))
		return np.concatenate((word_features, other_features))

	def __call__(self, df : pd.DataFrame) -> np.ndarray:
		"""
		Creates one sequence of feature vectors for each token sequence in the given DataFrame.

		The given DataFrame is not modified: a sorted copy is used for grouping.

		:param df: The token rows to convert; they must have "TOKEN_SEQ_ORDINALITY", the grouping columns and the columns read for each datapoint.
		:return: An array with one entry per token sequence, each entry being the feature vectors of its tokens in order.
		"""
		# https://stackoverflow.com/a/47815400/1391325
		ordered_df = df.sort_values("TOKEN_SEQ_ORDINALITY")
		# The keys must be passed as a list: a tuple can be interpreted as one single key
		sequence_groups = ordered_df.groupby(["CROSS_VALIDATION_ITER", "DYAD", "SPLIT_SEQ_NO", "UTT_START_TIME", "UTT_END_TIME", "ENTITY"], as_index=False)
		# NOTE(review): sequences of differing lengths do not form a regular array here; confirm how they should be handled
		return np.array(tuple(tuple(self.__create_feature_vectors(seq)) for _, seq in sequence_groups))

	def __create_feature_vectors(self, df : pd.DataFrame) -> Iterator[np.ndarray]:
		"""
		:param df: The rows of a single token sequence.
		:return: A generator of the feature vector for each row, in row order.
		"""
		# noinspection PyProtectedMember
		return (self.create_datapoint_feature_array(row._asdict()) for row in df.itertuples(index=False))

print("Splitting token sequences.", file=sys.stderr)
# A single factory is shared by both data sets so that they use the same fitted encoders
seq_matrix_factory = SequenceMatrixFactory(label_encoder, onehot_encoder)

# Training data
training_matrix = seq_matrix_factory(training_df)
print(
	"Created a training data matrix of shape {}.".format(training_matrix.shape),
	file=sys.stderr,
)

# Test data
test_matrix = seq_matrix_factory(test_df)
print(
	"Created a test data matrix of shape {}.".format(test_matrix.shape),
	file=sys.stderr,
)

Splitting token sequences.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 3

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vector of 302 features.
Created a vect

In [90]:
# Split the training tensor into model inputs and targets: everything but the
# last entry along the final axis is fed to the network, and that last entry
# is the value the network is trained to reproduce.
training_x = training_matrix[:,:,:-1]
print(training_x.shape)
assert len(training_x.shape) == 3
training_y = training_matrix[:,:,-1]
print(training_y.shape)
assert len(training_y.shape) == 2

from keras.layers import Dense
from keras.layers import LSTM
from keras.models import Sequential

model = Sequential()
#word_embeddings = Embedding(len(vocab), embedding_vector_length, input_length=max_review_length)
#model.add(word_embeddings)
# model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
# input shape is a pair of (timesteps, features) <https://stackoverflow.com/a/44583784/1391325>
input_shape = training_x.shape[1:]
print("Input shape: {}".format(input_shape), file=sys.stderr)
# One output value per timestep of the target sequence.
# NOTE(review): the same number is also used as the LSTM's hidden size below;
# confirm that tying the hidden size to the sequence length is intended.
units = training_y.shape[1]
print("Units: {}".format(units), file=sys.stderr)
lstm = LSTM(input_shape=input_shape, units=units)
#lstm = LSTM(batch_input_shape = training_x.shape, stateful = True, units=len(training_y.shape))
model.add(lstm)
model.add(Dense(units, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

(1087, 4, 301)
(1087, 4)


Input shape: (4, 301)
Units: 4


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_10 (LSTM)               (None, 4)                 4896      
_________________________________________________________________
dense_7 (Dense)              (None, 4)                 20        
Total params: 4,916
Trainable params: 4,916
Non-trainable params: 0
_________________________________________________________________
None


In [91]:
# Train the LSTM: a single epoch over all training sequences, keeping the
# value returned by fit() so the training history can be inspected afterwards.
# Full fit() signature, for reference:
#model.fit(x=None, y=None, batch_size=None, epochs=1, verbose=1, callbacks=None, validation_split=0.0, validation_data=None, shuffle=True, class_weight=None, sample_weight=None, initial_epoch=0, steps_per_epoch=None, validation_steps=None)
training_history = model.fit(
    training_x,
    training_y,
    epochs=1,
    verbose=1,
)

# Disabled alternative: fit one sequence at a time and reset the LSTM state
# between sequences <https://machinelearningmastery.com/memory-in-a-long-short-term-memory-network/>
#epochs = 5
#for i in range(epochs):
#    x = training_x[i]
#    x_oldshape = x.shape
#    x_newshape = (-1, x_oldshape[0], x_oldshape[1])
#    x = np.reshape(x, x_newshape)
#    #print("x.shape = {}".format(x.shape), file=sys.stderr)
#    y = training_y[i]
#    y_oldshape = y.shape
#    y_newshape = (-1, y_oldshape[0])
#    y = np.reshape(y, y_newshape)
#    #print("y.shape = {}".format(y.shape), file=sys.stderr)
#    model.fit(x, y, epochs=1, batch_size=1, verbose=1, shuffle=False)
#    model.reset_states()
    
    

Epoch 1/1


In [None]:
# Evaluate the trained LSTM on the test sequences.

# Same split as for training: the last entry along the final axis is the
# target value, everything before it is model input.
test_x = test_matrix[:, :, :-1]
print(test_x.shape)
assert test_x.ndim == 3
test_y = test_matrix[:, :, -1]
print(test_y.shape)
assert test_y.ndim == 2

# Predict every test sequence in one call, then report each one individually.
seq_predicted_values = model.predict(test_x, verbose=0)
model.reset_states()
print("result.shape = {}".format(seq_predicted_values.shape), file=sys.stderr)
for i, tested_seq in enumerate(test_x):
    predicted_values = seq_predicted_values[i]
    actual_values = test_y[i]

    print("Tested sequence: {}".format(tested_seq))
    # Dropping the feature axis of one sequence must leave exactly one
    # prediction slot per timestep.
    assert tested_seq.shape[:-1] == predicted_values.shape
    print("Shape slice: {}".format(tested_seq.shape[:-1]))
    print("Predicted values: {}".format(predicted_values))
    # Predictions and gold values must line up element by element.
    assert predicted_values.shape == actual_values.shape
    print("Actual values: {}".format(actual_values))

(1174, 4, 301)
(1174, 4)
Tested sequence: [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]]
Shape slice: (4,)
Predicted values: [ 0.49094486  0.47552741  0.50151092  0.48321131]
Actual values: [ 0.          0.          0.          0.49167632]
Tested sequence: [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]]
Shape slice: (4,)
Predicted values: [ 0.48534939  0.47656956  0.49088877  0.47445369]
Actual values: [ 0.          0.          0.          0.44697456]
Tested sequence: [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]]
Shape slice: (4,)
Predicted values: [ 0.49174246  0.4788225   0.49900579  0.4859972 ]
Actual values: [ 0.          0.          0.          0.50712497]
Tested sequence: [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 

result.shape = (1174, 4)


Tested sequence: [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]]
Shape slice: (4,)
Predicted values: [ 0.48804504  0.47930703  0.48911819  0.47756729]
Actual values: [ 0.          0.          0.          0.44661592]
Tested sequence: [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]]
Shape slice: (4,)
Predicted values: [ 0.48534939  0.47656956  0.49088877  0.47445369]
Actual values: [ 0.          0.          0.          0.50946684]
Tested sequence: [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]]
Shape slice: (4,)
Predicted values: [ 0.49305993  0.46973455  0.50946331  0.48086902]
Actual values: [ 0.          0.          0.          0.51977502]
Tested sequence: [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  

Tested sequence: [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]]
Shape slice: (4,)
Predicted values: [ 0.4927693   0.47401679  0.49912718  0.47630286]
Actual values: [ 0.          0.          0.51648367  0.78059024]
Tested sequence: [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  1. ...,  0.  1.  0.]]
Shape slice: (4,)
Predicted values: [ 0.48657864  0.47403249  0.50214821  0.48286796]
Actual values: [ 0.          0.          0.5173134   0.52167818]
Tested sequence: [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]]
Shape slice: (4,)
Predicted values: [ 0.48429096  0.47470495  0.48899445  0.46834528]
Actual values: [ 0.          0.          0.50081044  1.        ]
Tested sequence: [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  

 [ 0.  0.  0. ...,  0.  1.  0.]]
Shape slice: (4,)
Predicted values: [ 0.49495038  0.47537899  0.50604934  0.48438418]
Actual values: [ 0.49286146  0.4249552   0.61411127  0.35992191]
Tested sequence: [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  1. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]]
Shape slice: (4,)
Predicted values: [ 0.49741066  0.46702933  0.51516366  0.47796854]
Actual values: [ 0.53002463  0.53612057  0.53874682  0.83120224]
Tested sequence: [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]]
Shape slice: (4,)
Predicted values: [ 0.49481344  0.47787014  0.50612485  0.48651147]
Actual values: [ 0.55333257  0.48248544  0.46007028  0.9484821 ]
Tested sequence: [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  1. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]]
Shape slice: (4,)
Predicted values: [ 0.48886964  0.48006117  0.49586654  0.

 [ 0.  0.  0. ...,  0.  1.  0.]]
Shape slice: (4,)
Predicted values: [ 0.49695063  0.4792811   0.50660908  0.49014133]
Actual values: [ 0.45000042  0.44096912  0.44096912  0.80224498]
Tested sequence: [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]]
Shape slice: (4,)
Predicted values: [ 0.49248204  0.47269991  0.50672507  0.48084867]
Actual values: [ 0.48529415  0.71173696  0.81532638  0.60324978]
Tested sequence: [[ 0.  0.  1. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]]
Shape slice: (4,)
Predicted values: [ 0.48821059  0.47193348  0.50123066  0.47534457]
Actual values: [ 0.50786787  0.54334095  0.55373965  0.43471566]
Tested sequence: [[ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  0.]
 [ 0.  0.  0. ...,  0.  1.  1.]]
Shape slice: (4,)
Predicted values: [ 0.49806383  0.4704679   0.51135737  0.