In [1]:
import os
import platform
import sys

def get_project_root() -> str:
    system = platform.system()
    return r"D:\Users\tcshore\Documents\Projects\Tangrams\Data" if system == "Windows" else "/home/tshore/Projects/tangrams-restricted/Data"

infile_path = os.path.join(get_project_root(), "wordscores.tsv")
print("Will read file \"{}\".".format(infile_path), file=sys.stderr)

Will read file "/home/tshore/Projects/tangrams-restricted/Data/wordscores.tsv".


In [2]:
import csv
import pandas as pd

RESULTS_FILE_CSV_DIALECT = csv.excel_tab
# NOTE: "category" dtype doesn't work with pandas-0.21.0 but does with pandas-0.21.1
__RESULTS_FILE_DTYPES = {"DYAD": "category", "WORD": "category", "IS_TARGET": bool, "IS_OOV": bool,
				 "IS_INSTRUCTOR": bool, "SHAPE": "category", "ONLY_INSTRUCTOR": bool, "WEIGHT_BY_FREQ": bool}

def read_results_file(inpath: str, encoding: str) -> pd.DataFrame:
	print("Reading \"{}\" using encoding \"{}\".".format(inpath, encoding), file=sys.stderr)
	result = pd.read_csv(inpath, dialect=RESULTS_FILE_CSV_DIALECT, sep=RESULTS_FILE_CSV_DIALECT.delimiter,
						 float_precision="round_trip",
						 encoding=encoding, memory_map=True, dtype=__RESULTS_FILE_DTYPES)
	return result

cv_results = read_results_file(infile_path, "windows-1252")

Reading "/home/tshore/Projects/tangrams-restricted/Data/wordscores.tsv" using encoding "windows-1252".


In [3]:
print("Read {} cross-validation results for {} dyad(s).".format(cv_results.shape[0], len(cv_results["DYAD"].unique())),
      file=sys.stderr)
entity_ids = frozenset(cv_results["ENTITY"].unique())
print("Found {} unique entity IDs.".format(len(entity_ids)), file=sys.stderr)

Read 1206840 cross-validation results for 39 dyad(s).
Found 20 unique entity IDs.


In [4]:
def are_all_entities_represented(df: pd.DataFrame, entity_ids) -> bool:
	"""
	Checks if all entities are represented for each individual token in each utterance in the dataframe.
    
	:param df: The dataframe to check.
	:param entity_ids: A collection of all unique entity IDs.
	:return: true iff for each token in each utterance there is one row for each entity ID.
	"""
	utt_toks = df.groupby(
		("CROSS_VALIDATION_ITER", "DYAD", "ROUND", "UTT_START_TIME", "UTT_END_TIME", "WORD", "TOKEN_SEQ_ORDINALITY"), as_index=False)
	print("Found {} utterance tokens for all cross-validations.".format(len(utt_toks)), file=sys.stderr)
	# Check if there is a row for each entity (possible referent) for each token
	return all(utt_toks.apply(lambda group: is_collection_equivalent(group["ENTITY"], entity_ids)))

def is_collection_equivalent(c1, c2) -> bool:
	c1_len = len(c1)
	c2_len = len(c2)
	return c1_len == c2_len and all(elem in c2 for elem in c1)

assert are_all_entities_represented(cv_results, entity_ids)

Found 60342 utterance tokens for all cross-validations.


In [5]:
def find_target_ref_rows(df: pd.DataFrame) -> pd.DataFrame:
    result = df.loc[df["IS_TARGET"] == True]
    result_row_count = result.shape[0]
    complement_row_count = df.loc[~df.index.isin(result.index)].shape[0]
    assert result_row_count + complement_row_count == df.shape[0]
    print("Found {} nontarget rows and {} target rows. Ratio: {}".format(result_row_count, complement_row_count, complement_row_count / float(result_row_count)), file=sys.stderr)
    return result

cv_results = find_target_ref_rows(cv_results)

Found 60342 nontarget rows and 1146498 target rows. Ratio: 19.0


In [13]:
import math
from typing import List, Tuple

import numpy as np

class TokenSequenceFactory(object):

	def __init__(self, seq_len: int):
		self.seq_len = seq_len
		self.iter = 0

	def __call__(self, df: pd.DataFrame) -> Tuple[List[np.array], List[np.array]]:
		"""
		Creates a sequence of sequences of tokens, each representing an utterance, each of which thus causes an "interruption" in the chain
		so that e.g. the first token of one utterance is not learned as dependent on the last token of the utterance preceding it.
		:param df: The DataFrame to process.
		:return: Paired lists of 2D numpy arrays, each representing a sequence of datapoints which represents an utterance and the corresponding scores to predict.
		"""
        # https://stackoverflow.com/a/47815400/1391325
		df.sort_values("TOKEN_SEQ_ORDINALITY", inplace=True)
		sequences = df.groupby(("CROSS_VALIDATION_ITER", "DYAD", "ROUND", "UTT_START_TIME", "UTT_END_TIME", "ENTITY"),
							   as_index=False)
        
		word_seqs = []
		score_seqs = []
		split_seq_scores = sequences.apply(self.__split_rows)
		for word_seq, score_seq in split_seq_scores:
			word_seqs.extend(word_seq)
			score_seqs.extend(score_seq)
		assert all(len(seq) == self.__seq_len for seq in word_seqs) 
		return word_seqs, score_seqs

	def __split_rows(self, df: pd.DataFrame):
		#print(df["IS_TARGET"].unique())
		print("Iter:" + str(self.iter))
		#print(df["CROSS_VALIDATION_ITER"].unique())
		#print(df["WORD"])
		print(df)           
		self.iter += 1
		#max_ordinality = df["TOKEN_SEQ_ORDINALITY"].max()
		#print(max_ordinality)
		#for window_end_ordinality in range(1, row_count + 1):
		#	window_start_ordinality = window_end_ordinality - self.seq_len
		#	preceding_window = df.loc[(df["TOKEN_SEQ_ORDINALITY"] >= window_start_ordinality) & (df["TOKEN_SEQ_ORDINALITY"] <= window_end_ordinality)]
		#	print(preceding_window)
        
		#seq_words = df["WORD"].values
		#seq_scores = df["PROBABILITY"].values
        
		#partition_count = math.ceil(len(seq_words) / self.__seq_len_divisor)
		#split_seq_words = np.array_split(seq_words, partition_count)
		#split_seq_scores = np.array_split(seq_scores, partition_count)
		return split_seq_words, split_seq_scores
    
    
desired_seq_len = 4
print("Splitting token sequences.", file=sys.stderr)
token_seq_factory = TokenSequenceFactory(desired_seq_len)
word_seqs, score_seqs = token_seq_factory(cv_results)
print("Split data into {} token sequences with a maximum sequence length of {}.".format(len(word_seqs),
                                                                                        desired_seq_len),
      file=sys.stderr)

Splitting token sequences.


Iter:0
     CROSS_VALIDATION_ITER                           DYAD  ROUND  \
178                      1  20170328-1224-patrik-testRavi      1   
179                      1  20170328-1224-patrik-testRavi      1   
180                      1  20170328-1224-patrik-testRavi      1   
181                      1  20170328-1224-patrik-testRavi      1   
182                      1  20170328-1224-patrik-testRavi      1   
183                      1  20170328-1224-patrik-testRavi      1   
184                      1  20170328-1224-patrik-testRavi      1   
185                      1  20170328-1224-patrik-testRavi      1   

     ROUND_START_TIME  GAME_SCORE  UTT_START_TIME  UTT_END_TIME  \
178            60.475           0         60.8083       63.0015   
179            60.475           0         60.8083       63.0015   
180            60.475           0         60.8083       63.0015   
181            60.475           0         60.8083       63.0015   
182            60.475           0         60.

NameError: name 'split_seq_words' is not defined

In [None]:
from typing import Sequence

def pad_sequence(word_score_seq: Tuple[np.array, np.array], min_length: int) -> Tuple[np.array, np.array]:
	word_seq, score_seq = word_score_seq
	word_count = len(word_seq)
	assert word_count == len(score_seq)
	length_diff = min_length - word_count
	if length_diff > 0:
		# NOTE: creating an intermediate tuple is necessary
		padding_words = np.full(length_diff, "__PADDING__")
		padded_word_seq = np.concatenate((padding_words, word_seq), axis=0)
		assert len(padded_word_seq) == min_length
		padding_scores = np.full(length_diff, 0.0)
		padded_score_seq = np.concatenate((padding_scores, score_seq), axis=0)
		assert len(padded_score_seq) == min_length
		result = padded_word_seq, padded_score_seq
	else:
		result = word_seq, score_seq

	return result

def pad_sequences(word_score_seqs : Sequence[Tuple[np.array, np.array]], min_length: int):
    for word_score_seq in word_score_seqs:
        print(pad_sequence(word_score_seq, min_length))

            
pad_sequences(zip(word_seqs, score_seqs), desired_seq_len)

In [None]:
for seq in score_seqs:
    print("scoreseq")
    for s in seq:
        print("s")
        for something in s:
            print(something)


for seq in word_seqs:
    print("seq")
    for t in seq:
        print("t")
        for something in t:
            print(something)

In [None]:
import itertools

from sklearn.preprocessing import LabelEncoder

all_words = tuple(itertools.chain(("__PADDING__", ), cv_results["WORD"].values))
print("Converting {} vocabulary entries to integer labels.".format(len(all_words)), file=sys.stderr)
# integer encode <https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/>
label_encoder = LabelEncoder()
label_encoder.fit(all_words)
for word_seq in word_seqs:
    integer_label_seq = label_encoder.transform(word_seq)
    print(integer_label_seq)
#integer_label_seqs = tuple(label_encoder.transform(word_seq) for word_seq in word_seqs)
#print(integer_label_seqs)