In [5]:
from datetime import datetime
import os
import sys
import time

import numpy as np
import pandas as pd

import zipfile
import gdown

import statistics as statistics
from collections import OrderedDict

import string
import csv
import io

In [None]:

from sherlock.features.preprocessing import (
    extract_features,
    convert_string_lists_to_lists,
    prepare_feature_extraction,
    load_parquet_values,
)


from sherlock.features.word_embeddings import initialise_word_embeddings
from sherlock.functional import extract_features_to_csv
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk

In [None]:
## preprocessing.py


from sherlock.features.bag_of_characters import extract_bag_of_characters_features
from sherlock.features.bag_of_words import extract_bag_of_words_features
from sherlock.features.word_embeddings import extract_word_embeddings_features
from sherlock.features.paragraph_vectors import infer_paragraph_embeddings_features



In [6]:
## global_state.py

# Module-level flag: True until set_first() is called, and again after reset_first().
_is_first: bool = True


def is_first() -> bool:
    """Return the current value of the module-level "first" flag."""
    return _is_first


def set_first() -> None:
    """Clear the flag, so that is_first() returns False from now on."""
    global _is_first
    _is_first = False


def reset_first() -> None:
    """Restore the flag to its initial state, so that is_first() returns True again."""
    global _is_first
    _is_first = True

In [None]:
## stats_helper.py

def compute_stats(values):
    """Compute summary statistics of a numeric sequence in a single call.

    The variance is the population variance (mean of squared deviations); skewness and kurtosis are derived
    from the third and fourth central moments, with 3 subtracted from the kurtosis (excess kurtosis).

    When the variance is exactly zero, skewness is reported as 0 and kurtosis as -3 instead of dividing by zero.

    Returns:
        Tuple of (mean, variance, skewness, kurtosis, min, max, sum).
    """
    lowest, highest, total = min(values), max(values), sum(values)

    average = np.mean(values)

    # Deviation of every value from the mean
    deviations = values - average

    variance = np.mean(deviations * deviations)

    if variance == 0:
        # All values are identical: the higher moments are undefined, so fixed defaults are returned
        return average, variance, 0, -3, lowest, highest, total

    skewness = np.mean(deviations ** 3) / variance ** 1.5
    kurtosis = np.mean(deviations ** 4) / variance ** 2 - 3

    return average, variance, skewness, kurtosis, lowest, highest, total


def mode(axis, pre_sorted: bool = False):
    """Return the most frequent value of a non-empty sequence.

    The values are sorted first (unless ``pre_sorted`` is True) and then scanned once, measuring the length of
    each run of equal values. A later run only replaces the current winner when it is strictly longer, so on a
    tie the value whose run comes first in the ordered sequence is returned.
    """
    ordered = axis if pre_sorted else sorted(axis)

    best_value = run_value = ordered[0]
    best_length = 1
    run_length = 0

    for item in ordered:
        if item == run_value:
            run_length += 1
            continue

        # A run has just ended: keep it if it beats the best run seen so far
        if run_length > best_length:
            best_length, best_value = run_length, run_value

        run_value, run_length = item, 1

    # The final run is never closed inside the loop, so it is compared here
    return run_value if run_length > best_length else best_value


In [None]:
## helpers.py


# https://stackoverflow.com/questions/10593387/when-do-i-need-to-escape-characters-within-a-regex-character-set-within

# Also include '[' char despite above to prevent the following warning:
# .../sherlock-project/sherlock/features/bag_of_characters.py:38: FutureWarning: Possible nested set at position 1
#   if search(c, char_set):
#
# NOTE: Make sure that each item that is escaped has the escaping here. The file may be regenerated using
#       generate_chars_col() below.
#          sherlock/features/feature_column_identifiers/char_col.tsv
def escape_for_regex(c):
    """Return ``c`` prefixed with a backslash if it is one of '[', ']', '\\', '^' or '-'; otherwise unchanged."""
    needs_escape = c in ('[', ']', '\\', '^', '-')

    return '\\' + c if needs_escape else c


# Notes:
# 1. form feed ('\f') is whitespace but was not classed as such in the original paper, hence it is not present in
#    the exclusion tuple below and therefore remains in CHARACTERS_TO_CHECK.
# 2. '\' and '^' are appended to the list to maintain original column sequence
# Printable characters in string.printable order, minus four whitespace characters and minus '\\' and '^',
# which are re-added at the very end of the list.
CHARACTERS_TO_CHECK = [
    ch for ch in string.printable if ch not in ('\n', '\v', '\r', '\t', '\\', '^')
]
CHARACTERS_TO_CHECK.extend(['\\', '^'])


# Usage:
# from sherlock.features.helpers import generate_chars_col
# generate_chars_col()
def generate_chars_col():
    """Write the character feature column identifiers to char_col.tsv.

    One line is written per (character, aggregate) pair, in CHARACTERS_TO_CHECK order, formatted as
    ``<running index>\\t<column header>``. The target file is overwritten.

    NOTE(review): headers are written here as 'n_<char>-agg-<op>', whereas extract_bag_of_characters_features
    builds its keys as 'n_[<char>]-agg-<op>', and escape_for_regex is not applied to the character. Confirm
    which form the consumers of char_col.tsv expect.
    """
    operations = ('any', 'all', 'mean', 'var', 'min', 'max', 'median', 'sum', 'kurtosis', 'skewness')

    with open("../sherlock/features/feature_column_identifiers/char_col.tsv", "w") as char_col:
        col_headers = (f'n_{c}-agg-{operation}' for c in CHARACTERS_TO_CHECK for operation in operations)

        for idx, col_header in enumerate(col_headers):
            char_col.write(f'{idx}\t{col_header}\n')


# Alternative for ast.literal_eval, but keeps the elements as str. This version is about 5x faster than literal_eval
# in this use case
# parse arrays in the form "['a value', None, 0.89, \"other string\"]"
def literal_eval_as_str(value, none_value=None):
    """Parse the string form of a flat list into a list of strings.

    Expects input such as ``"['a value', None, 0.89, \\"other string\\"]"``. Surrounding square brackets are
    removed when present, then the text is split on ``', '``:

    * quoted items have their quotes removed;
    * a quoted item that itself contains ``', '`` is re-assembled from the pieces it was split into;
    * the bare token ``None`` becomes ``none_value``;
    * every other token (numbers included) is kept verbatim as a str.

    Backslash escapes are not interpreted. A quoted item that is never closed is dropped.

    Args:
        value: text to parse; ``None`` or an empty string yields an empty list.
        none_value: replacement for bare ``None`` tokens.

    Returns:
        list of parsed items.
    """
    if value and value[0] == '[' and value[-1] == ']':
        value = value[1:-1]

    if not value:
        return []

    strings = []

    quote = None   # quote character of the item currently being re-assembled, or None when outside one
    pending = ''   # text collected so far for that item

    for s in value.split(', '):
        if quote is not None:
            # Inside a quoted item that contained ', ': every piece belongs to it until one ends with the
            # opening quote character. This check comes first so that empty pieces and pieces starting with
            # the other quote character stay part of the item instead of becoming separate elements.
            if s and s[-1] == quote:
                strings.append(pending + s[:-1])
                quote = None
                pending = ''
            else:
                pending += s + ', '
        elif not s:
            strings.append('')
        elif s[0] in ("'", '"'):
            if len(s) > 1 and s[0] == s[-1]:
                # Complete quoted item within a single piece
                strings.append(s[1:-1])
            else:
                # Opening piece of an item that was split. Start from a clean buffer so that nothing left
                # over from a previous item can leak into this one.
                quote = s[0]
                pending = s[1:] + ', '
        elif s == 'None':
            strings.append(none_value)
        else:
            strings.append(s)

    return strings


def keys_to_csv(keys):
    """
    Render a list of keys as a single CSV row suitable for use as an Excel compatible header.

    The row is produced by csv.writer with QUOTE_NONNUMERIC: every non-numeric item is wrapped in double quotes
    (so a comma inside a value is not taken for a separator) and embedded double quotes are doubled. The
    returned text includes the writer's line terminator.
    """
    buffer = io.StringIO()

    try:
        csv.writer(buffer, quoting=csv.QUOTE_NONNUMERIC).writerow(keys)

        return buffer.getvalue()
    finally:
        buffer.close()


In [None]:
## bag_of_characters.py


# Input: a single column in the form of Python list
# Output: ordered dictionary holding bag of character features
def extract_bag_of_characters_features(col_values: list, features: OrderedDict):
    """Add per-character count statistics for one column to ``features``.

    For every character in CHARACTERS_TO_CHECK the number of occurrences in each value of ``col_values`` is
    counted and aggregated into ten figures: any, all, mean, var, min, max, median, sum, kurtosis, skewness.
    Characters that do not occur at all get fixed defaults (all 0, except kurtosis which is -3).

    While is_first() is True the ten figures are stored under individual 'n_[<char>]-agg-<name>' keys;
    otherwise they are stored as one comma separated string under 'n_[<char>]-pre-rendered'.

    Args:
        col_values: the values of a single column, as a list of strings.
        features: ordered dictionary that receives the entries (modified in place).
    """
    agg_suffixes = ('-agg-any', '-agg-all', '-agg-mean', '-agg-var', '-agg-min', '-agg-max',
                    '-agg-median', '-agg-sum', '-agg-kurtosis', '-agg-skewness')
    absent_row = (0, 0, 0, 0, 0, 0, 0, 0, -3, 0)

    # Unique characters across the whole column: lets the per-value counting be skipped for characters that
    # never occur
    seen_chars = set(''.join(col_values))

    for c in CHARACTERS_TO_CHECK:
        prefix = f'n_[{c}]'

        counts = [s.count(c) for s in col_values] if c in seen_chars else []

        if any(counts):
            in_every_value = 1 if all(counts) else 0
            mean_, variance_, skew_, kurtosis_, min_, max_, sum_ = compute_stats(counts)
            median_ = statistics.median(counts)

            # Same order as agg_suffixes
            row = (1, in_every_value, mean_, variance_, min_, max_, median_, sum_, kurtosis_, skew_)
        else:
            row = absent_row

        if is_first():
            # the first output needs fully expanded keys (to drive CSV header)
            for suffix, figure in zip(agg_suffixes, row):
                features[prefix + suffix] = figure
        else:
            # subsequent outputs only need the values, so a ready-made block of CSV is stored instead
            features[prefix + '-pre-rendered'] = ','.join(f'{figure}' for figure in row)


In [3]:
def download_data():
    """Download raw and preprocessed data files.

    The archive is fetched from Google Drive to '../data/data.zip' and extracted into '../data/data/'.
    Nothing is downloaded or extracted when '../data/data/' already exists.

    Raises:
        FileNotFoundError: if the archive is not on disk after the download attempt.
    """
    data_dir = "../data/data/"
    zip_filepath = "../data/data.zip"
    print(f"Downloading the raw data into {data_dir}.")

    if not os.path.exists(data_dir):
        # Make sure the folder that will hold the archive exists before anything is written into it
        os.makedirs(os.path.dirname(zip_filepath), exist_ok=True)

        print("Downloading data directory.")
        gdown.download(
            url="https://drive.google.com/uc?id=1-g0zbKFAXz7zKZc0Dnh74uDBpZCv4YqU",
            output=zip_filepath,
        )

        # Fail with an explicit message instead of an opaque error from zipfile when nothing was fetched
        if not os.path.exists(zip_filepath):
            raise FileNotFoundError(
                f"Download failed: {zip_filepath} was not created. Check network access and the Google Drive link."
            )

        print(f"Extracting {zip_filepath} into {data_dir}.")
        with zipfile.ZipFile(zip_filepath, "r") as zf:
            zf.extractall(data_dir)

    print("Data was downloaded.")
    
# Fetch and unpack the dataset; download_data() skips the download when '../data/data/' already exists.
download_data()
# Imported from sherlock.features.preprocessing; presumably sets up resources needed before features can be
# extracted — TODO confirm against the sherlock package.
prepare_feature_extraction()
