In [None]:
# Enable IPython's autoreload extension in mode 2: all imported modules are reloaded
# before each cell executes, so edits to local source files take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
# If you need fully deterministic results between runs, set the PYTHONHASHSEED environment value prior to launching jupyter.
# See comment in sherlock.features.paragraph_vectors.infer_paragraph_embeddings_features for more info.
#
# Show the current value. `%env PYTHONHASHSEED` raises a UsageError when the variable is unset
# (the default situation), which stops "Run All"; reading it through os.environ with a fallback does not.
import os

python_hash_seed = os.environ.get('PYTHONHASHSEED', '<not set>')
python_hash_seed

# Extract features, retrain Sherlock and generate predictions.

The script below first downloads the data (roughly 700K samples), then extracts features from the raw data values. <br>

In [3]:
'''
pyfunctional==1.4.3
-e .
gdown==4.3.0
nltk==3.4.5
gensim==3.8.0
'''

#import sys
#!{sys.executable} -m pip install --user numpy

from datetime import datetime
import os
import sys
import time

import numpy as np
import pandas as pd

from sherlock import helpers
from sherlock.functional import extract_features_to_csv
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import (
    extract_features,
    convert_string_lists_to_lists,
    prepare_feature_extraction,
    load_parquet_values,
)
from sherlock.features.word_embeddings import initialise_word_embeddings

In [None]:
# Log the wall-clock time at which this run started.
print(f'Started at {datetime.now()}.')

## Download data
This will download the raw values and preprocessed files, the corresponding labels as well as a few other supporting files:
- `download_data()` will download 3.6GB of data for preprocessing and model training into the `data/` directory.
- `prepare_feature_extraction()` will download +/- 800 MB of data into the `features/` directory.

In [None]:
# Download the data set; the notes above put this at about 3.6GB, saved under `data/`.
helpers.download_data()

In [None]:
# Download the supporting files for feature extraction; the notes above put this at roughly 800 MB under `features/`.
prepare_feature_extraction()

In [None]:
# Stop the notebook early when the trained paragraph-vector weights file is not on disk.
par_vec_weights_path = '../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy'
par_vec_available = os.path.exists(par_vec_weights_path)

if not par_vec_available:
    # SystemExit carries the instructions for producing the missing file.
    raise SystemExit(
        """
        Trained paragraph vectors do not exist,
        please run the '03-retrain-paragraph-vector-features' notebook before continuing
        """
    )

## Read in raw data
You can skip this step if you want to use a preprocessed data file.

## Report memory usage (can be slow)

In [None]:
# Switch for the (slow) memory-usage reporting named in the heading above.
# NOTE(review): no cell visible in this notebook reads `report_memory` — confirm whether it is still needed.
report_memory = False

## Extract features

The input data is assumed to be a parquet file holding a dataframe whose column values are stored as stringified lists, e.g. "['hello', 'goodbye', 'hi']".

`TODO`: ideally we pickle the dataframe with value lists to avoid parsing these strings into lists again.

In [None]:
# Timestamp identifying this run; used for any freshly generated feature files.
timestr = time.strftime("%Y%m%d-%H%M%S")

# Directory that holds the extracted feature files.
processed_dir = '../data/data/processed'

# Timestamp of the earlier extraction run whose feature CSVs can be reused.
preprocessed_timestr = '20220323-163452'

# True  -> the test/train/validation names point at the CSVs from `preprocessed_timestr`.
# False -> the names are stamped with this run's `timestr`, i.e. fresh files.
# WARNING: the "EXTRACT FEATURES TO CSV" cells below pass these same names to
# extract_features_to_csv(), so running them while this is True presumably overwrites
# the reused files. Skip those cells, or set this to False, as appropriate.
use_preprocessed = True

features_timestr = preprocessed_timestr if use_preprocessed else timestr

# Features will be output to / read from the following files
X_test_filename_csv = f'{processed_dir}/test_{features_timestr}.csv'
X_train_filename_csv = f'{processed_dir}/train_{features_timestr}.csv'
X_validation_filename_csv = f'{processed_dir}/validation_{features_timestr}.csv'

# The ORM file name is always stamped with the current run.
X_orm_filename_csv = f'{processed_dir}/ormtrain_{timestr}.csv'

In [None]:
# Display the timestamp generated for this run.
timestr

### PREPARATION

In [None]:
# ensure embedding initialisation is outside of timing for extract_features
# NOTE(review): prepare_feature_extraction() is also called in an earlier cell; presumably it is
# repeated so this cell can be run on its own — confirm that calling it twice is cheap.
prepare_feature_extraction()
initialise_word_embeddings()
# NOTE(review): 400 presumably corresponds to the `par_vec_trained_400` file checked for earlier — confirm.
initialise_pretrained_model(400)
initialise_nltk()

In [None]:
# Load the line_profiler IPython extension (requires the line_profiler package to be installed).
# NOTE(review): no profiling magic is used in the cells visible in this notebook — confirm this is still needed.
%load_ext line_profiler

In [None]:
# The default process creation mode changed to 'spawn' in Python 3.8 (on macOS), which causes "name not defined" type errors.
# Force 'fork' mode for now (this only needs to be called once).
# https://bugs.python.org/issue39931
# NOTE(review): the call below is currently disabled. `mp` is presumably `multiprocessing`; it is not
# imported in this notebook's import cell, so re-enabling the line also needs `import multiprocessing as mp`.
#mp.set_start_method('fork', force=True)

## EXTRACT FEATURES TO CSV

### TEST SET

In [None]:
# Load the raw test-set values, hand them to the CSV feature extraction, then drop the reference.
raw_test_values_path = "../data/data/raw/test_values.parquet"
values = load_parquet_values(raw_test_values_path)

extract_features_to_csv(X_test_filename_csv, values)

# Rebind to None so the loaded values are no longer referenced from the notebook namespace.
values = None

In [None]:
# Disabled duplicate of the extraction call in the previous cell (by this point `values` has been reset to None).
#extract_features_to_csv(X_test_filename_csv, values)

# Display the path of the test-set features CSV.
X_test_filename_csv

In [None]:
# Log the wall-clock time once the test-set cells above have run.
print(f'Finished at {datetime.now()}')

### TRAIN SET

In [None]:
# Load the raw training-set values, hand them to the CSV feature extraction, then drop the reference.
raw_train_values_path = "../data/data/raw/train_values.parquet"
values = load_parquet_values(raw_train_values_path)

extract_features_to_csv(X_train_filename_csv, values)

# Rebind to None so the loaded values are no longer referenced from the notebook namespace.
values = None

In [None]:
# Log the wall-clock time once the training-set extraction cell above has run.
print(f'Finished at {datetime.now()}')

### VALIDATION SET

In [None]:
# Load the raw validation-set values, hand them to the CSV feature extraction, then drop the reference.
raw_validation_values_path = "../data/data/raw/val_values.parquet"
values = load_parquet_values(raw_validation_values_path)

extract_features_to_csv(X_validation_filename_csv, values)

# Rebind to None so the loaded values are no longer referenced from the notebook namespace.
values = None

In [None]:
# Log the wall-clock time once the validation-set extraction cell above has run.
print(f'Finished at {datetime.now()}')

### Read Locally Processed Features

In [None]:
# Read the test-set feature CSV with every column parsed as float32, and report how long it took.
start = datetime.now()
X_test = pd.read_csv(X_test_filename_csv, dtype=np.float32)
load_duration = datetime.now() - start

print(f'Load Features (test) process took {load_duration} seconds.')

In [None]:
# Preview the first rows of the loaded test-set features.
X_test.head()

In [None]:
# Read the training-set feature CSV with every column parsed as float32, and report how long it took.
start = datetime.now()
X_train = pd.read_csv(X_train_filename_csv, dtype=np.float32)
load_duration = datetime.now() - start

print(f'Load Features (train) process took {load_duration} seconds.')

In [None]:
# Preview the first rows of the loaded training-set features.
X_train.head()

In [None]:
# Read the validation-set feature CSV with every column parsed as float32, and report how long it took.
start = datetime.now()
X_validation = pd.read_csv(X_validation_filename_csv, dtype=np.float32)
load_duration = datetime.now() - start

print(f'Load Features (validation) process took {load_duration} seconds.')

In [None]:
# Preview the first rows of the loaded validation-set features.
X_validation.head()

## Impute NaN values with feature means

In [None]:
# Compute the per-column means of the training features and lay them out as a
# single-row frame (one column per feature), reporting how long that took.
start = datetime.now()

train_means_series = X_train.mean()
train_columns_means = train_means_series.to_frame().transpose()

print(f'Transpose process took {datetime.now() - start} seconds.')

In [None]:
start = datetime.now()

# Every split is imputed with the *training* column means.
# `train_columns_means` is a single-row frame; its row 0 is the per-column mean Series.
column_means = train_columns_means.iloc[0]

X_train.fillna(column_means, inplace=True)
X_validation.fillna(column_means, inplace=True)
X_test.fillna(column_means, inplace=True)

# No cell in this notebook creates X_orm, so an unconditional X_orm.fillna(...) raises
# NameError on a fresh kernel. Impute it only when it has been loaded separately.
if 'X_orm' in globals():
    X_orm.fillna(column_means, inplace=True)
else:
    print('X_orm is not defined; skipping NaN imputation for the ORM features.')

# `train_columns_means` is deliberately left in place (it is a single row) so that
# this cell can be re-run without first recomputing the means.

print(f'FillNA process took {datetime.now() - start} seconds.')

In [None]:
# Persist the three imputed feature frames as snappy-compressed parquet files (pyarrow engine)
# and report how long the export took.
start = datetime.now()

parquet_targets = {
    '../data/data/processed/train.parquet': X_train,
    '../data/data/processed/validation.parquet': X_validation,
    '../data/data/processed/test.parquet': X_test,
}

# dict order is insertion order, so the files are written train -> validation -> test.
for parquet_path, split_features in parquet_targets.items():
    split_features.to_parquet(parquet_path, engine='pyarrow', compression='snappy')

print(f'Save parquet process took {datetime.now() - start} seconds.')

In [None]:
# Log the wall-clock time at which the cells above completed.
print(f'Completed at {datetime.now()}.')

In [None]:
# No cell in this notebook creates X_orm, so an unconditional X_orm.to_parquet(...) raises
# NameError on a fresh kernel. Export the ORM features only when they have been loaded separately.
orm_features = globals().get('X_orm')

if orm_features is not None:
    orm_features.to_parquet('../data/data/processed/orm.parquet', engine='pyarrow', compression='snappy')
else:
    print('X_orm is not defined; skipping the orm.parquet export.')