In [40]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
# If you need fully deterministic results between runs, set the following environment value prior to launching jupyter.
# See comment in sherlock.features.paragraph_vectors.infer_paragraph_embeddings_features for more info.
%env PYTHONHASHSEED

UsageError: Environment does not have key: PYTHONHASHSEED


# Extract features

The script below first downloads the data (roughly 700K samples), then extract features from the raw data values. <br>

In [42]:
'''
pyfunctional==1.4.3
-e .
gdown==4.3.0
nltk==3.4.5
gensim==3.8.0
'''

#import sys
#!{sys.executable} -m pip install --user numpy

from datetime import datetime
import os
import sys
import time

import numpy as np
import pandas as pd

from sherlock import helpers
from sherlock.functional import extract_features_to_csv
from sherlock.features.paragraph_vectors import initialise_pretrained_model, initialise_nltk
from sherlock.features.preprocessing import (
    extract_features,
    convert_string_lists_to_lists,
    prepare_feature_extraction,
    load_parquet_values,
)
from sherlock.features.word_embeddings import initialise_word_embeddings

In [43]:
print(f'Started at {datetime.now()}.')

Started at 2023-03-20 17:29:08.849349.


## Download data
This will download the raw values and preprocessed files, the corresponding labels as well as a few other supporting files:
- `download_data()` will download 3.6GB of data for preprocessing and model training into the `data/` directory.
- `prepare_feature_extraction()` will download +/- 800 MB of data into the `features/` directory.

In [44]:
# helpers.download_data()
prepare_feature_extraction()

Preparing feature extraction by downloading 4 files:
        
 ../sherlock/features/glove.6B.50d.txt, 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy,
        
 ../sherlock/features/par_vec_trained_400.pkl.trainables.syn1neg.npy, and 
 ../sherlock/features/par_vec_trained_400.pkl.wv.vectors.npy.
        
All files for extracting word and paragraph embeddings are present.


In [45]:
if not os.path.exists('../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy'):
    raise SystemExit(
        """
        Trained paragraph vectors do not exist,
        please run the '03-retrain-paragraph-vector-features' notebook before continuing
        """
    )

## Read in raw data
You can skip this step if you want to use a preprocessed data file.

## Report memory usage (can be slow)

In [46]:
report_memory = False

## Extract features

The input data is assumed to be a dataframe of column values stored as stringed lists "['hello', 'goodbye', 'hi']", stored in a parquet file.

`TODO`: ideally we pickle the dataframe with value lists to avoid parsing these strings into lists again.

In [47]:
# SET FILE PATHS

X_filename_csv = f'../data/data/processed/processedX.csv'

### PREPARATION

In [48]:
# ensure embedding initialisation is outside of timing for extract_features
prepare_feature_extraction()
initialise_word_embeddings()
initialise_pretrained_model(400)
initialise_nltk()

Preparing feature extraction by downloading 4 files:
        
 ../sherlock/features/glove.6B.50d.txt, 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy,
        
 ../sherlock/features/par_vec_trained_400.pkl.trainables.syn1neg.npy, and 
 ../sherlock/features/par_vec_trained_400.pkl.wv.vectors.npy.
        
All files for extracting word and paragraph embeddings are present.
Initialising word embeddings
Initialise Word Embeddings process took 0:00:04.878021 seconds.
Initialise Doc2Vec Model, 400 dim, process took 0:00:00.005301 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)
Initialised NLTK, process took 0:00:00.320742 seconds.


[nltk_data] Downloading package punkt to /home/sunnykim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sunnykim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


### Read input file

In [51]:
# csv -> pq -> csv -> pq 

df = pd.read_csv("../data/data/raw/inputX.csv", index_col = [0])
df.to_parquet("../data/data/raw/inputX_pq.parquet")

## EXTRACT FEATURES TO CSV

In [24]:
values = load_parquet_values("../data/data/raw/inputX_pq.parquet")

extract_features_to_csv(X_filename_csv, values)

values = None

Starting ../data/data/processed/processedX.csv at 2023-03-20 16:44:17.120967. Rows=4, using 32 CPU cores
Exporting 1588 column features
Finished. Processed 4 rows in 0:00:00.961981, key_count=1


### Read Locally Processed Features

In [25]:
start = datetime.now()
X_test = pd.read_csv(X_filename_csv, dtype=np.float32)

print(f'Load Features (test) process took {datetime.now() - start} seconds.')

Load Features (test) process took 0:00:00.128780 seconds.


In [26]:
X_test.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.00026,0.000504,0.001107,4.3e-05,0.000379,0.000412,0.000205,-0.000171,0.000661,-0.000717
1,1.0,0.0,0.368421,0.33795,0.0,2.0,0.0,7.0,0.742677,1.326868,...,0.000176,3e-05,-0.000322,0.000328,-0.000179,0.000946,0.000128,0.000599,0.000912,-0.000503
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,8.5e-05,-0.001041,0.000391,6.9e-05,0.000707,0.000756,-0.000561,-0.000595,-0.001213,-0.000172
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,...,-0.000839,0.000855,0.000757,-5.9e-05,-0.000402,3.9e-05,-8.1e-05,-0.000691,-0.000319,-0.0009


## Impute NaN values with feature means

In [30]:
start = datetime.now()

train_columns_means = pd.DataFrame(X_test.mean()).transpose()

print(f'Transpose process took {datetime.now() - start} seconds.')

Transpose process took 0:00:00.002447 seconds.


In [31]:
X_test.fillna(train_columns_means.iloc[0], inplace=True)

In [32]:
X_test.to_parquet('../data/data/processed/inputX.parquet', engine='pyarrow', compression='snappy')

In [39]:
# converting y data to parquet
df = None
df = pd.read_csv("../data/data/raw/inputY.csv", index_col=[0])
df.to_parquet("../data/data/raw/inputY_pq.parquet")
