In [1]:
import random
from pathlib import Path
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from sklearn.model_selection import train_test_split

# Notebook-wide configuration.
# np.random.seed() returns None, so the seed value is stored first and then
# applied; otherwise SEED could not be passed on to anything that wants a seed.
SEED = 0
np.random.seed(SEED)  # global NumPy RNG
random.seed(SEED)     # stdlib RNG (random.sample is used for downsampling)
DATA = Path('data')
TARGETS = ['participants', 'interventions', 'outcomes']
SUBSET = 'Train' # 'Test'
N_TRAIN_DOCS = 4500
SAVE_LOC = Path('data/split')

# Create the output directory up front so the save cells cannot fail on a missing folder.
SAVE_LOC.mkdir(exist_ok=True, parents=True)

%load_ext jupyternotify



Error processing line 7 of c:\users\tommy\anaconda3\lib\site-packages\pywin32.pth:

  Traceback (most recent call last):
    File "c:\users\tommy\anaconda3\lib\site.py", line 168, in addpackage
      exec(line)
    File "<string>", line 1, in <module>
  ModuleNotFoundError: No module named 'pywin32_bootstrap'

Remainder of file ignored


<IPython.core.display.Javascript object>

# Load feature set and labels

In [2]:
FEATURES = ['base', 'pubmedFT', 'pos'] # 'rawFT', 'clust' // base not needed?

# Build the paths with pathlib instead of hard-coded backslashes so the cell
# also runs on non-Windows machines. They are kept as strings so that they can
# be joined for the log message below.
feat_paths = [str(DATA / 'features' / f'{feature}.pkl') for feature in FEATURES]
labels_path = DATA / 'raw' / 'labels.pkl'

print(f"Loading feature set from {', '.join(feat_paths)}")
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The feature frames are concatenated column-wise.
feats_in = pd.concat([pd.read_pickle(path) for path in feat_paths], axis=1)

print(f'Loading labels from {labels_path}')
labels_in = pd.read_pickle(labels_path)

# 'Word' (presumably the raw token text - confirm) is dropped before the
# features are joined column-wise with the label columns.
data = pd.concat([feats_in.drop('Word', axis=1), labels_in], axis=1)

# Overlapping column names between feature sets/labels would make later
# column selection ambiguous.
assert not data.columns.duplicated().any(), 'duplicate column names after concat'

print('Data load complete.')

Loading feature set from data\features\base.pkl, data\features\pubmedFT.pkl, data\features\pos.pkl
Loading labels from data\raw\labels.pkl
Data load complete.


# Train/Test split

Withhold part of the data for evaluation. TODO: move this step into a standalone script.

In [3]:
%%notify

# Split settings.
hold_size = 10 # 493
k_folds = 5 # only simulated for now
targets = TARGETS

# Every column that is not a target is treated as a feature.
feats = list(filter(lambda col: col not in TARGETS, data.columns))

# The split is made on whole documents (the 'doc' index level), not on rows.
doc_ids = list(data.index.unique(level='doc'))

train_val_idx, hold_idx = train_test_split(
    doc_ids,
    train_size=N_TRAIN_DOCS,
    test_size=hold_size,
)

word = data
# Select all rows belonging to the chosen documents.
train_val = data.loc[pd.IndexSlice[train_val_idx, :], :]
test_hold = data.loc[pd.IndexSlice[hold_idx, :], :] # slow; comment when testing

<IPython.core.display.Javascript object>

One-hot encode ("hotcode") any categorical (non-numeric) columns. Note that this may not be the preferred format for all models (for example, Keras can generate denser embeddings for many-level categorical variables), so skip this step if you do not want a one-hot encoded dataset.

In [4]:
%%notify

def hotcode(df):
    """One-hot encode the non-numeric columns of ``df``.

    Numeric (including boolean) columns are passed through unchanged. A column
    named 'Word' is never encoded and is not part of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a mix of numeric and categorical columns.

    Returns
    -------
    pandas.DataFrame
        The dummy columns followed by the original numeric columns.

    Raises
    ------
    ValueError
        If ``df`` has no categorical columns to encode. Callers use this to
        detect that the data is already fully numeric.
    """
    num_cols = [col for col in df.columns if is_numeric_dtype(df[col])]
    numeric = set(num_cols)
    # An ordered list (not a set): pandas rejects set indexers in recent
    # versions, and a list keeps the output column order deterministic.
    cat_cols = [col for col in df.columns if col not in numeric and col != 'Word']

    print(cat_cols)
    if not cat_cols:
        # Raise explicitly instead of relying on an error from inside pandas.
        raise ValueError('No categorical columns to encode')

    dummies = pd.get_dummies(df[cat_cols])

    print('hotcode complete.')

    return pd.concat([dummies, df[num_cols]], axis=1)

print('hotcoding categorical columns...')
try:
    hotcoded = hotcode(train_val)
except ValueError:
    # hotcode signals "nothing to encode" with a ValueError; in that case
    # train_val is left exactly as it was.
    print('No categorical values found in data')
else:
    train_val = hotcoded

hotcoding categorical columns...
set()
No categorical values found in data


<IPython.core.display.Javascript object>

Sample a subset of the indices to downsample the data. This is useful if you need to work on a computer with less memory or want to train faster on a smaller dataset.

In [5]:
n_docs = 1000

# Sample only from documents that are actually in train_val. Sampling from all
# of `data` could pick held-out documents, which do not exist in train_val.
train_doc_ids = train_val.index.unique('doc').tolist()

# A local RNG built from SEED makes re-running this cell return the same sample
# (if SEED is None, Python falls back to OS entropy, i.e. an unseeded sample).
ds = random.Random(SEED).sample(train_doc_ids, n_docs)
train_ds = train_val.loc[(ds, slice(None)), :]

Make the required directories and save the data.

In [12]:
# Make sure the output directory exists so this cell does not depend on an
# earlier cell having created it (idempotent: a no-op if it already exists).
SAVE_LOC.mkdir(exist_ok=True, parents=True)

# Full training/validation set and the withheld evaluation set.
train_val.to_parquet(SAVE_LOC / 'train.parquet')
test_hold.to_parquet(SAVE_LOC / 'test.parquet')

# Downsampled training subset; the file name records the number of sampled docs.
train_ds.to_parquet(SAVE_LOC / f'train_{n_docs}.parquet')

# Data saving for Clusterings (temporary)

In [7]:
# Read one parquet table per k (presumably the cluster assignments for k
# clusters - confirm). The path is built with pathlib rather than hard-coded
# backslashes so the cell is not tied to Windows.
x = [pd.read_parquet(DATA / 'features' / f'PubMed_{k}_10x300.parquet') for k in [4,8,12]]

In [9]:
# Join the per-k tables column-wise into a single frame.
# NOTE: this rebinds `x` from a list of frames to one frame, so the cell cannot
# be re-run without first re-running the cell that loads the list.
x = pd.concat(x, axis=1)

In [11]:
# Add lagged copies of each cluster column, shifted within each document.
lags = [-2,-1,1,2]
for k in [4,8,12]:
    base = x[f'knn_{k}']
    by_doc = base.groupby('doc')

    for lag in lags:
        shifted = by_doc.shift(lag)

        # A positive lag leaves gaps at the start of each document (back-fill),
        # a negative lag leaves them at the end (forward-fill). The fill is done
        # per document so values never leak across document boundaries.
        regrouped = shifted.groupby('doc')
        filled = regrouped.bfill() if lag > 0 else regrouped.ffill()

        # Documents too short to have any neighbour at this lag are still all
        # NaN here; fall back to the word's own cluster so the later integer
        # cast cannot fail.
        x[f'knn_{k}_lag_{lag}'] = filled.fillna(base)

In [16]:
# Cast to int first (dropping any float formatting), then to str. With string
# values the columns are presumably treated as categorical by the get_dummies
# call in the following cell - confirm.
x = x.astype(int).astype(str)
# Display the frame to inspect the result.
x

Unnamed: 0_level_0,Unnamed: 1_level_0,knn_4,knn_8,knn_12,knn_4_lag_-2,knn_4_lag_-1,knn_4_lag_1,knn_4_lag_2,knn_8_lag_-2,knn_8_lag_-1,knn_8_lag_1,knn_8_lag_2,knn_12_lag_-2,knn_12_lag_-1,knn_12_lag_1,knn_12_lag_2
doc,idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10037531,0,0,1,1,2,1,0,0,5,0,1,1,5,0,1,1
10037531,1,1,0,0,1,2,0,0,0,5,1,1,0,5,1,1
10037531,2,2,5,5,2,1,1,0,5,0,0,1,5,0,0,1
10037531,3,1,0,0,2,2,2,1,5,5,5,0,5,5,5,0
10037531,4,2,5,5,1,2,1,2,0,5,0,5,0,5,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9989713,289,2,5,5,2,1,1,2,5,4,4,5,5,8,8,5
9989713,290,1,4,8,2,2,2,1,5,5,5,4,5,5,5,8
9989713,291,2,5,5,1,2,1,2,4,5,4,5,4,5,8,5
9989713,292,2,5,5,1,1,2,1,4,4,5,4,4,4,5,8


In [17]:
# One-hot encode the (string-valued) cluster and lag columns.
# NOTE: this rebinds `x`, so re-running the cell operates on the already
# encoded frame.
x = pd.get_dummies(x)

In [19]:
# Persist the encoded cluster features next to the other feature files.
x.to_parquet('data/features/km_4-8-12_10x300.parquet')

# Create indices for K-fold cross-validation

If cluster columns are used, these indices are probably best saved to a file and loaded by the model scripts. The kNN would then be loaded and used to predict clusters for the features when running models; otherwise it will already have seen the validation-set data. For now, I'll train the kNN on a smaller subsample of the text to see whether the method has potential.

In [None]:
For now, I'll 