In [11]:
# NOTE(review): hard-coded absolute working directory. The relative paths used
# later in this notebook (e.g. 'data/01') are resolved against it, so adjust
# this line when running on another machine.
%cd /scratch/bruingjde/SNAM2021-code/

import os
import typing

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import sklearn.linear_model
import sklearn.model_selection
import sklearn.metrics
import sklearn.pipeline
import sklearn.preprocessing
from tqdm.auto import tqdm

import tlp

# Default feature sets used by predict(): each name maps to the file
# '<path>/features/<name>.pkl'.
features = [
  'aa_time_agnostic',
  'aa_time_aware',
  'na',
  'sp',
]

/scratch/bruingjde/SNAM2021-code


In [15]:
def predict(path, features=features, random_state=None):
  """Fit and score a logistic-regression pipeline for one dataset directory.

  Every `<path>/features/<feature>.pkl` is loaded with joblib and merged (via
  `dict.update`) into a single mapping that becomes the columns of the design
  matrix; the targets are read from `<path>/targets_sampled.npy`. The data is
  split with `sklearn.model_selection.train_test_split`, a
  StandardScaler + LogisticRegression pipeline is fitted on the training part
  and the ROC AUC is computed on the held-out part.

  Args:
    path: Dataset directory, e.g. 'data/01'.
    features: Names of the feature files (without '.pkl') to load.
    random_state: Forwarded to `train_test_split`. The default `None` keeps
      the previous behaviour (a different split on every call); pass an int
      to make the split, and therefore the reported AUC, reproducible.

  Returns:
    `(index, (pipe, auc), features)` where `index` is the name of the dataset
    directory, `pipe` the fitted pipeline and `auc` the ROC AUC on the test
    split. Returns `None` (after reporting the first missing file through
    `tqdm.write`) when a feature file or the targets file does not exist.
  """
  # Use the directory's own name as identifier. Unlike `path.split('/')[1]`
  # this does not raise on a path without '/', and it is correct for absolute,
  # './'-prefixed and trailing-slash paths; for 'data/01' it is still '01'.
  index = os.path.basename(os.path.normpath(path))

  feature_files = [
    os.path.join(path, 'features', feature) + '.pkl' for feature in features]
  target_file = os.path.join(path, 'targets_sampled.npy')

  # Validate all inputs up front so nothing is loaded when a file is missing.
  for filepath in [*feature_files, target_file]:
    if not os.path.isfile(filepath):
      tqdm.write(f'{filepath} is not present.')
      return None

  # Read in features
  # NOTE: joblib.load unpickles its input; only point this at trusted files.
  columns = dict()
  for filepath in feature_files:
    columns.update(joblib.load(filepath))
  X = pd.DataFrame(columns)

  # Read in targets
  y = np.load(target_file)

  # Fit and predict pipeline
  X_train, X_test, y_train, y_test = (
    sklearn.model_selection.train_test_split(
      X, y, random_state=random_state))
  pipe = sklearn.pipeline.make_pipeline(
    sklearn.preprocessing.StandardScaler(),
    sklearn.linear_model.LogisticRegression(max_iter=10000)) # type: ignore
  pipe.fit(X_train, y_train)
  # Column 1 of predict_proba is the probability of the positive class.
  auc = sklearn.metrics.roc_auc_score(
    y_true=y_test, y_score=pipe.predict_proba(X_test)[:,1]) # type: ignore

  return index, (pipe, auc), features

In [16]:
# Run the pipeline on a single dataset directory; the cell output shows the
# returned (index, (pipeline, AUC), features) tuple.
predict(path='data/01')

('01',
 (Pipeline(steps=[('standardscaler', StandardScaler()),
                  ('logisticregression', LogisticRegression(max_iter=10000))]),
  0.8189051234586866),
 ['aa_time_agnostic', 'aa_time_aware', 'na', 'sp'])