In [135]:
# Necessary Imports
import numpy as np
import pandas as pd
import ast
from classification.ElkanotoSVCClassifier import ElkanotoSVCClassifier

In [149]:
# Host tax IDs that each get their own model
hosts = [
    "hosttaxID1",
    "hosttaxID2",
    "hosttaxID3",
]

In [150]:
# Build one fresh, untrained classifier per host, keyed by host ID
classifiers = {host: ElkanotoSVCClassifier() for host in hosts}

In [151]:
# Load the feature table and group the feature vectors by host

# Read the preprocessed features from disk
featureFilePath = "mock_features_preprocessing.csv"
featureData = pd.read_csv(featureFilePath)

# Each 'FeaturesFlat' cell is parsed with ast.literal_eval and every entry is
# cast to float, so the column ends up holding one flat list of floats per row
featureData['FeaturesFlat'] = featureData['FeaturesFlat'].apply(
    lambda raw: [float(value) for value in ast.literal_eval(raw)]
)

# Start with an empty bucket of feature vectors for every configured host
featuresPerHost = {host: [] for host in hosts}

# Put each row's feature vector (as a np array) into the bucket of its host;
# a 'HostTaxID' that is not listed in `hosts` raises a KeyError here
for hostId, featureList in zip(featureData['HostTaxID'], featureData['FeaturesFlat']):
    featuresPerHost[hostId].append(np.array(featureList))

# Turn every bucket (a list of vectors) into a single np array per host
for host in hosts:
    featuresPerHost[host] = np.array(featuresPerHost[host])


In [152]:
# Assemble the model input: the per-host feature arrays are joined along the
# first axis, in the same order as the entries of `hosts`
hostFeatureBlocks = [featuresPerHost[host] for host in hosts]
X = np.concatenate(hostFeatureBlocks, axis=0)

def getYForHost(h):
    """Build the binary label vector for host ``h``.

    Walks through ``hosts`` in order and emits one label per entry of
    ``featuresPerHost[host]``: 1 when that host equals ``h``, 0 otherwise.
    The labels therefore follow the same host order that is used when the
    per-host feature arrays are concatenated into ``X``.

    Parameters
    ----------
    h : host ID that is treated as the positive class.

    Returns
    -------
    np.ndarray holding one 0/1 label per sample across all hosts.
    """
    labels = [
        1 if candidate == h else 0
        for candidate in hosts
        for _ in range(len(featuresPerHost[candidate]))
    ]
    return np.array(labels)

In [156]:
# Run k-fold cross validation for each host.
# `scores` maps every host ID to the result dict returned by cross_validate.
from sklearn.model_selection import StratifiedKFold, KFold, cross_validate

scores = {}
for host in hosts:
    # The feature matrix X is shared by all hosts; only the labels change:
    # samples of the current host are labelled 1, all other samples 0.
    y = getYForHost(host)
    print("Shape of X:", X.shape)
    print("Length of y:", len(y))
    print(y)

    # Shuffled 2-fold split. random_state is fixed so that the folds, and with
    # them the reported scores, are the same on every run of the notebook.
    # (StratifiedKFold with the same arguments would additionally keep the
    # class ratio equal across folds.)
    cv = KFold(n_splits=2, shuffle=True, random_state=42)
    scores[host] = cross_validate(
        classifiers[host],
        X,
        y,
        cv=cv,
        scoring=['balanced_accuracy', 'recall', 'precision'],
    )

Shape of X: (45, 590)
Length of y: 45
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0]
Shape of X: (45, 590)
Length of y: 45
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0]
Shape of X: (45, 590)
Length of y: 45
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1]


In [157]:
# Show the raw cross-validation results: one entry per host, each holding the
# per-fold fit/score times and the test balanced accuracy, recall and precision
scores

{'hosttaxID1': {'fit_time': array([0.00525308, 0.00247407]),
  'score_time': array([0.00993395, 0.00561476]),
  'test_balanced_accuracy': array([1.        , 0.96666667]),
  'test_recall': array([1., 1.]),
  'test_precision': array([1.   , 0.875])},
 'hosttaxID2': {'fit_time': array([0.00224781, 0.00181127]),
  'score_time': array([0.004673  , 0.00483799]),
  'test_balanced_accuracy': array([1., 1.]),
  'test_recall': array([1., 1.]),
  'test_precision': array([1., 1.])},
 'hosttaxID3': {'fit_time': array([0.00176883, 0.00138807]),
  'score_time': array([0.003901  , 0.00372815]),
  'test_balanced_accuracy': array([1.        , 0.94444444]),
  'test_recall': array([1.        , 0.88888889]),
  'test_precision': array([1., 1.])}}