In [1]:
# Necessary Imports
import numpy as np
import pandas as pd
import ast
import csv
import os
from sklearn.model_selection import StratifiedKFold, KFold, cross_validate

from classification.ElkanotoSVCClassifier import ElkanotoSVCClassifier
from dataPreparation import DataPreparation
from dataPreprocessing import DataPreprocessing

In [2]:
# Notebook configuration.
# The two switches decide whether the cached CSV files are regenerated (True)
# or only read back from disk (False).
doPreperation = False    # True: reload the training genomes and rewrite genomeFilePath
doPreprocessing = False  # True: recompute the feature matrix and rewrite featureFilePath

# CSV files that hold the intermediate results between runs
genomeFilePath = "genomes.csv"
featureFilePath = "genomeFeatures.csv"

# Features handed to calculateFeatureMatrix, keyed by feature name.
# The lists are presumably per-feature parameters (e.g. the k values for "kmer") - confirm in DataPreprocessing.
chosenFeatures = {
    "kmer": [2, 3, 7],
    "codon_translation": [],
    "dicodon_translation": [],
    "sequence_length": [],
}

In [3]:
# Data Preparation of Training Data
# With doPreperation set, the genome sequences are loaded again and genomeFilePath is rewritten;
# otherwise the CSV left behind by an earlier run is reused as-is.
if doPreperation:
    dataPreperator = DataPreparation.DataPreparation()
    trainingGenomes = dataPreperator.loadTrainingGenomeSequences()
    # newline="" is what the csv module expects for files it writes to
    with open(genomeFilePath, mode="w", newline="") as genomeFile:
        genomeWriter = csv.writer(genomeFile)
        genomeWriter.writerow(["HostTaxID", "GenomeTaxID", "Sequence"])
        genomeWriter.writerows(trainingGenomes)

# In both cases the genomes are read back from the CSV; every row becomes a plain tuple (no index, no field names)
trainingGenomeData = list(
    pd.read_csv(genomeFilePath).itertuples(index=False, name=None)
)

In [4]:
# Preprocessing of Training Data
# With doPreprocessing set, the feature matrix is recomputed from trainingGenomeData and written to
# featureFilePath; otherwise the CSV left behind by an earlier run is reused as-is.
if doPreprocessing:
    dataPreprocessor = DataPreprocessing.DataPreprocessing()
    featureDataframe = dataPreprocessor.calculateFeatureMatrix(trainingGenomeData, chosenFeatures)
    dataPreprocessor.saveFeatureMatrixAsCSV(featureDataframe, featureFilePath)

featureData = pd.read_csv(featureFilePath)

# The CSV stores each flat feature list as text: parse it back into a Python list and cast every entry to float.
# TODO: Do it better (best case before csv)
featureData['FeaturesFlat'] = featureData['FeaturesFlat'].apply(
    lambda serialized: [float(value) for value in ast.literal_eval(serialized)]
)

In [5]:
# Training of the PU-Classifier

# Hosts (identified by their HostTaxID) for which a model should be built
hosts = [10090, 9534]

# One untrained classifier per host, keyed by HostTaxID
classifiers = {host: ElkanotoSVCClassifier() for host in hosts}

# Collect the feature vectors of the virus sequences per host (HostTaxID -> list of feature arrays).
# Rows whose HostTaxID is not listed in hosts are skipped: they have no model here, and indexing
# featuresPerHost with them would raise a KeyError.
featuresPerHost = {host: [] for host in hosts}
for hostTaxID, flatFeatures in zip(featureData['HostTaxID'], featureData['FeaturesFlat']):
    if hostTaxID in featuresPerHost:
        featuresPerHost[hostTaxID].append(np.array(flatFeatures))

# Stack the feature vectors of every host into one array per host
for host in hosts:
    featuresPerHost[host] = np.array(featuresPerHost[host])

# Feature matrix X for the PU classifier: the per-host blocks concatenated in the order of hosts.
# getYForHost builds the labels in the same order, so X and y stay aligned.
X = np.concatenate([featuresPerHost[host] for host in hosts])

def getYForHost(h):
    """Build the label vector for the classifier of host ``h``.

    Walks over the module-level ``hosts`` list in order and emits one label per
    entry of ``featuresPerHost[host]``: 1 when that host equals ``h``, otherwise 0.
    The labels therefore line up with the per-host feature blocks concatenated
    in ``hosts`` order.

    Returns:
        A numpy array containing the 0/1 labels.
    """
    return np.array([
        1 if candidate == h else 0
        for candidate in hosts
        for _ in range(len(featuresPerHost[candidate]))
    ])

# Cross-validate one classifier per host.
# The training happens inside cross_validate: for every fold it fits a clone of the estimator it is
# given, so the objects in classifiers themselves stay unfitted and nothing is saved here.
# (cross_validate(..., return_estimator=True) would return the fitted per-fold estimators.)
# TODO: decide where/how the final classifiers should be trained and saved.

scores = {}
for host in hosts:
    # Labels for this host: 1 for its own sequences, 0 for all others; X is shared by all hosts
    y = getYForHost(host)

    # 4-fold cross validation. The fixed random_state makes the shuffled folds, and therefore
    # the scores, reproducible when the notebook is re-run.
    cv = KFold(n_splits=4, shuffle=True, random_state=42)
    # Alternative that keeps the class ratio in each fold:
    #cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    scores[host] = cross_validate(classifiers[host], X, y, cv=cv, scoring=['balanced_accuracy', 'recall', 'precision', 'f1'])

In [6]:
# Evaluation of the Training Process

# Show the raw cross_validate results: a dict keyed by HostTaxID whose values hold the fit/score
# times and the per-fold test metrics (balanced accuracy, recall, precision, f1).
# NOTE(review): in the recorded run every fold has recall 1.0 with balanced accuracy ~0.5 for both
# hosts, which is the pattern produced when (almost) every sample is predicted as positive -
# verify the classifier before trusting the precision/f1 values.
scores

{10090: {'fit_time': array([0.89850092, 0.83758998, 0.55769205, 0.47517991]),
  'score_time': array([0.16909909, 0.13218093, 0.06457686, 0.05828786]),
  'test_balanced_accuracy': array([0.5, 0.5, 0.5, 0.5]),
  'test_recall': array([1., 1., 1., 1.]),
  'test_precision': array([0.86764706, 0.85294118, 0.89705882, 0.8358209 ]),
  'test_f1': array([0.92913386, 0.92063492, 0.94573643, 0.91056911])},
 9534: {'fit_time': array([0.36164403, 0.46575809, 0.45738912, 0.41481328]),
  'score_time': array([0.07598686, 0.09598398, 0.07301188, 0.05946398]),
  'test_balanced_accuracy': array([0.51754386, 0.5       , 0.5       , 0.5       ]),
  'test_recall': array([1., 1., 1., 1.]),
  'test_precision': array([0.16666667, 0.16176471, 0.11764706, 0.10447761]),
  'test_f1': array([0.28571429, 0.27848101, 0.21052632, 0.18918919])}}

In [None]:
# Visualization of the results