In [1]:
# Necessary Imports
import numpy as np
import pandas as pd
import ast
import csv
import json
import os
from sklearn.model_selection import KFold, cross_validate

from classification.ElkanotoSVCClassifier import ElkanotoSVCClassifier
from dataPreparation import DataPreparation
from dataPreprocessing import DataPreprocessing

In [2]:
# Notebook-wide settings: which pipeline stages run, where the feature CSV lives,
# and which features are requested from the preprocessing step.

# Settings belonging to the (currently commented-out) data-preparation stage:
# doPreperation = False
# genomeFilePath = "genomes.csv"

# When True, the preprocessing cell extracts features and writes them to featureFilePath.
doPreprocessing = True
featureFilePath = "genomesWithFeatures.csv"

# Feature name -> parameter list, passed to extractFeaturesFromGenome.
# (For "kmer" the values are presumably the k-mer lengths — TODO confirm.)
# Alternative configuration kept for reference:
# chosenFeatures = { "kmer": [5, 8], "codon_translation": [], "dicodon_translation" : [], "sequence_length" : []}
chosenFeatures = dict(kmer=[2, 3, 6, 8])

In [4]:
# Feature extraction: writes one CSV row per genome file, without a header row:
#   <host directory name>,<file name up to the first ".">,<feature_1>,...,<feature_n>
# Guarded by doPreprocessing so the extraction can be skipped once the CSV exists.
if doPreprocessing:
    genomeRoot = "../viral_genomes"

    # os.scandir yields entries in arbitrary order; sort so the CSV row order is reproducible.
    hosts = sorted(f.name for f in os.scandir(genomeRoot) if f.is_dir())
    filesForHosts = {
        host: sorted(f.name for f in os.scandir(os.path.join(genomeRoot, host)) if f.is_file())
        for host in hosts
    }

    # Open with "w" rather than "a": appending made every re-run of this cell duplicate
    # all rows, and duplicated samples later end up in both train and test folds.
    with open(featureFilePath, "w") as featureFile:
        dataPreprocessor = DataPreprocessing.DataPreprocessing()
        for host in hosts:
            for filename in filesForHosts[host]:
                with open(os.path.join(genomeRoot, host, filename), "r") as file:
                    lines = file.readlines()
                    # Skip the first line (assumed to be a FASTA-style header — TODO confirm)
                    # and join the remaining lines into one sequence string.
                    sequence = "".join(lines[1:]).replace("\n", "")
                    # flatten feature dict to list (values are concatenated in dict order)
                    featureDict = dataPreprocessor.extractFeaturesFromGenome(sequence, chosenFeatures)
                    features = [x for v in featureDict.values() for x in v]
                    # write as csv
                    row = [host, filename.split(".")[0]] + [str(x) for x in features]
                    featureFile.write(",".join(row) + "\n")

In [6]:
# # Data Preparation of Training Data
# if doPreperation:
#     dataPreperator = DataPreparation.DataPreparation()
#     trainingGenomes = dataPreperator.loadTrainingGenomeSequences()
#     with open(genomeFilePath, mode="w", newline="") as file:
#       writer = csv.writer(file)
#       writer.writerow(["HostTaxID", "GenomeTaxID", "Sequence"])
#       for genome in trainingGenomes:
#           writer.writerow(genome)

# trainingGenomeData = pd.read_csv(genomeFilePath)
# trainingGenomeData = list(trainingGenomeData.itertuples(index=False, name=None))

In [10]:
# # Preprocessing of Training Data
# if doPreprocessing:
#     dataPreprocessor = DataPreprocessing.DataPreprocessing()
#     featureDataframe = dataPreprocessor.calculateFeatureMatrix(trainingGenomeData,chosenFeatures)
#     dataPreprocessor.saveFeatureMatrixAsCSV(featureDataframe, featureFilePath)

# featureData = pd.read_csv(featureFilePath)

# Convert the flat feature list into the right format for the classifier. TODO: do this more efficiently (ideally before writing the CSV)
# featureData['FeaturesFlat'] = featureData['FeaturesFlat'].apply(ast.literal_eval)
# print("Phase2")
# featureData['FeaturesFlat'] = featureData['FeaturesFlat'].apply(lambda x: [float(i) for i in x])

In [4]:
# Training of the PU-Classifier

# Define hosts used to read files and hosts to build a classifier for
hosts = [10090, 9534]
hostsToClassify = [10090]

# One fresh classifier instance per host we want to classify
classifiers = {host: ElkanotoSVCClassifier() for host in hostsToClassify}

# HostTaxID -> list of feature rows (one list of floats per datapoint);
# each list is converted to a np.array further down.
featuresPerHost = {host: [] for host in hosts}

# Read the header-less feature CSV: column 0 = host id, column 1 = genome id,
# columns 2.. = feature values. Rows of hosts not listed in `hosts` are ignored.
with open(featureFilePath) as file:
    for line in file:
        line = line.rstrip()
        if not line:
            # Tolerate blank/trailing lines instead of failing on int("")
            continue
        values = line.split(",")
        host = int(values[0])
        if host in featuresPerHost:
            featuresPerHost[host].append(list(map(float, values[2:])))

# A host without rows would become a 1-D empty array and make np.concatenate fail
# with an unrelated-looking dimension error, so report the actual problem instead.
hostsWithoutData = [host for host in hosts if not featuresPerHost[host]]
if hostsWithoutData:
    raise ValueError(
        f"No feature rows found in {featureFilePath!r} for host(s): {hostsWithoutData}"
    )

for host in hosts:
    featuresPerHost[host] = np.array(featuresPerHost[host])

# Feature matrix X: rows of all hosts stacked in the order of `hosts`
# (the label vectors built per host must follow the same order).
X = np.concatenate([featuresPerHost[host] for host in hosts])

def getYForHost(h):
    """Build the label vector for target host ``h``.

    Iterates over ``hosts`` in order and emits one label per feature row stored
    in ``featuresPerHost`` for that host: 1 if the host equals ``h``, else 0.
    The ordering matches a feature matrix built by concatenating the per-host
    arrays in ``hosts`` order.

    Returns:
        np.ndarray of 0/1 labels.
    """
    labels = [
        1 if candidate == h else 0
        for candidate in hosts
        for _ in range(len(featuresPerHost[candidate]))
    ]
    return np.array(labels)

# Run k-fold cross validation for every host that has its own classifier.
# scores: host id -> result dict returned by cross_validate.
scores = {}
for host in hostsToClassify:
    # X is shared by all hosts; only the binary labels depend on the target host
    y = getYForHost(host)

    # Fixed random_state: with shuffle=True and no seed the folds (and therefore
    # the reported metrics) changed on every run.
    cv = KFold(n_splits=4, shuffle=True, random_state=42)
    #cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    scores[host] = cross_validate(
        classifiers[host],
        X,
        y,
        cv=cv,
        scoring=['balanced_accuracy', 'recall', 'precision', 'f1'],
    )

NameError: name 'featureData' is not defined

In [12]:
# Evaluation of the Training Process

# Display the raw cross-validation results. `scores` maps each host id to the dict
# returned by cross_validate: per-fold arrays for fit_time, score_time and one
# test_<metric> entry per requested scoring metric.
# NOTE(review): the saved output below also lists host 9534 although hostsToClassify
# only contains 10090, so it appears to stem from an earlier run — re-run to refresh.
# NOTE(review): balanced_accuracy of 0.5 together with recall of 1.0 in every fold
# suggests the classifier labels every sample as positive — verify before trusting f1.
scores

{10090: {'fit_time': array([2.15569925, 2.35063291, 1.6938839 , 1.19818091]),
  'score_time': array([0.19839478, 0.28084207, 0.21902609, 0.19310904]),
  'test_balanced_accuracy': array([0.5, 0.5, 0.5, 0.5]),
  'test_recall': array([1., 1., 1., 1.]),
  'test_precision': array([0.82352941, 0.95588235, 0.85294118, 0.82089552]),
  'test_f1': array([0.90322581, 0.97744361, 0.92063492, 0.90163934])},
 9534: {'fit_time': array([1.057585  , 1.66847277, 1.07855105, 1.11113286]),
  'score_time': array([0.21942401, 0.29591918, 0.20665598, 0.21692204]),
  'test_balanced_accuracy': array([0.5, 0.5, 0.5, 0.5]),
  'test_recall': array([1., 1., 1., 1.]),
  'test_precision': array([0.13235294, 0.05882353, 0.20588235, 0.14925373]),
  'test_f1': array([0.23376623, 0.11111111, 0.34146341, 0.25974026])}}

In [None]:
# Visualization of the results