In [66]:
%matplotlib
import os

import matplotlib.pyplot as plt

import numpy as np
from numpy.random import binomial, normal
import pandas as pd
from scipy import spatial
from sklearn.decomposition import PCA
from math import floor


Using matplotlib backend: Qt5Agg


In [67]:
# Nominal sample budget and the fraction of it reserved for the test split.
total_number_of_samples = 1000
proportion_test_traning = 0.20

# Disease code and root data folder; both are used to build the input paths.
DISEASE = 'CDf'
DATA_DIR = 'datasets'

# The test share is rounded down; the training share is whatever remains.
size_test = floor(proportion_test_traning * total_number_of_samples)
size_training = total_number_of_samples - size_test
size_test, size_training

(200, 800)

In [68]:
def closest_sample(sample, dataset):
    """Return the row of ``dataset`` that is second-closest to ``sample``.

    A KD-tree is built over ``dataset`` and queried for the two nearest
    rows; the second hit is returned. When ``sample`` is itself a row of
    ``dataset`` the first hit is normally that row (distance 0), so the
    result is its nearest *other* row.
    """
    tree = spatial.KDTree(dataset)
    _, neighbour_indices = tree.query(sample, k=2)
    return dataset[neighbour_indices[1]]

In [69]:
def MUNGE(dataset, size_multiplier, probability_swap, local_variance):
    """Generate synthetic samples by perturbing each row towards its nearest neighbour.

    Parameters
    ----------
    dataset : 2-D array-like, shape (n_samples, n_features)
        Real samples, one per row. The input is never modified.
    size_multiplier : int
        Number of perturbed copies of ``dataset`` to generate and stack.
    probability_swap : float
        Probability, per feature, that the value is perturbed.
    local_variance : float
        Divisor applied to the feature-wise distance to the neighbour to
        obtain the standard deviation of the noise (larger -> less noise).

    Returns
    -------
    numpy.ndarray, shape (size_multiplier * n_samples, n_features)
        The perturbed copies stacked vertically, as floats.
    """
    dataset = np.asarray(dataset)

    # The neighbours depend only on the original data, so find them once for
    # the whole call instead of rebuilding a KD-tree for every single sample.
    # k=2 because the closest hit of a row is the row itself.
    _, neighbour_indices = spatial.KDTree(dataset).query(dataset, k=2)
    neighbours = dataset[neighbour_indices[:, 1]]

    output = []
    for _ in range(size_multiplier):
        # Work on a float copy so the drawn values are not truncated when the
        # input happens to have an integer dtype.
        new_dataset = np.array(dataset, dtype=float)
        for sample, t in zip(new_dataset, neighbours):
            for i, feature in enumerate(sample):
                if binomial(1, probability_swap) and feature != t[i]:
                    sd = np.abs(feature - t[i]) / local_variance
                    ea_prime = normal(feature, sd)
                    # Write through the row view: rebinding the loop variable
                    # would leave new_dataset untouched.
                    sample[i] = normal(ea_prime, sd)
        output.append(new_dataset)
    return np.vstack(output)

In [70]:
# All four input files live in <DATA_DIR>/true_data/HS_<DISEASE>/.
DISEASE_FOLDER = 'HS_' + DISEASE
true_data_dir = os.path.join(DATA_DIR, 'true_data', DISEASE_FOLDER)

# Feature tables for the training and test splits.
TRAINING_DATA_FILEPATH = os.path.join(
    true_data_dir, 'Sokol_16S_taxa_HS_' + DISEASE + '_commsamp_training.txt')
TEST_DATA_FILEPATH = os.path.join(
    true_data_dir, 'Sokol_16S_taxa_HS_' + DISEASE + '_commsamp_test.txt')

# Matching label files.
TRAINING_LABELS_FILEPATH = os.path.join(
    true_data_dir, 'Sokol_16S_taxa_HS_' + DISEASE + '_commsamp_training_lab.txt')
TEST_LABELS_FILEPATH = os.path.join(
    true_data_dir, 'Sokol_16S_taxa_HS_' + DISEASE + '_commsamp_test_lab.txt')

# Feature tables: tab-separated, first row is the header and the first column
# is used as the row index.
# Label files: tab-separated, no header, read as integers. The builtin `int`
# replaces `np.int`, an alias for it that was deprecated in NumPy 1.20 and
# removed in 1.24.
training_sample = pd.read_csv(TRAINING_DATA_FILEPATH, sep='\t', header=0, index_col=0)
training_labels = pd.read_csv(TRAINING_LABELS_FILEPATH, sep='\t', header=None, index_col=None, dtype=int)

test_sample = pd.read_csv(TEST_DATA_FILEPATH, sep='\t', header=0, index_col=0)
test_labels = pd.read_csv(TEST_LABELS_FILEPATH, sep='\t', header=None, index_col=None, dtype=int)

In [71]:
# Attach the labels to each split as a 'labels' column (modifies the frames in place).
training_sample['labels'] = training_labels.values
test_sample['labels'] = test_labels.values

# Stack both splits row-wise into one table and remember its column names.
samples = pd.concat((training_sample, test_sample))
names_features = samples.columns

In [72]:
# Row masks for each class, taken from the 'labels' column.
is_healthy = (samples['labels'] == 0).values
is_sick = (samples['labels'] == 1).values

# Plain arrays per class, with the last column sliced off.
healty = samples[is_healthy].values[:, :-1]
sick = samples[is_sick].values[:, :-1]

In [73]:
# Class counts in the real data and each class's share of the total.
number_healty, number_sick = healty.shape[0], sick.shape[0]
number_real_samples = number_healty + number_sick
prop_healty = number_healty / number_real_samples
prop_sick = number_sick / number_real_samples

In [74]:
# Generate 11 MUNGE passes over the real healthy samples and label them 0.
# Builtin `int` replaces `np.int`, which was removed from NumPy in 1.24.
new_healty = MUNGE(healty, 11, 0.1, 1)
new_healty = pd.DataFrame(new_healty, columns=names_features[:-1])
new_healty['labels'] = np.zeros(new_healty.shape[0], dtype=int)

# Test share rounded down; the training set gets the remainder.
nb_test_healty = floor(new_healty.shape[0] * proportion_test_traning)
nb_training_healty = new_healty.shape[0] - nb_test_healty

training_healty = new_healty.sample(nb_training_healty)
# Remove the training rows by index. Comparing cell values with isin() would
# fail to remove any row containing NaN, since NaN never compares equal.
# Note: from here on `new_healty` holds only the held-out rows.
new_healty = new_healty.drop(training_healty.index)
test_healty = new_healty.sample(nb_test_healty)

In [75]:
# Generate 11 MUNGE passes over the real sick samples and label them 1.
# Builtin `int` replaces `np.int`, which was removed from NumPy in 1.24.
new_sick = MUNGE(sick, 11, 0.1, 1)
new_sick = pd.DataFrame(new_sick, columns=names_features[:-1])
new_sick['labels'] = np.ones(new_sick.shape[0], dtype=int)

# Test share rounded down; the training set gets the remainder.
nb_test_sick = floor(new_sick.shape[0] * proportion_test_traning)
nb_training_sick = new_sick.shape[0] - nb_test_sick

# The training set is drawn with the *training* size and the test set with the
# *test* size; the two counts were previously swapped, which inverted the
# train/test proportions for this class.
training_sick = new_sick.sample(nb_training_sick)
# Remove the training rows by index. Comparing cell values with isin() would
# fail to remove any row containing NaN, since NaN never compares equal.
# Note: from here on `new_sick` holds only the held-out rows.
new_sick = new_sick.drop(training_sick.index)
test_sick = new_sick.sample(nb_test_sick)

In [76]:
# Merge both classes into one synthetic training table and one test table.
synt_training = pd.concat((training_healty, training_sick))
synt_test = pd.concat((test_healty, test_sick))

# Split each table into features (all columns but the last) and labels (the last column).
training_columns = synt_training.columns.values
synt_training_sample = synt_training[training_columns[:-1]]
synt_training_labels = synt_training[training_columns[-1]]

test_columns = synt_test.columns.values
synt_test_sample = synt_test[test_columns[:-1]]
synt_test_labels = synt_test[test_columns[-1]]

In [77]:
# Feature tables are written with their index and header; label files are
# written as bare values (no index, no header). All files are tab-separated.
label_file_options = dict(sep='\t', index=None, header=None)

synt_training_sample.to_csv('training_sample.txt', sep='\t')
synt_training_labels.to_csv('training_labels.txt', **label_file_options)
synt_test_sample.to_csv('test_sample.txt', sep='\t')
synt_test_labels.to_csv('test_labels.txt', **label_file_options)

In [78]:
# Stack the healthy and sick synthetic frames row-wise. At this point both
# frames hold only the rows left after their training rows were removed.
synt_samples = pd.concat((new_healty, new_sick))

In [79]:
# Project the synthetic samples onto their first two principal components.
# `[:, :-1]` drops the trailing 'labels' column. The previous `[:-1]` dropped
# the last *row* instead, so the labels were fed to the PCA as a feature and
# one sample per frame was silently lost.
pca = PCA(n_components=2)
pca.fit(synt_samples.values[:, :-1])
pca_synt_health = pca.transform(new_healty.values[:, :-1])
pca_synt_sick = pca.transform(new_sick.values[:, :-1])

In [80]:
# Scatter of the synthetic samples in PCA space: healthy in blue, sick in red.
fig = plt.figure("Generated", figsize=(4, 3))
plt.clf()  # start from an empty figure if "Generated" already exists
plt.scatter(pca_synt_health[:, 0], pca_synt_health[:, 1], c='b', alpha=0.25)
plt.scatter(pca_synt_sick[:, 0], pca_synt_sick[:, 1], c='r', alpha=0.25)

<matplotlib.collections.PathCollection at 0x7ff47a3e22e8>

In [81]:
# Refit the same PCA object on the real samples (label column excluded),
# then project each class.
true_features = samples.values[:, :-1]
pca.fit(true_features)
pca_heathy, pca_sick = pca.transform(healty), pca.transform(sick)

In [82]:
# Scatter of the real samples in PCA space: healthy in blue, sick in red.
fig = plt.figure("True", figsize=(4, 3))
plt.clf()  # start from an empty figure if "True" already exists
plt.scatter(pca_heathy[:, 0], pca_heathy[:, 1], c='b', alpha=0.25)
plt.scatter(pca_sick[:, 0], pca_sick[:, 1], c='r', alpha=0.25)

<matplotlib.collections.PathCollection at 0x7ff47a302518>

In [83]:
# Standard-normal draws arranged as 2 rows of 20000 values.
gaus = normal(size=(2, 20000))

In [84]:
# First row against second row of the Gaussian draws.
plt.scatter(gaus[0], gaus[1])

<matplotlib.collections.PathCollection at 0x7ff47bdf3dd8>

In [85]:
# Run MUNGE on the Gaussian draws (5 copies, swap probability 0.1, local variance 1).
# NOTE(review): MUNGE iterates over rows as samples, and `gaus` has shape
# (2, 20000), so this treats the data as 2 samples with 20000 features.
# Confirm whether `gaus.T` (20000 two-dimensional points) was intended.
lol = MUNGE(dataset=gaus, size_multiplier=5, probability_swap=0.1, local_variance=1)

In [86]:
# First row against second row of the MUNGE output.
plt.scatter(lol[0], lol[1])

<matplotlib.collections.PathCollection at 0x7ff47a365f28>