In [1]:
import pandas as pd
import numpy as np
import random
import math
import timeit
import itertools
import warnings
import pickle
import feather
import gc
import sys
import os
import matplotlib.pyplot as plt
from os.path import join, isfile
from collections import Counter
from xgboost import XGBClassifier
from fcmeans import FCM
import scipy.stats as stats
from scipy.spatial import distance
from sklearn.metrics.pairwise import pairwise_distances

warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True, formatter={'float': lambda x: "{0:0.2f}".format(x)})
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

In [2]:
mainPath = "../../data"
beacons = join(mainPath, "beacon")
testSets = join("", "test_sets")
models = join(mainPath, "models")
ceuPath = join(beacons, "CEU")
opensnpPath = join(beacons, "OpenSNP")
inferencePath = join("", "inference")

#### STEP 1: Load Beacon, MAF, Reference and other cached variables

In [3]:
features = [ 'EyeColor','HairType','HairColor','TanAbility','Asthma','LactoseIntolerance',
             'EarWax','Freckling','TongueRoller','RingFinger','Intolerance','WidowPeak','ADHD','Acrophobia',
             'FingerHair','Myopia','IrritableBowel','IndexLongerBig','Photoptarmis','Migraine','RhProtein']
with open(join(opensnpPath, "OpenSNP_Phenotype.pickle"), 'rb') as handle:
    pheno = pickle.load(handle)
pheno = pheno[features]
pheno[pheno=="Auburn"] = "Blonde"
pheno[pheno=="Black"] = "Brown"

with open(join(opensnpPath, "MAF.pickle"), 'rb') as handle:
    maf = pickle.load(handle)

with open(join(opensnpPath, "Reference.pickle"), 'rb') as handle:
    reference = pickle.load(handle)
reference = reference.values

with open(join(opensnpPath, "Beacon.pickle"), 'rb') as handle:
    beacon = pickle.load(handle)

with open(join(opensnpPath, "BinaryBeacon.pickle"), 'rb') as handle:
    binary = pickle.load(handle)

# Baseline Classifier

In [4]:
def div(n, d):
    return n / d if d else 0

def rpaCalculate(tp,fp,tn,fn):
    recall = div(tp,(tp+fn)) 
    precision = div(tp,(tp+fp))
    accuracy = div((tp+tn),(tp+fp+tn+fn))
    return recall, precision, accuracy

def getData(phenotype, snp_pos, people_pos=[]):
    if people_pos == []:
        people_pos = np.arange(beacon.shape[1])
    # Find indices of people who has the specified feature
    feature_label = pheno[pheno[phenotype] != "-"][phenotype]
    existing = beacon.iloc[:, people_pos].columns.isin(feature_label.index.values)
    
    # Get training data
    X = binary[:, people_pos][snp_pos][:, existing].T
    Y = feature_label[beacon.iloc[:, people_pos].columns].values
    return X, Y

def performances(person, reconstruction, eval_pos, reference):
    ind = person[eval_pos] != reference[eval_pos]
    tp = np.sum(reconstruction[eval_pos][ind] != reference[eval_pos][ind])
    fn = np.sum(ind) - tp
    fp = np.sum(reconstruction[eval_pos][~ind] != reference[eval_pos][~ind])
    tn = np.sum(~ind) - fp

    return tp, fp, tn, fn

In [5]:
overall_recall = []
recall_snp_count = 0
overall_precision = []
precision_snp_count = 0

### Eye Color

In [None]:
with open("blue_eye.txt") as f:
    blues = f.readlines()
blues = np.array(list(set([x.strip() for x in blues])))

with open("not_blue.txt") as f:
    not_blue = f.readlines()
not_blue = np.array(list(set([x.strip() for x in not_blue])))
not_blue = np.setdiff1d(not_blue,blues)

not_blue = np.intersect1d(not_blue,beacon.index.values)
blues = np.intersect1d(blues,beacon.index.values)

nb_ind = np.where(np.in1d(beacon.index.values, not_blue))[0]
b_ind = np.where(np.in1d(beacon.index.values, blues))[0]
whole_ind = np.concatenate([nb_ind, b_ind])

x, y = getData("EyeColor", whole_ind)
x, y = getData("EyeColor", whole_ind, test_ind)

base_recon = np.zeros(x.shape, dtype=int)
base_recon[np.ix_(np.where(y == "Brown")[0], np.arange(0, len(nb_ind)))] = 1
base_recon[np.ix_(np.where(y == "Blue")[0], np.arange(len(nb_ind),len(whole_ind)))] = 1

# Reconstruct the whole genome also
#genome[]

recalls = []
precisions = []
for i in range(len(x)):
    reference = np.zeros(x[i].shape, dtype=int)
    r, p, _ = rpaCalculate(*performances(x[i], base_recon[i], np.arange(len(whole_ind)), reference))
    recalls.append(r)
    precisions.append(p)
print("Recall: ", np.mean(recalls), "\tPrecision: ", np.mean(precisions), "\tNumber of SNPs: ", len(whole_ind))

overall_recall.append(np.mean(recalls) * len(whole_ind))
recall_snp_count += len(whole_ind)
overall_precision.append(np.mean(precisions) * len(whole_ind))
precision_snp_count += len(whole_ind)

### Hair Color

In [None]:
hair_snps = ["rs12821256","rs12203592","rs1540771","rs35264875","rs3829241","rs12896399", "rs12896399","rs3212379","rs1805005",
             "rs34474212","rs1805006","rs2228479","rs34158934","rs11547464","rs1805007","rs201326893","rs1110400","rs1805008",
             "rs885479","rs555179612","rs200000734","rs1805009","rs368507952"]

hair_snps = list(set(hair_snps))
random.shuffle(hair_snps)
c1 = hair_snps[:len(hair_snps)//2]
c2 = hair_snps[len(hair_snps)//2:]

c1 = np.intersect1d(c1,beacon.index.values)
c2 = np.intersect1d(c2,beacon.index.values)

c1_ind = np.where(np.in1d(beacon.index.values, c1))[0]
c2_ind = np.where(np.in1d(beacon.index.values, c2))[0]
whole_ind = np.concatenate([c1_ind, c2_ind])

x, y = getData("HairColor", whole_ind)
x, y = getData("HairColor", whole_ind, test_ind)

base_recon = np.zeros(x.shape, dtype=int)
base_recon[np.ix_(np.where(y == "Blonde")[0], np.arange(0, len(c1_ind)))] = 1
base_recon[np.ix_(np.where(y == "Brown")[0], np.arange(len(c1_ind),len(whole_ind)))] = 1

recalls = []
precisions = []
for i in range(len(x)):
    reference = np.zeros(x[i].shape, dtype=int)
    r, p, _ = rpaCalculate(*performances(x[i], base_recon[i], np.arange(len(whole_ind)), reference))
    recalls.append(r)
    precisions.append(p)
print("Recall: ", np.mean(recalls), "\tPrecision: ", np.mean(precisions), "\tNumber of SNPs: ", len(whole_ind))

overall_recall.append(np.mean(recalls) * len(whole_ind))
recall_snp_count += len(whole_ind)
overall_precision.append(np.mean(precisions) * len(whole_ind))
precision_snp_count += len(whole_ind)

### Asthma

In [None]:
with open("asthma_snps.txt") as f:
    asthma = f.readlines()
asthma = np.array(list(set([x.strip() for x in asthma])))

asthma = np.intersect1d(asthma,beacon.index.values)
whole_ind = np.where(np.in1d(beacon.index.values, asthma))[0]

x, y = getData("Asthma", whole_ind)
x, y = getData("Asthma", whole_ind, test_ind)

base_recon = np.zeros(x.shape, dtype=int)
base_recon[np.ix_(np.where(y == "Yes")[0], np.arange(0, len(whole_ind)))] = 1

recalls = []
precisions = []
for i in range(len(x)):
    reference = np.zeros(x[i].shape, dtype=int)
    r, p, _ = rpaCalculate(*performances(x[i], base_recon[i], np.arange(len(whole_ind)), reference))
    recalls.append(r)
    precisions.append(p)
print("Recall: ", np.mean(recalls), "\tPrecision: ", np.mean(precisions), "\tNumber of SNPs: ", len(whole_ind))

overall_recall.append(np.mean(recalls) * len(whole_ind))
recall_snp_count += len(whole_ind)
overall_precision.append(np.mean(precisions) * len(whole_ind))
precision_snp_count += len(whole_ind)

### Myopia

In [None]:
with open("myopia_snps.txt") as f:
    myopia = f.readlines()
myopia = np.array(list(set([x.strip() for x in myopia])))

myopia = np.intersect1d(myopia,beacon.index.values)
whole_ind = np.where(np.in1d(beacon.index.values, myopia))[0]

x, y = getData("Myopia", whole_ind)
x, y = getData("Myopia", whole_ind, test_ind)

base_recon = np.zeros(x.shape, dtype=int)
base_recon[np.ix_(np.where(y == "High")[0], np.arange(0, len(whole_ind)))] = 1

recalls = []
precisions = []
for i in range(len(x)):
    reference = np.zeros(x[i].shape, dtype=int)
    r, p, _ = rpaCalculate(*performances(x[i], base_recon[i], np.arange(len(whole_ind)), reference))
    recalls.append(r)
    precisions.append(p)
print("Recall: ", np.mean(recalls), "\tPrecision: ", np.mean(precisions), "\tNumber of SNPs: ", len(whole_ind))

overall_recall.append(np.mean(recalls) * len(whole_ind))
recall_snp_count += len(whole_ind)
overall_precision.append(np.mean(precisions) * len(whole_ind))
precision_snp_count += len(whole_ind)

### Photoptarmis
https://www.nature.com/articles/s41598-019-41551-0

In [None]:
with open("photoptarmis_snps.txt") as f:
    photoptarmis = f.readlines()
photoptarmis = np.array(list(set([x.strip() for x in photoptarmis])))

photoptarmis = np.intersect1d(photoptarmis,beacon.index.values)
whole_ind = np.where(np.in1d(beacon.index.values, photoptarmis))[0]

x, y = getData("Photoptarmis", whole_ind)
x, y = getData("Photoptarmis", whole_ind, test_ind)

base_recon = np.zeros(x.shape, dtype=int)
base_recon[np.ix_(np.where(y == "Yes")[0], np.arange(0, len(whole_ind)))] = 1

recalls = []
precisions = []
for i in range(len(x)):
    reference = np.zeros(x[i].shape, dtype=int)
    r, p, _ = rpaCalculate(*performances(x[i], base_recon[i], np.arange(len(whole_ind)), reference))
    recalls.append(r)
    precisions.append(p)
print("Recall: ", np.mean(recalls), "\tPrecision: ", np.mean(precisions), "\tNumber of SNPs: ", len(whole_ind))

overall_recall.append(np.mean(recalls) * len(whole_ind))
recall_snp_count += len(whole_ind)
overall_precision.append(np.mean(precisions) * len(whole_ind))
precision_snp_count += len(whole_ind)

### Migraine

In [None]:
with open("migraine.txt") as f:
    migraine = f.readlines()
migraine = np.array(list(set([x.strip() for x in migraine])))

migraine = np.intersect1d(migraine,beacon.index.values)
whole_ind = np.where(np.in1d(beacon.index.values, migraine))[0]

x, y = getData("Migraine", whole_ind)
x, y = getData("Migraine", whole_ind, test_ind)

base_recon = np.zeros(x.shape, dtype=int)
base_recon[np.ix_(np.where(y == "Yes")[0], np.arange(0, len(whole_ind)))] = 1

recalls = []
precisions = []
for i in range(len(x)):
    reference = np.zeros(x[i].shape, dtype=int)
    r, p, _ = rpaCalculate(*performances(x[i], base_recon[i], np.arange(len(whole_ind)), reference))
    recalls.append(r)
    precisions.append(p)
print("Recall: ", np.mean(recalls), "\tPrecision: ", np.mean(precisions), "\tNumber of SNPs: ", len(whole_ind))

overall_recall.append(np.mean(recalls) * len(whole_ind))
recall_snp_count += len(whole_ind)
overall_precision.append(np.mean(precisions) * len(whole_ind))
precision_snp_count += len(whole_ind)

### Irritiable Bowel

In [None]:
irbow = ["rs62625044", "rs806378", "rs7209436", "rs242924"]
irbow = np.intersect1d(irbow,beacon.index.values)
whole_ind = np.where(np.in1d(beacon.index.values, irbow))[0]

x, y = getData("IrritableBowel", whole_ind)
x, y = getData("IrritableBowel", whole_ind, test_ind)

base_recon = np.zeros(x.shape, dtype=int)
base_recon[np.ix_(np.where(y == "Yes")[0], np.arange(0, len(whole_ind)))] = 1

recalls = []
precisions = []
for i in range(len(x)):
    reference = np.zeros(x[i].shape, dtype=int)
    r, p, _ = rpaCalculate(*performances(x[i], base_recon[i], np.arange(len(whole_ind)), reference))
    recalls.append(r)
    precisions.append(p)
print("Recall: ", np.mean(recalls), "\tPrecision: ", np.mean(precisions), "\tNumber of SNPs: ", len(whole_ind))

overall_recall.append(np.mean(recalls) * len(whole_ind))
recall_snp_count += len(whole_ind)
overall_precision.append(np.mean(precisions) * len(whole_ind))
precision_snp_count += len(whole_ind)

### ADHD

In [None]:
with open("adhd.txt") as f:
    adhd = f.readlines()
adhd = np.array(list(set([x.strip() for x in adhd])))

adhd = np.intersect1d(adhd,beacon.index.values)
whole_ind = np.where(np.in1d(beacon.index.values, adhd))[0]

x, y = getData("ADHD", whole_ind)
x, y = getData("ADHD", whole_ind, test_ind)

base_recon = np.zeros(x.shape, dtype=int)
base_recon[np.ix_(np.where(y == "Yes")[0], np.arange(0, len(whole_ind)))] = 1

recalls = []
precisions = []
for i in range(len(x)):
    reference = np.zeros(x[i].shape, dtype=int)
    r, p, _ = rpaCalculate(*performances(x[i], base_recon[i], np.arange(len(whole_ind)), reference))
    recalls.append(r)
    precisions.append(p)
print("Recall: ", np.mean(recalls), "\tPrecision: ", np.mean(precisions), "\tNumber of SNPs: ", len(whole_ind))

overall_recall.append(np.mean(recalls) * len(whole_ind))
recall_snp_count += len(whole_ind)
overall_precision.append(np.mean(precisions) * len(whole_ind))
precision_snp_count += len(whole_ind)

### Freckling

In [None]:
freck = ["rs251468","rs4752116", "rs10886142","rs17833789", "rs12259842", "rs10444039","rs10810635"]
freck = np.intersect1d(freck,beacon.index.values)
whole_ind = np.where(np.in1d(beacon.index.values, freck))[0]

x, y = getData("Freckling", whole_ind)
x, y = getData("Freckling", whole_ind, test_ind)

base_recon = np.zeros(x.shape, dtype=int)
base_recon[np.ix_(np.where(y == "Yes")[0], np.arange(0, len(whole_ind)))] = 1

recalls = []
precisions = []
for i in range(len(x)):
    reference = np.zeros(x[i].shape, dtype=int)
    r, p, _ = rpaCalculate(*performances(x[i], base_recon[i], np.arange(len(whole_ind)), reference))
    recalls.append(r)
    precisions.append(p)
print("Recall: ", np.mean(recalls), "\tPrecision: ", np.mean(precisions), "\tNumber of SNPs: ", len(whole_ind))

overall_recall.append(np.mean(recalls) * len(whole_ind))
recall_snp_count += len(whole_ind)
overall_precision.append(np.mean(precisions) * len(whole_ind))
precision_snp_count += len(whole_ind)

### Hair Type

In [None]:
hair_type = ["rs11803731","rs17646946","rs7349332"]
hair_type = np.intersect1d(hair_type,beacon.index.values)
whole_ind = np.where(np.in1d(beacon.index.values, hair_type))[0]

x, y = getData("HairType", whole_ind)
x, y = getData("HairType", whole_ind, test_ind)
print(x.shape)
base_recon = np.zeros(x.shape, dtype=int)
base_recon[np.ix_(np.where(y == "Curly")[0], np.arange(0, len(whole_ind)))] = 1

recalls = []
precisions = []
for i in range(len(x)):
    reference = np.zeros(x[i].shape, dtype=int)
    r, p, _ = rpaCalculate(*performances(x[i], base_recon[i], np.arange(len(whole_ind)), reference))
    recalls.append(r)
    precisions.append(p)
print("Recall: ", np.mean(recalls), "\tPrecision: ", np.mean(precisions), "\tNumber of SNPs: ", len(whole_ind))

overall_recall.append(np.mean(recalls) * len(whole_ind))
recall_snp_count += len(whole_ind)
overall_precision.append(np.mean(precisions) * len(whole_ind))
precision_snp_count += len(whole_ind)

In [None]:
print("Recall= ", np.sum(overall_recall) / recall_snp_count)
print("Precision= ", np.sum(overall_precision) / precision_snp_count)
print("Total Number of inferred SNPs= ",precision_snp_count)

# Correlations

In [6]:
remaining = ['TanAbility','LactoseIntolerance',
             'TongueRoller','Intolerance'
             ,'RhProtein', 'EarWax', 'Acrophobia', "IndexFinger"]
used_features = np.setdiff1d(features, remaining)
used_features

test_ids = pheno[used_features].iloc[np.where(np.sum(pheno != "-", axis = 1) >= len(used_features))[0]].index
test_ids = test_ids.map(str)
test_ind = np.where(beacon.columns.isin(test_ids))[0]
test_binary = binary[:, test_ind]

indices = []
genome = np.zeros(test_binary.shape, dtype=int)
eval_pos = []

In [7]:
with open("blue_eye.txt") as f:
    blues = f.readlines()
blues = np.array(list(set([x.strip() for x in blues])))

with open("not_blue.txt") as f:
    not_blue = f.readlines()
not_blue = np.array(list(set([x.strip() for x in not_blue])))
not_blue = np.setdiff1d(not_blue,blues)

not_blue = np.intersect1d(not_blue,beacon.index.values)
blues = np.intersect1d(blues,beacon.index.values)

nb_ind = np.where(np.in1d(beacon.index.values, not_blue))[0]
b_ind = np.where(np.in1d(beacon.index.values, blues))[0]
whole_ind = np.concatenate([nb_ind, b_ind])
eval_pos.append(whole_ind)

x, y = getData("EyeColor", whole_ind, test_ind)

genome[np.ix_(nb_ind, np.where(y == "Brown")[0])] = 1
genome[np.ix_(b_ind, np.where(y == "Blue")[0])] = 1

In [8]:
hair_snps = ["rs12821256","rs12203592","rs1540771","rs35264875","rs3829241","rs12896399", "rs12896399","rs3212379","rs1805005",
             "rs34474212","rs1805006","rs2228479","rs34158934","rs11547464","rs1805007","rs201326893","rs1110400","rs1805008",
             "rs885479","rs555179612","rs200000734","rs1805009","rs368507952"]

hair_snps = list(set(hair_snps))
hair_snps = np.intersect1d(hair_snps,beacon.index.values)
random.shuffle(hair_snps)

c1 = hair_snps[:len(hair_snps)//2]
c2 = hair_snps[len(hair_snps)//2:]

c1_ind = np.where(np.in1d(beacon.index.values, c1))[0]
c2_ind = np.where(np.in1d(beacon.index.values, c2))[0]
whole_ind = np.concatenate([c1_ind, c2_ind])
eval_pos.append(whole_ind)

x, y = getData("HairColor", whole_ind, test_ind)

# Total Genome
genome[np.ix_(c1_ind, np.where(y == "Blonde")[0])] = 1
genome[np.ix_(c2_ind, np.where(y == "Brown")[0])] = 1

In [9]:
with open("asthma_snps.txt") as f:
    asthma = f.readlines()
asthma = np.array(list(set([x.strip() for x in asthma])))

asthma = np.intersect1d(asthma,beacon.index.values)
whole_ind = np.where(np.in1d(beacon.index.values, asthma))[0]
eval_pos.append(whole_ind)

x, y = getData("Asthma", whole_ind, test_ind)

# Total Genome
genome[np.ix_(whole_ind, np.where(y == "Yes")[0])] = 1

In [10]:
with open("myopia_snps.txt") as f:
    myopia = f.readlines()
myopia = np.array(list(set([x.strip() for x in myopia])))

myopia = np.intersect1d(myopia,beacon.index.values)
whole_ind = np.where(np.in1d(beacon.index.values, myopia))[0]
eval_pos.append(whole_ind)

x, y = getData("Myopia", whole_ind, test_ind)

# Total Genome
genome[np.ix_(whole_ind, np.where(y == "High")[0])] = 1

In [11]:
with open("photoptarmis_snps.txt") as f:
    photoptarmis = f.readlines()
photoptarmis = np.array(list(set([x.strip() for x in photoptarmis])))

photoptarmis = np.intersect1d(photoptarmis,beacon.index.values)
whole_ind = np.where(np.in1d(beacon.index.values, photoptarmis))[0]
eval_pos.append(whole_ind)

x, y = getData("Photoptarmis", whole_ind, test_ind)

# Total Genome
genome[np.ix_(whole_ind, np.where(y == "Yes")[0])] = 1

In [12]:
with open("migraine.txt") as f:
    migraine = f.readlines()
migraine = np.array(list(set([x.strip() for x in migraine])))

migraine = np.intersect1d(migraine,beacon.index.values)
whole_ind = np.where(np.in1d(beacon.index.values, migraine))[0]
eval_pos.append(whole_ind)

x, y = getData("Migraine", whole_ind, test_ind)

genome[np.ix_(whole_ind, np.where(y == "Yes")[0])] = 1

In [13]:
irbow = ["rs62625044", "rs806378", "rs7209436", "rs242924"]
irbow = np.intersect1d(irbow,beacon.index.values)
whole_ind = np.where(np.in1d(beacon.index.values, irbow))[0]
eval_pos.append(whole_ind)

x, y = getData("IrritableBowel", whole_ind, test_ind)

genome[np.ix_(whole_ind, np.where(y == "Yes")[0])] = 1

In [14]:
with open("adhd.txt") as f:
    adhd = f.readlines()
adhd = np.array(list(set([x.strip() for x in adhd])))

adhd = np.intersect1d(adhd,beacon.index.values)
whole_ind = np.where(np.in1d(beacon.index.values, adhd))[0]
eval_pos.append(whole_ind)

x, y = getData("ADHD", whole_ind, test_ind)

genome[np.ix_(whole_ind, np.where(y == "Yes")[0])] = 1

In [15]:
freck = ["rs251468","rs4752116", "rs10886142","rs17833789", "rs12259842", "rs10444039","rs10810635"]
freck = np.intersect1d(freck,beacon.index.values)
whole_ind = np.where(np.in1d(beacon.index.values, freck))[0]
eval_pos.append(whole_ind)

x, y = getData("Freckling", whole_ind, test_ind)

genome[np.ix_(whole_ind, np.where(y == "Yes")[0])] = 1

In [16]:
hair_type = ["rs11803731","rs17646946","rs7349332"]
hair_type = np.intersect1d(hair_type,beacon.index.values)
whole_ind = np.where(np.in1d(beacon.index.values, hair_type))[0]
eval_pos.append(whole_ind)

x, y = getData("HairType", whole_ind, test_ind)
genome[np.ix_(whole_ind, np.where(y == "Curly")[0])] = 1

In [17]:
pos = []
for e in eval_pos:
    pos += list(e)
pos = np.sort(np.unique(pos))

In [18]:
recalls1 = []
precisions1 = []
for i in range(genome.shape[1]):
    reference = np.zeros(genome[:, i].shape, dtype=int)
    r, p, _ = rpaCalculate(*performances(test_binary[:, i], genome[:, i], pos, reference))
    recalls1.append(r)
    precisions1.append(p)
print("Recall: ", np.mean(recalls1), "\tPrecision: ", np.mean(precisions1))

Recall:  0.3353318371424009 	Precision:  0.2869309852403443


## Add Correlated SNPs

In [23]:
genome2 = genome.copy()

In [24]:
remaining = binary[:, np.setdiff1d(np.arange(beacon.shape[1]), test_ind)]
remaining_snps = np.setdiff1d(np.arange(beacon.shape[0]), pos)
x = 1-pairwise_distances(remaining[pos,:], remaining[remaining_snps,:] , metric = "sokalmichener", n_jobs=-1)

In [25]:
k = 500
recalls = []
precisions = []

for i in range(genome.shape[1]):
    snp_ind = np.where(np.in1d(pos, np.where(genome2[:, i] == 1)[0]))[0]    
    sums = np.sum(x[snp_ind,:], axis=0)
    correlated_snps = sums.argsort()[-k:][::-1]
    genome2[remaining_snps[correlated_snps], i] = 1
    
    # Performance
    e_pos = np.sort(np.concatenate([pos,remaining_snps[correlated_snps] ]))
    reference = np.zeros(genome2[:, i].shape, dtype=int)
    r, p, _ = rpaCalculate(*performances(test_binary[:, i], genome2[:, i], e_pos, reference))
    recalls.append(r)
    precisions.append(p)
    
    if i % 10 == 0:
        print(i)
        
print("Recall: ", np.mean(recalls), "\tPrecision: ", np.mean(precisions))

0
10
20
30
40
50
60
70
Recall:  0.3353318371424009 	Precision:  0.04024661325768025


In [26]:
rpaCalculate(*performances(test_binary[:, 3], genome2[:, 3], e_pos, reference))

(0.6233766233766234, 0.4067796610169492, 0.863448275862069)

In [None]:
genome2[]

In [None]:
'''
### IndexFinger
infin = ["rs314277"]
infin = np.intersect1d(infin,beacon.index.values)
whole_ind = np.where(np.in1d(beacon.index.values, infin))[0]

x, y = getData("IndexLongerBig", whole_ind)

base_recon = np.zeros(x.shape, dtype=int)
base_recon[np.ix_(np.where(y == "IndexLonger")[0], np.arange(0, len(whole_ind)))] = 1

recalls = []
precisions = []
for i in range(len(x)):
    reference = np.zeros(x[i].shape, dtype=int)
    r, p, _ = rpaCalculate(*performances(x[i], base_recon[i], np.arange(len(whole_ind)), reference))
    recalls.append(r)
    precisions.append(p)
print("Recall: ", np.mean(recalls), "\tPrecision: ", np.mean(precisions), "\tNumber of SNPs: ", len(whole_ind))

overall_recall.append(np.mean(recalls) * len(whole_ind))
recall_snp_count += len(whole_ind)
overall_precision.append(np.mean(precisions) * len(whole_ind))
precision_snp_count += len(whole_ind)



### Acrophobia
acrop = ["rs2323266"]
acrop = np.intersect1d(acrop,beacon.index.values)
whole_ind = np.where(np.in1d(beacon.index.values, acrop))[0]

x, y = getData("Acrophobia", whole_ind)

base_recon = np.zeros(x.shape, dtype=int)
base_recon[np.ix_(np.where(y == "Yes")[0], np.arange(0, len(whole_ind)))] = 1

recalls = []
precisions = []
for i in range(len(x)):
    reference = np.zeros(x[i].shape, dtype=int)
    r, p, _ = rpaCalculate(*performances(x[i], base_recon[i], np.arange(len(whole_ind)), reference))
    recalls.append(r)
    precisions.append(p)
print("Recall: ", np.mean(recalls), "\tPrecision: ", np.mean(precisions), "\tNumber of SNPs: ", len(whole_ind))

overall_recall.append(np.mean(recalls) * len(whole_ind))
recall_snp_count += len(whole_ind)
overall_precision.append(np.mean(precisions) * len(whole_ind))
precision_snp_count += len(whole_ind)


### EarWax
wax = ["rs17822931"]

wax = np.intersect1d(wax,beacon.index.values)
whole_ind = np.where(np.in1d(beacon.index.values, wax))[0]

x, y = getData("EarWax", whole_ind)

base_recon = np.zeros(x.shape, dtype=int)
base_recon[np.ix_(np.where(y == "Wet")[0], np.arange(0, len(whole_ind)))] = 1

recalls = []
precisions = []
for i in range(len(x)):
    reference = np.zeros(x[i].shape, dtype=int)
    r, p, _ = rpaCalculate(*performances(x[i], base_recon[i], np.arange(len(whole_ind)), reference))
    recalls.append(r)
    precisions.append(p)
print("Recall: ", np.mean(recalls), "\tPrecision: ", np.mean(precisions), "\tNumber of SNPs: ", len(whole_ind))

overall_recall.append(np.mean(recalls) * len(whole_ind))
recall_snp_count += len(whole_ind)
overall_precision.append(np.mean(precisions) * len(whole_ind))
precision_snp_count += len(whole_ind)

'''