In [1]:
import numpy as np
import pandas as pd
import os
from os import listdir
from os.path import isfile, join, getsize, dirname
from collections import Counter, OrderedDict
import shutil
import warnings
import h5py
import pickle
import random
# Suppress every warning raised from this point on.
warnings.filterwarnings('ignore')

# Data directories, expressed relative to the notebook's working directory.
_opensnp_root = "../../../../../zion/OpenSNP"
path = _opensnp_root + "/people"
meta = _opensnp_root + "/meta"
phenotype = _opensnp_root + "/phenotypes"
beacon_path = _opensnp_root + "/beacon"

In [2]:
# Load the pickled SNP info table into `maf`.
# NOTE: unpickling runs code embedded in the file; only load trusted files.
snp_info_file = join(beacon_path, "SNP_Info.pickle")
with open(snp_info_file, 'rb') as fh:
    maf = pickle.load(fh)

In [26]:
# Phenotype features to use; each name is the stem of a "<name>.pickle" file.
remainingFeatures = [
    'EyeColor',
    'ColorBlindness',
    'HairType',
    'HairColor',
    'TanAbility',
    'Asthma',
    'LactoseIntolerance',
    'BloodType',
    'EarWax',
    'Freckling',
    'TongueRoller',
    'RingFinger',
    'BeardColor',
    'Intolerance',
]
remainingFeatures

['EyeColor',
 'ColorBlindness',
 'HairType',
 'HairColor',
 'TanAbility',
 'Asthma',
 'LactoseIntolerance',
 'BloodType',
 'EarWax',
 'Freckling',
 'TongueRoller',
 'RingFinger',
 'BeardColor',
 'Intolerance']

In [None]:
def _load_phenotype_table(feature):
    """Load `<feature>.pickle` from the phenotype directory with integer column labels.

    Every table gets the same integer cast that was previously applied to the
    first table only, so the column-overlap check below always compares labels
    of the same type.
    """
    # NOTE: unpickling runs code embedded in the file; only load trusted files.
    with open(join(phenotype, feature + ".pickle"), 'rb') as handle:
        table = pickle.load(handle)
    table.columns = table.columns.astype(int)
    return table


# Start from the first feature's table, then add the columns each further
# table contributes that are not already present.
beacon = _load_phenotype_table(remainingFeatures[0])

for feature in remainingFeatures[1:]:
    print("Started")
    data = _load_phenotype_table(feature)
    # Only columns not yet in `beacon` are merged, so shared columns are not duplicated.
    new_columns = np.setdiff1d(data.columns, beacon.columns)
    # Outer join on the row index keeps rows that appear in either table.
    beacon = pd.merge(beacon, data[new_columns], left_index=True, right_index=True, how='outer')
    print("Done")

In [None]:
# Save next to the other beacon artefacts: the following cell reloads
# "Beacon_1217.pickle" from `beacon_path`, so writing it to `meta` would leave
# a fresh run loading a file this notebook never wrote.
beacon.to_pickle(join(beacon_path, "Beacon_1217.pickle"))

In [31]:
# Reload the merged beacon table from disk.
beacon_file = join(beacon_path, "Beacon_1217.pickle")
with open(beacon_file, 'rb') as fh:
    beacon = pickle.load(fh)

In [32]:
# Show both shapes, one per line, to compare the row counts of the two tables.
print(beacon.shape, maf.shape, sep="\n")

(2151743, 1217)
(2151743, 6)


In [33]:
# Create the allele / frequency columns with placeholder values.
for column, placeholder in (
    ("major", "NN"),
    ("major_freq", 0),
    ("minor", "NN"),
    ("minor_freq", 0),
):
    maf[column] = placeholder

In [34]:
%%time
def calculate(item):
    """Concatenate the strings in `item` and strip every "N" character.

    Parameters
    ----------
    item : iterable of str
        The entries of one row.

    Returns
    -------
    str
        All characters of the row, in order, except "N".
    """
    joined = ''.join(item)
    return joined.replace("N", "")
# Build one allele string per row of `beacon`.
# np.apply_along_axis is avoided on purpose: it allocates its output with the
# dtype of the FIRST row's result, i.e. a fixed-width string, and silently
# truncates every longer row, which corrupts the allele counts. An object
# array keeps each string at its full length.
res = np.array([calculate(row) for row in beacon.values], dtype=object)

def foo(item):
    """Return the characters of `item[0]` as a list, in order."""
    return [char for char in item[0]]
# Split each row's allele string into a list of single characters.
res = res.reshape(res.shape[0],1)
res = [foo(res[i]) for i in range(len(res))]

# Per row: (allele, count) pairs ordered from most to least frequent, so that
# position 0 is the most common allele and position 1 the runner-up.
# Counter.items() is in first-seen order, which let a rarer allele take one of
# the first two slots when a row has three or more distinct characters.
allele_counts = [Counter(e).most_common() for e in res]

# Rows can hold different numbers of distinct alleles, so the list is ragged.
# np.array() on a ragged list raises ValueError on NumPy >= 1.24 (earlier
# versions only warned), and on a non-ragged list it would build a 3-D string
# array with the counts turned into text. Fill an object array row by row so
# every element stays a plain list of (allele, count) tuples.
result = np.empty(len(allele_counts), dtype=object)
for row_idx, counts in enumerate(allele_counts):
    result[row_idx] = counts
result.shape

CPU times: user 9min 21s, sys: 1min 16s, total: 10min 38s
Wall time: 10min 31s


In [35]:
%%time
# Unpack the first two (allele, count) pairs of every row of `result`.
# Rows with fewer pairs get the placeholder allele "N" with a count of 0.
first_allele, first_count = [], []
second_allele, second_count = [], []
for pairs in result:
    if pairs:
        first_allele.append(pairs[0][0])
        first_count.append(pairs[0][1])
    else:
        first_allele.append("N")
        first_count.append(0)
    if len(pairs) > 1:
        second_allele.append(pairs[1][0])
        second_count.append(pairs[1][1])
    else:
        second_allele.append("N")
        second_count.append(0)

maf["major"] = first_allele
maf["major_freq"] = first_count
maf["minor"] = second_allele
maf["minor_freq"] = second_count

# Convert the two counts into shares of their sum.
allele_total = maf["major_freq"] + maf["minor_freq"]
maf["major_freq"] = maf["major_freq"].div(allele_total)
maf["minor_freq"] = maf["minor_freq"].div(allele_total)
# Replaces NaN with 0 in every column (a zero total yields NaN above).
maf = maf.fillna(0)
maf["chr"] = maf["chr"].astype(int)
# "maf" is the smaller of the two frequencies.
maf["maf"] = maf[['minor_freq','major_freq']].min(axis=1)
maf

CPU times: user 20.7 s, sys: 656 ms, total: 21.3 s
Wall time: 3.27 s


In [24]:
# Persist the table with the computed allele frequencies.
maf_file = join(beacon_path, "MAF_1217.pickle")
maf.to_pickle(maf_file)

In [37]:
# Create Reference
def getReference(maf):
    """Build the reference genotype column from the allele frequencies.

    For each row the allele with the higher frequency is doubled (e.g. "A" ->
    "AA"). The result is ordered by `rs_id`.

    Parameters
    ----------
    maf : pandas.DataFrame
        Must provide `major`, `minor`, `major_freq`, `minor_freq` and `rs_id`.
        It is not modified.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(maf), 1) holding one reference genotype per row.
    """
    # Pick `minor` only where it is strictly more frequent. Ties fall back to
    # `major` (TODO: tie handling is still undecided), and so do rows whose
    # frequencies are NaN. Filtering with three separate masks dropped NaN rows,
    # so the reference no longer had one row per SNP.
    use_minor = maf['minor_freq'] > maf['major_freq']
    dominant = maf['minor'].where(use_minor, maf['major'])

    # assign() works on a copy, so nothing is written into a slice of `maf`.
    # A stable sort keeps the input order among rows that share an rs_id.
    ordered = maf.assign(normal=dominant + "" + dominant).sort_values(by=['rs_id'], kind='mergesort')
    reference = ordered["normal"].values
    reference = np.expand_dims(reference, axis=1)
    return reference

# Reference genotypes, plus a same-shaped array filled with 'NN'.
reference = getReference(maf=maf)
nn = np.full(shape=reference.shape, fill_value='NN')

In [40]:
# Store the reference array with the highest available pickle protocol.
reference_file = join(beacon_path, "Reference_1217.pickle")
with open(reference_file, 'wb') as out:
    pickle.dump(reference, out, protocol=pickle.HIGHEST_PROTOCOL)