In [2]:
import numpy as np
import pandas as pd
import os
from os import listdir
from os.path import isfile, join, getsize, dirname
from collections import Counter, OrderedDict
from sklearn.preprocessing import LabelEncoder
import shutil
import warnings
import hickle
import h5py
import pickle
import random
import gc
import time
import feather
import matplotlib.pyplot as plt
# Suppress every warning for the rest of the notebook (this also hides pandas warnings).
warnings.filterwarnings('ignore')
# Data locations, relative to the notebook's working directory.
# NOTE(review): `path` and `meta` climb six directory levels while `beacons` climbs only
# five — confirm the difference is intentional.
path = "../../../../../../zion/OpenSNP/people"
meta = "../../../../../../zion/OpenSNP/meta"
beacons = "../../../../../zion/OpenSNP/beacon"
# Sub-directory used by later cells for the per-checkpoint tBeacon_main_* / tMAF_main_* pickles.
main_path = join(beacons, "Main")

In [13]:
# Load the pickled `maf` table that the next cell joins each genotype chunk against.
# NOTE(review): pickle.load can execute arbitrary code from the file — only load trusted files.
with open(join(beacons, "RMAF_3034.pickle"), 'rb') as handle:
    maf = pickle.load(handle)

In [4]:
# Align every genotype chunk (fBeacon_<i>.ftr) to the rows of `maf` and write it back
# out as RBeacon_<i>.ftr with missing calls filled by "NN".
ind = np.arange(500,3001,500)

for i in ind:
    data = feather.read_dataframe(join(beacons, "fBeacon_"+str(i)+".ftr"))
    data.set_index("rs_id", inplace=True)
    print("Read in: ", data.shape)
    gc.collect()
    # Left join: the result has exactly the rows of `maf`, in `maf` order.
    data = maf.join(data, how="left")
    gc.collect()
    # Drop the two non-genotype columns (presumably the ones contributed by `maf` — confirm).
    del data["chr"]
    del data["count"]
    gc.collect()
    data.fillna("NN", inplace=True)
    gc.collect()
    print("Writing in: ", data.shape)
    # Feather cannot serialise a non-default index, so rs_id is moved back into a
    # column for the write (the same convention the Beacon.ftr cell uses). `data`
    # itself keeps its index.
    # NOTE(review): assumes the index of `maf` is named "rs_id" — confirm.
    data.reset_index().to_feather(join(beacons, "RBeacon_"+str(i)+".ftr"))

array([ 500, 1000, 1500, 2000, 2500, 3000])

In [None]:
# Concatenate the genotype chunks column-wise into one beacon table and save it.
ind = np.arange(1000,3001,500)
beacon = feather.read_dataframe(join(beacons, "fBeacon_"+str(500)+".ftr"))
beacon.set_index("rs_id", inplace=True)
gc.collect()

for i in ind:
    data = feather.read_dataframe(join(beacons, "fBeacon_"+str(i)+".ftr"))
    data.set_index("rs_id", inplace=True)
    print("Read in: ", data.shape)
    gc.collect()
    # axis=1 aligns rows on the rs_id index and appends the chunk's columns.
    beacon = pd.concat([beacon, data], axis=1)
    gc.collect()

# Report the shape of the table that is actually written; this used to print the
# shape of the last chunk read (`data`) instead.
print("Writing in: ", beacon.shape)
# Feather needs a default index, so rs_id goes back into a regular column.
beacon.reset_index(inplace=True)
beacon.to_feather(join(beacons, "Beacon.ftr"))

In [None]:
######################################################################################################################

In [6]:
# Reload the merged beacon and make rs_id the row index again.
beacon = feather.read_dataframe(join(beacons, "Beacon.ftr")).set_index("rs_id")

In [14]:
# Show both shapes side by side; later cells assign per-row results computed from
# `beacon` into `maf`, so the row counts need to match.
print(beacon.shape)
print(maf.shape)

(2338573, 2979)
(2338573, 2)


In [15]:
# Create the four allele columns with placeholder values ("-" for letters, 0 for frequencies).
for column, placeholder in (("major", "-"), ("major_freq", 0), ("minor", "-"), ("minor_freq", 0)):
    maf[column] = placeholder

In [16]:
%%time
def calculate(item):
    """Concatenate one row of genotype strings and remove every "N" character.

    item: 1-D sequence of strings, e.g. ["AG", "NN", "AA"].
    Returns the remaining letters as one string ("AGAA" for the example).
    """
    line = ''.join(item).replace("N","")
    return line

def foo(item):
    """Return the characters of the string stored in item[0] as a list."""
    return list(item[0])

# Count the allele letters of every SNP row.
# np.apply_along_axis is deliberately avoided here: it allocates its output using the
# dtype of the FIRST row's result (a fixed-width string), so every later row that is
# longer than the first one would be silently truncated and under-counted.
# Counting straight from the joined string also avoids materialising one Python list
# of characters per row; the intermediate `res` is therefore no longer created.
# Each entry is [(allele, count), ...] in first-seen order (Counter keeps insertion order).
counts = [list(Counter(calculate(row)).items()) for row in beacon.values]

# Rows hold different numbers of alleles, so the 1-D object array is built explicitly:
# np.array() on ragged lists is rejected by current NumPy, and on uniformly sized
# lists it would produce a 3-D string array instead of one list per SNP.
result = np.empty(len(counts), dtype=object)
for k, pairs in enumerate(counts):
    result[k] = pairs
result.shape

CPU times: user 21min 47s, sys: 2min 2s, total: 23min 49s
Wall time: 23min 33s


In [17]:
%%time
# Take the first two (allele, count) pairs of every SNP; missing pairs become ("-", 0).
# NOTE(review): the pairs are in first-seen order, so "major" is the first allele
# encountered, not necessarily the most frequent one.
first = [pairs[0] if pairs else ("-", 0) for pairs in result]
second = [pairs[1] if len(pairs) > 1 else ("-", 0) for pairs in result]

maf["major"] = [allele for allele, _count in first]
maf["major_freq"] = [count for _allele, count in first]
maf["minor"] = [allele for allele, _count in second]
maf["minor_freq"] = [count for _allele, count in second]

# Turn the two counts into fractions of their sum; 0/0 yields NaN, replaced by 0 below.
t = maf["major_freq"] + maf["minor_freq"]
maf["major_freq"] = maf["major_freq"].div(t)
maf["minor_freq"] = maf["minor_freq"].div(t)
maf = maf.fillna(0)
maf

CPU times: user 21.1 s, sys: 1.03 s, total: 22.1 s
Wall time: 4.42 s


In [18]:
# Create Reference
def getReference(maf):
    """Build the reference genotype of every SNP as an (n, 1) array.

    The reference is the more frequent of the `major` / `minor` alleles written
    twice (e.g. "AA"); on a tie the `major` allele is used.

    maf: frame with columns major, major_freq, minor, minor_freq whose index (or a
         column) is named "rs_id".
    Returns the references ordered by rs_id.
    NOTE(review): rows are re-sorted by rs_id, so the result lines up with another
    table only if that table is in rs_id order too — confirm against the caller.
    """
    major_f = maf['major_freq']
    minor_f = maf['minor_freq']

    def homozygous(rows, allele_col):
        # Doubled allele letter of `allele_col`, stored in a new "normal" column.
        return rows.assign(normal=rows[allele_col] + "" + rows[allele_col])

    parts = [
        homozygous(maf.loc[major_f > minor_f], 'major'),
        homozygous(maf.loc[major_f == minor_f], 'major'),  # TODO: tie handling
        homozygous(maf.loc[major_f < minor_f], 'minor'),
    ]
    ordered = pd.concat(parts, axis=0).sort_values(by=['rs_id'])
    return np.expand_dims(ordered["normal"].values, axis=1)

# Derive the (n, 1) reference-genotype array from the allele frequencies in `maf`.
reference = getReference(maf)

In [19]:
# Persist the `maf` table, including the allele columns filled in above.
maf.to_pickle(join(beacons, "MAF.pickle"))

In [20]:
# Persist the reference array with the highest pickle protocol available.
with open(join(beacons, "Reference.pickle"), 'wb') as handle:
    pickle.dump(reference, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
######################################################################################################################

In [None]:
def findUserIndex(fileName):
    """Return the integer user id encoded at the start of a file name.

    The first four characters are discarded and everything up to the first "_"
    is parsed as an int, e.g. "user42_file7_..." -> 42.
    """
    userId, _, _ = fileName[4:].partition("_")
    return int(userId)

def findFileIndex(fileName):
    """Return the integer file id held in the second "_"-separated field.

    The field's first four characters are discarded and the rest is parsed as an
    int, e.g. "user42_file7_..." -> 7.
    """
    secondField = fileName.split("_")[1]
    return int(secondField[4:])

def findSkipCount(fileName):
    """Count the leading lines of a genotype file that precede the data rows.

    Leading lines that start with "#" or a space are counted. If the first line
    after them is a column header starting with "rs-id", it is counted as well.

    fileName: name of a file inside the module-level `path` directory.
    Returns the number of lines to skip (0 for an empty file).
    """
    filePath = join(path, fileName)
    skip = 0
    with open(filePath, "r") as f:
        for line in f:
            if line.startswith(("#", " ")):
                skip += 1
                continue
            # The previous check compared the 6-character slice line[0:6] with the
            # 5-character literal "rs-id", which can never match a header row that
            # continues past the literal; a prefix test is what was intended.
            if line.startswith("rs-id"):
                skip += 1
            break
    return skip

def readClean(data):
    """Drop X/Y/MT rows and rows without a usable genotype call.

    data: frame with a "chromosome" column followed by one genotype column.
    Missing values and the codes "II", "--", "DD" and "DI" are all rewritten to
    "NN"; rows whose second column is "NN" afterwards are removed.
    Returns the filtered frame.
    """
    # Remove X,Y,MT chromosomes
    autosomal = ~data["chromosome"].isin(["X", "Y", "MT"])
    data = data[autosomal].fillna("NN")
    for code in ("II", "--", "DD", "DI"):
        data[data == code] = "NN"
    # Keep only the rows whose second column carries a call other than "NN".
    called = (data.iloc[:, 1] != "NN").values
    return data.iloc[np.flatnonzero(called)]

def readDf(file, rowSkip):
    """Load one tab-separated genotype file as a frame indexed by rs_id.

    file:    file name inside the module-level `path` directory.
    rowSkip: number of leading lines to skip before the data rows.
    Returns a frame with a "chromosome" column and one genotype column named after
    the user id that findUserIndex extracts from the file name.
    """
    data = pd.read_csv(join(path, file), sep="\t", header=None, skiprows=rowSkip)
    # The file is expected to have exactly these four columns; position is not needed.
    data.columns = ['rs_id', 'chromosome', 'position', 'allele']
    return (
        data.drop(columns='position')
            .set_index('rs_id')
            .rename(columns={"allele": findUserIndex(file)})
    )

def readFileComplete(fileName):
    """Read one genotype file and return it cleaned.

    Chains findSkipCount -> readDf -> readClean for the given file name.
    """
    return readClean(readDf(fileName, findSkipCount(fileName)))

def mergeClean(beacon):
    """Clean a merged block of genotype columns and split off its chromosome column.

    beacon: frame indexed by SNP id with a "chr" column plus one genotype column
            per person (column labels convertible to int).
    Returns (beacon, maf):
      beacon - genotype columns only, rows sorted by chromosome, restricted to
               SNPs whose id starts with "r" and that have more than one non-"NN"
               call; column labels converted to int.
      maf    - the numeric "chr" column for every de-duplicated, numeric-chromosome
               SNP, taken BEFORE the call-count and id filters are applied.
    """
    beacon = beacon.loc[~beacon.index.duplicated(keep='first')]
    # Keep numeric chromosomes only, then store the column as numbers. `assign`
    # returns a new frame, so nothing is written into a filtered slice.
    numeric_chr = pd.to_numeric(beacon['chr'], errors='coerce').notnull()
    beacon = beacon[numeric_chr]
    beacon = beacon.assign(chr=pd.to_numeric(beacon["chr"]))
    beacon = beacon.sort_values(by=['chr'])
    beacon = beacon.fillna("NN")
    maf = beacon[['chr']]
    beacon = beacon.drop(['chr'], axis=1)
    # A SNP is kept only when more than one person has a call for it.
    informative = np.sum(beacon.values != "NN", axis=1) > 1
    beacon = beacon.iloc[np.flatnonzero(informative)]
    # Keep ids that start with "r". Vectorised prefix test instead of indexing
    # label[0] in a Python loop: empty or non-string labels are now simply dropped
    # rather than raising.
    is_rsid = np.asarray(beacon.index.astype(str).str.startswith("r"), dtype=bool)
    beacon = beacon.iloc[np.flatnonzero(is_rsid)]
    beacon.columns = beacon.columns.astype(int)
    return beacon, maf

In [None]:
################################################################################################################################

In [None]:
# Trim except .23andme and small genome files
files = np.array([f for f in listdir(path) if isfile(join(path, f))], dtype=str)
# The format tag is the second-to-last dot-separated field of the name.
# Names without any "." get an empty tag instead of raising IndexError.
types = np.array([f.split(".")[-2] if "." in f else "" for f in files], dtype=str)
sizes = np.array([getsize(join(path, f)) for f in files], dtype=np.int64)
ind = np.logical_and(types == "23andme", sizes > 15 * 1000000)
files = files[ind]

# Deal with multiple file people, select newest one
user_filename = {}
for f in files:
    user_filename.setdefault(int(findUserIndex(f)),[]).append(f)
multiple_files = {k:v for (k,v) in user_filename.items() if len(v) > 1}

# For every user with several files keep the one with the highest file index
# (the first one on ties) and drop the rest in a single pass.
superseded = set()
for f_names in multiple_files.values():
    newest = max(f_names, key=findFileIndex)
    superseded.update(name for name in f_names if name != newest)
if superseded:
    files = files[~np.isin(files, list(superseded))]

# Rebuild the mapping as exactly one file name per user id.
user_filename = {}
for f in files:
    user_filename[int(findUserIndex(f))] = f
user_ind = np.array(list(user_filename.keys()))

In [None]:
################################################################################################################################

In [None]:
# Read phenotype file
# NOTE(review): pickle.load can execute arbitrary code from the file — only load trusted files.
with open(join(beacons, "OpenSNP_Phenotype.pickle"), 'rb') as handle:
    pheno = pickle.load(handle)
print(pheno.shape)

# Trim people have less phenotypes than threshold
people_thres = 0
# Entries different from "-" per row.
x = (pheno != "-").sum(axis=1)
pheno = pheno[x >= people_thres]
pheno.shape

# Keep only the genotype files whose user id appears in the phenotype index.
files = [name for user, name in user_filename.items() if user in pheno.index.values]
len(files)

In [None]:
# Build Beacon
# Files are outer-merged on rs_id one at a time. After every 100th file, and after the
# last one, the merged block is cleaned, saved, and a new block is started.

def _startBlock(fileName):
    """Read one genotype file as the first member of a new merge block."""
    return readFileComplete(fileName).rename(columns={"chromosome": "chr"})

def _saveCheckpoint(block, step):
    """Clean a merged block and pickle it together with its chromosome table."""
    print("Cleaning main beacon started.")
    cleaned, chrom = mergeClean(block)
    print("Cleaned main beacon.")
    # SAVE
    cleaned.to_pickle(join(main_path, "tBeacon_main_"+str(step)+".pickle"))
    chrom.to_pickle(join(main_path, "tMAF_main_"+str(step)+".pickle"))
    return cleaned, chrom

print("Started main beacon build.")
beacon = _startBlock(files[0])
last = len(files) - 1
i = 1
while i < len(files):
    start = time.time()
    try:
        data = readFileComplete(files[i])
        # Work on a temporary so a failure part-way through leaves `beacon` untouched.
        merged = pd.merge(beacon, data, left_index=True, right_index=True, how='outer')
        # Plain assignment instead of a chained inplace fillna, which does not update
        # the frame when pandas copy-on-write is active.
        merged["chr"] = merged["chr"].fillna(merged["chromosome"])
        beacon = merged.drop("chromosome", axis=1)
    except Exception as err:
        # A bare `except:` would also swallow KeyboardInterrupt; report why the file was skipped.
        print("File " + files[i] + " is skipped.\n")
        print("Reason: " + repr(err))
    end = time.time()
    print(str(i) + ". step is completed in " + str(end - start) + " seconds.")

    if i % 100 == 0 or i == last:
        beacon, maf = _saveCheckpoint(beacon, i)
        if i != last:
            i+=1
            print("\n" + str(i) + " has started")
            beacon = _startBlock(files[i])
            # If the file that opens the new block is also the last file, the loop
            # ends right after this iteration, so the block must be saved here.
            if i == last:
                beacon, maf = _saveCheckpoint(beacon, i)
    i+=1
print("Ended main beacon build.\n")

In [None]:
# List the index labels that occur more than once in `beacon` (prints [] when all are unique).
print([item for item, count in Counter(beacon.index.values).items() if count > 1])

In [None]:
################################################################################################################################

In [None]:
# Join MAF's
# Start from the final checkpoint's chromosome table and fold in every 100-step checkpoint.
# NOTE(review): the input checkpoint is 3033 but the output file is named OMAF_3031 — confirm.
with open(join(main_path, "tMAF_main_"+str(3033)+".pickle"), 'rb') as handle:
    maf = pickle.load(handle)
ind = np.arange(100,3001,100)
for i in ind:
    with open(join(main_path, "tMAF_main_"+str(i)+".pickle"), 'rb') as handle:
        data = pickle.load(handle)
    data = data.rename(columns={"chr": "chromosome"})
    maf = maf.merge(data, left_index=True, right_index=True, how='outer')
    # SNPs that are new in this checkpoint take their chromosome from it.
    maf["chr"] = maf['chr'].fillna(maf['chromosome'])
    maf = maf.drop(columns="chromosome")
    print(i, " is completed.")
# Keep only the ids whose first character is "r".
ind = [j for j, label in enumerate(maf.index) if label[0] == "r"]
maf = maf.iloc[ind]
# Drop chromosome codes 0, 25 and 26.
ii = ~maf["chr"].isin([0, 25, 26])
maf = maf[ii]
maf.to_pickle(join(beacons, "OMAF_3031.pickle"))

In [None]:
# Join Beacons
# Outer-merge the checkpoint pickles in groups of `block_size` and save every finished
# group as a parquet file named after the group's last checkpoint.
# NOTE(review): `ind` holds 26 checkpoints, so the last one (3000) opens a group that
# this loop never merges into or saves — confirm it is handled elsewhere.
j = 0
block_size = 5
ind = np.arange(500, 3001, 100)
for i in ind:
    if j % block_size == 0:
        # First checkpoint of a group: it becomes the new merge base.
        j += 1
        with open(join(main_path, "tBeacon_main_" + str(i) + ".pickle"), 'rb') as handle:
            beacon = pickle.load(handle)
        print(" NEW START ", i, " is started --> ", beacon.shape)
        continue

    start = time.time()

    print("", i, " is started --> ", beacon.shape)
    with open(join(main_path, "tBeacon_main_" + str(i) + ".pickle"), 'rb') as handle:
        data = pickle.load(handle)

    cp1 = time.time()
    print("Data is loaded in ", cp1 - start, " seconds")

    beacon = pd.merge(beacon, data, left_index=True, right_index=True, how="outer")

    cp2 = time.time()
    print("Merge is done in ", cp2 - cp1, " seconds")
    j += 1
    if j % block_size == 0:
        print("SAVING MERGINGS ", i)
        # Fill through the DataFrame API. Writing into `beacon.values` only reaches
        # the frame when pandas happens to return a view; the write is lost when a
        # copy is returned and fails when the array is read-only.
        beacon.fillna("NN", inplace=True)
        cp3 = time.time()
        print("Filling NN is done in ", cp3 - cp2, " seconds")
        #beacon.to_pickle(join(beacons, "kBeacon_" + str(i) + ".pickle"))
        beacon.to_parquet(join(beacons, "kBeacon_" + str(i) + ".parquet"), engine='fastparquet')
        print()

In [None]:
%%time
# Load the merged group that was saved under checkpoint 1400.
beacon_1400 = pd.read_parquet(join(beacons, "kBeacon_"+str(1400)+".parquet"), engine='fastparquet')

In [None]:
%%time
# Load the merged group that was saved under checkpoint 1900.
beacon_1900 = pd.read_parquet(join(beacons, "kBeacon_"+str(1900)+".parquet"), engine='fastparquet')

In [None]:
# NOTE(review): merges whatever `beacon` and `data` currently hold from previously
# executed cells, so the result depends on execution order.
beacon = pd.merge(beacon, data, left_index=True, right_index=True, how="outer")

In [None]:
# Fill missing calls through the DataFrame API. Writing into `beacon.values` only
# reaches the frame when pandas happens to return a view, and fails when the array
# is read-only.
beacon.fillna("NN", inplace=True)

In [None]:
# NOTE(review): `i` is whatever the most recently executed loop left behind, so the
# output file name depends on execution order.
beacon.to_parquet(join(beacons, "kBeacon_" + str(i) + ".parquet"), engine='fastparquet')

In [None]:
# `gc` is already imported in the first cell; the import is repeated here.
import gc
# Force a garbage-collection pass now.
gc.collect()

In [None]:
# Display the (rows, columns) shape of the checkpoint-1400 group loaded in an earlier cell.
beacon_1400.shape

In [None]:
# Save to a file
# NOTE(review): both writes below target the same path. hickle.dump opens it with
# mode='w', so the pickle written on the first line is overwritten immediately and
# the ".pickle" file ends up holding hickle output — confirm which format is wanted.
beacon.to_pickle(join(beacons, "Beacon_3031.pickle"))
hickle.dump(beacon, join(beacons, "Beacon_3031.pickle"), mode='w')
maf.to_pickle(join(beacons, "OpenSNP_MAF_3031.pickle"))

In [None]:
# Per SNP: number of "NN" entries, then expressed as a percentage of all columns.
ratios2 = (beacon == "NN").sum(axis=1)
ratios2 = ratios2.div(beacon.shape[1]/100).values

In [None]:
# Histogram of 100 - ratios in 5-point bins.
# NOTE(review): `ratios` is not defined in any visible cell (only `ratios2` is), so this
# cell depends on state from a cell that is no longer in the notebook — TODO confirm.
plt.figure()
_ = plt.hist(100-ratios, bins=np.arange(0,105,5), alpha=0.5, label='OpenSNP')
plt.title("OpenSNP")
plt.xticks(np.arange(0,105,5))
plt.xlabel("Percentage of Sequenced People")
plt.ylabel("Number of SNP's")
plt.legend(loc="upper left")
plt.show()

In [None]:
# Histogram of the per-SNP percentage of people with a call (100 - percentage of "NN"),
# in 5-point bins.
fig, ax = plt.subplots()
_ = ax.hist(100-ratios2, bins=np.arange(0,105,5), alpha=0.5, label='OpenSNP')
ax.set_title("OpenSNP")
ax.set_xticks(np.arange(0,105,5))
ax.set_xlabel("Percentage of Sequenced People")
ax.set_ylabel("Number of SNP's")
ax.legend(loc="upper left")
plt.show()

In [None]:
################################################################################################################################

In [None]:
'''
%%time
ind = np.arange(100,3001,100)
for i in ind:
    with open(join(main_path, "tBeacon_main_"+str(i)+".pickle"), 'rb') as handle:
        data = pickle.load(handle)
    print("Loaded data.")
    data = maf.join(data, how='left')
    data.fillna("NN", inplace=True)
    print("NN filled.")
    del data["chr"]
    print("Dropped chr.")
    data.to_pickle(join(beacons, "aBeacon_main"+str(i)+".pickle"))
    print(i, " is dumped, DONE.\n")
    break
'''

In [None]:
'''
with open(join(beacons, "MAF_3034.pickle"), 'rb') as handle:
    maf = pickle.load(handle)
print(maf.shape)

maf["count"] = 0

%%time
data2 = feather.read_dataframe(join(beacons, "fBeacon_3000.ftr"))
data2.set_index("rs_id", inplace=True)
print(data2.shape)

sums = np.sum(data2.values != "NN", axis=1)
temp = np.where(maf.index.isin(data2.index))
col = np.zeros(maf.shape[0])
col[temp] = sums
maf["current"] = col
maf["count"] += maf["current"]
gc.collect()

del maf["current"]
with open(join(beacons, "MAF_3034.pickle"), 'wb') as handle:
    pickle.dump(maf, handle)

maf2 = maf[maf["count"] >= 8]

with open(join(beacons, "RMAF_3034.pickle"), 'wb') as handle:
    pickle.dump(maf2, handle)
'''