In [1]:
import pandas as pd
import numpy as np
import random
import math
import timeit
import itertools
import warnings
import seaborn as sn
import pickle
import feather
import gc
import sys
import matplotlib.pyplot as plt
from os.path import join, isfile
from collections import Counter, OrderedDict
from xgboost import XGBClassifier
from fcmeans import FCM
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning,
# which are relevant to the in-place edits done later in this notebook.
warnings.filterwarnings('ignore')
# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [2]:
# Widen the notebook container to 75% of the browser window.
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))

In [7]:
# Project root (relative to this notebook) and the folders derived from it.
mainPath = "../../GenomeReconstruction"
models = join(mainPath, "models")
beacons = join(mainPath, "beacon")
# One sub-folder per cohort underneath the beacon folder.
ceuPath, opensnpPath = (join(beacons, cohort) for cohort in ("CEU", "OpenSNP"))

In [None]:
%%time
# Read the raw OpenSNP beacon table and key its rows by the "rs_id" column.
beacon_file = join(opensnpPath, "Beacon_.ftr")
beacon = feather.read_dataframe(beacon_file).set_index("rs_id")

#### Clean

In [None]:
# Normalise the raw genotype strings with a single exact-match replacement.
# The previous version wrote through `beacon.values[...]`, which only changes
# the frame when `.values` happens to be a writable view of its data; when it
# is a copy the edits are silently lost, and when it is read-only they fail.
GENOTYPE_FIXES = {
    # Bad Shape
    "      AA": "AA",
    "AA      \"CURLY HAIR\"": "AA",
    "CC ": "CC",
    "CT ": "CT",
    "      AG": "AG",
    # Single ones
    "A": "AA",
    "C": "CC",
    "T": "TT",
    "G": "GG",
    # Symmetric ones
    "CA": "AC",
    "GA": "AG",
    "GC": "CG",
    "TA": "AT",
    "TC": "CT",
    "TG": "GT",
    # NN
    "I": "NN",
    "D": "NN",
    "0": "NN",
    "00": "NN",
    "ID": "NN",
}
# No replacement target is itself a key, so one pass gives the same result as
# applying the rules one after another.
beacon = beacon.replace(GENOTYPE_FIXES)

# Check: collect every distinct value left in the table, column by column.
all_pos = set()
for i in range(beacon.shape[1]):
    all_pos.update(beacon.iloc[:, i].unique())

#### Encode&Load Categorical

In [None]:
# Load the reference genotypes first: `reference` is needed to build `ref_df`
# below, and was previously only assigned at the end of this cell, so the cell
# raised NameError on a fresh kernel.
# NOTE(review): assumes Reference.pickle holds the layout that `reference.T[0]`
# expects (e.g. an (n, 1) array) — TODO confirm.
# NOTE: pickle.load executes arbitrary code; only load files you created.
with open(join(opensnpPath, "Reference.pickle"), 'rb') as handle:
    reference = pickle.load(handle)

# Store the beacon with categorical columns.
beacon = beacon.astype("category")
beacon.to_pickle(join(opensnpPath, "Beacon.pickle"))

# Store the reference as a one-column categorical frame aligned to the beacon index.
ref_df = pd.DataFrame(data=reference.T[0], index=beacon.index, columns=[0], dtype="category")
with open(join(opensnpPath, "Reference_c.pickle"), 'wb') as handle:
    pickle.dump(ref_df, handle)

# Load
with open(join(opensnpPath, "Beacon.pickle"), 'rb') as handle:
    beacon = pickle.load(handle)

#### Encode&Load Integer Categorical

In [None]:
# Fit one integer label per distinct genotype string collected in `all_pos`
# and persist the string -> integer mapping.
genotype_labels = list(all_pos)
le = LabelEncoder()
temp = le.fit_transform(genotype_labels)
dictionary = {label: code for label, code in zip(np.array(genotype_labels), temp)}
with open(join(opensnpPath, "encoding.pickle"), 'wb') as handle:
    pickle.dump(dictionary, handle)

# Encode the beacon one column at a time; each encoded column becomes one row
# of `encoded_beacon`, hence the transpose when rebuilding the frame.
encoded_beacon = np.array([le.transform(column) for _, column in beacon.items()])
beacon_ = pd.DataFrame(
    data=encoded_beacon.T,
    index=beacon.index,
    columns=beacon.columns,
    dtype="category",
)
# NOTE: this is the same file name the string-categorical beacon is saved under
# in the "Encode&Load Categorical" cell, so that file is overwritten here.
beacon_.to_pickle(join(opensnpPath, "Beacon.pickle"))

# Load
with open(join(opensnpPath, "Beacon.pickle"), 'rb') as handle:
    beacon_ = pickle.load(handle)
with open(join(opensnpPath, "encoding.pickle"), 'rb') as handle:
    enc = pickle.load(handle)

### Fix MAF

In [None]:
# Load the MAF table, normalise it, and write it back to the same file.
with open(join(opensnpPath, "MAF.pickle"), 'rb') as handle:
    maf = pickle.load(handle)

# Chromosome ids as the smallest integer dtype that fits.
maf["chr"] = pd.to_numeric(maf["chr"], downcast="integer")
# Row-wise minimum of the two allele frequencies.
maf["maf"] = maf[['major_freq','minor_freq']].min(axis=1)
# Remap chromosome 0 to 1 with a single .loc assignment. The previous chained
# form `maf["chr"][mask] = 1` may write to a temporary copy instead of `maf`.
# NOTE(review): the meaning of chromosome code 0 is not visible here — confirm.
maf.loc[maf["chr"] == 0, "chr"] = 1

with open(join(opensnpPath, "MAF.pickle"), 'wb') as handle:
    pickle.dump(maf, handle)

### Get Binary Beacon

In [None]:
%%time
# Materialise the genotype matrix once and reuse it for both comparisons.
genotypes = beacon.values
is_missing = genotypes == "NN"

# Binary beacon: True where a called (non-"NN") genotype differs from the reference.
y = np.logical_and(genotypes != reference.values, ~is_missing)
with open(join(opensnpPath, "BinaryBeacon.pickle"), 'wb') as handle:
    pickle.dump(y, handle, protocol=4)

# Ternary beacon: 1 = differs from reference, 0 = matches, -1 = "NN".
# The array must be cast to a signed integer type first: assigning -1 into a
# boolean array stores True, which would make "NN" look like a difference.
y = y.astype(np.int8)
y[is_missing] = -1
with open(join(opensnpPath, "TernaryBeacon.pickle"), 'wb') as handle:
    # Dump `y`; the previous code dumped `r1`, which is only defined in a later cell.
    pickle.dump(y, handle, protocol=4)

In [4]:
# Reload the binary beacon matrix from disk into `r1`.
binary_beacon_file = join(opensnpPath, "BinaryBeacon.pickle")
with open(binary_beacon_file, 'rb') as stream:
    r1 = pickle.load(stream)

# Separate Homozygous and Heterozygous

In [19]:
# Create Reference
def getHeterozygous(maf):
    """Build a doubled-allele genotype string for every row of ``maf``.

    Despite its name, the function returns a same-allele pair ("XX") per row:

    * ``major_freq <  minor_freq`` -> ``major + major``
    * ``major_freq == minor_freq`` -> ``major + major`` (marked TODO upstream)
    * ``major_freq >  minor_freq`` -> ``minor + minor``

    Rows for which none of the three comparisons holds (for example NaN
    frequencies) are left out. ``maf`` itself is not modified.

    Parameters
    ----------
    maf : pandas.DataFrame
        Needs ``major``, ``minor``, ``major_freq`` and ``minor_freq`` columns,
        and ``rs_id`` as something ``sort_values`` can sort by.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_rows_kept, 1)`` with the genotype strings, ordered
        by ``rs_id``.
    """
    major_freq, minor_freq = maf['major_freq'], maf['minor_freq']

    def doubled(rows, allele):
        # New frame with a "normal" column holding the chosen allele twice.
        return rows.assign(normal=rows[allele] + "" + rows[allele])

    pieces = [
        doubled(maf.loc[major_freq < minor_freq], 'major'),
        doubled(maf.loc[major_freq == minor_freq], 'major'),  # TODO
        doubled(maf.loc[major_freq > minor_freq], 'minor'),
    ]

    ordered = pd.concat(pieces, axis=0).sort_values(by=['rs_id'])
    # Column vector so it can be compared against a 2-D genotype table.
    return np.expand_dims(ordered["normal"].values, axis=1)

# Column vector (n, 1) of doubled-allele genotype strings built from `maf`
# by getHeterozygous, ordered by rs_id.
doubleminor = getHeterozygous(maf)

In [24]:
# Four-state encoding of the beacon; cells that match nothing stay 0.
# Codes are applied in this order, so a later match overrides an earlier one:
#   1 = equals `doubleminor`, 2 = equals `reference`, 3 = "NN".
data = np.zeros(beacon.shape, dtype=np.int8)
for code, target in ((1, doubleminor), (2, reference), (3, "NN")):
    data[beacon == target] = code
beacon_ = pd.DataFrame(
    data=data,
    index=beacon.index,
    columns=beacon.columns,
    dtype="category",
)

In [48]:
# Persist the four-state beacon frame with pickle protocol 4.
quad_beacon_file = join(opensnpPath, "QuadBeacon.pickle")
with open(quad_beacon_file, 'wb') as stream:
    pickle.dump(beacon_, stream, protocol=4)

In [50]:
# Force a full garbage-collection pass; the displayed value is the number of
# unreachable objects the collector found.
gc.collect()

20