In [27]:
import pandas as pd
import pickle
import numpy as np
import torch

import plotly.express as px
import matplotlib.pyplot as plt

import sys
sys.path.append('..')


from configs.split_config import TG_SUPERPOP_DICT, ethnic_background_name_map

In [5]:
# Arguments for projecting into TG PC space with plink

# plink2 --extract variants --out ukb_1kg_projections --pfile /gpfs/gpfs0/ukb_data/plink/plink --read-freq tg_pca.acount --score tg_pca.eigenvec.allele 2 5 variance-standardize --score-col-nums 6-25

In [6]:
UKB_TG_PROJECTIONS_PATH = '/trinity/home/s.mishra/tg_pca/ukb_1kg_projections.sscore'

TG_ANCESTRY_MODEL_PATH = '/trinity/home/s.mishra/tg_pca/tg_pca.pkl'
SUPERPOPULATIONS_OUTPUT_PATH = '/trinity/home/s.mishra/uk-biobank/superpopulations.csv'

In [7]:
df_tg = pd.read_table('/trinity/home/s.mishra/tg_pca/tg_pca.sscore')
df_tg = df_tg.drop(['ALLELE_CT', 'NAMED_ALLELE_DOSAGE_SUM'], axis=1)
df_tg['pop'] = 'tg'
df_ukb = pd.read_table(UKB_TG_PROJECTIONS_PATH)
df_ukb = df_ukb.drop(['#FID', 'ALLELE_CT', 'NAMED_ALLELE_DOSAGE_SUM'], axis=1)[::10]
df_ukb['pop'] = 'ukb'
df_ukb.columns = df_tg.columns
df = pd.concat([df_ukb, df_tg])
px.scatter(df, x='PC1_AVG', y='PC2_AVG', color='pop')

In [8]:
superpopulations_map = {
    'EUR': 0,
    'SAS': 1,
    'AFR': 2,
    'EAS': 3,
    'AMR': 4
}

In [9]:
ukb_tg_projections = pd.read_table(UKB_TG_PROJECTIONS_PATH)
X = ukb_tg_projections.filter(like="_AVG").values
ancestry_model = pickle.load(open(TG_ANCESTRY_MODEL_PATH, 'rb'))
ancestry_model.eval()
pred_probs = ancestry_model.forward(torch.Tensor(X)).detach().numpy()

populations = list(sorted(TG_SUPERPOP_DICT.keys()))
ukb_tg_projections['pred_ancestry'] = np.vectorize(lambda index: populations[index])(np.argmax(pred_probs, axis=1))
ukb_tg_projections['pred_superpop'] = ukb_tg_projections.pred_ancestry.map(TG_SUPERPOP_DICT)
ukb_tg_projections['node_index'] = ukb_tg_projections.pred_superpop.map(superpopulations_map)
ukb_tg_projections.loc[:, ['IID', 'node_index']].to_csv(SUPERPOPULATIONS_OUTPUT_PATH, index=False)

In [10]:
ukb_tg_projections.pred_ancestry.value_counts()

CEU    315334
GBR    123389
IBS     20257
TSI      6063
PJL      5417
ACB      4607
ASW      2635
GIH      2166
YRI      1337
ITU      1159
BEB      1111
KHV       995
CHS       875
CHB       517
LWK       495
STU       456
MXL       398
FIN       303
JPT       271
MSL       166
CDX       111
CLM        89
PEL        74
GWD        65
PUR        55
ESN        32
Name: pred_ancestry, dtype: int64

In [11]:
ukb_tg_projections.pred_superpop.value_counts()

EUR    465346
SAS     10309
AFR      9337
EAS      2769
AMR       616
Name: pred_superpop, dtype: int64

In [None]:
px.scatter(ukb_tg_projections, x='SCORE1_AVG', y='SCORE2_AVG', color='pred_ancestry').write_html('ancestries_pc1v2.html')
px.scatter(ukb_tg_projections, x='SCORE3_AVG', y='SCORE4_AVG', color='pred_ancestry').write_html('ancestries_pc3v4.html')
px.scatter(ukb_tg_projections, x='SCORE3_AVG', y='SCORE4_AVG', color='pred_superpop').write_html('superpopulations_pc3v4.html')
px.scatter(ukb_tg_projections, x='SCORE1_AVG', y='SCORE2_AVG', color='pred_superpop').write_html('superpopulations_pc1v2.html')

In [21]:
from preprocess.splitter import SplitBase

df = SplitBase().get_ethnic_background()
df.index = df.IID
ukb_tg_projections.index =  ukb_tg_projections.IID
ukb_tg_projections['sr_ancestry_code'] = df.ethnic_background

In [35]:
for code in ukb_tg_projections.sr_ancestry_code.unique():
    print(f"EB code: {code}")
    if code in ethnic_background_name_map:
        print(f"Ethnicity: {ethnic_background_name_map[code]}")
    print("Value counts:")    
    print(ukb_tg_projections.loc[ukb_tg_projections.sr_ancestry_code == code].pred_ancestry.value_counts())
    print("\n")

EB code: nan
Value counts:
Series([], Name: pred_ancestry, dtype: int64)


EB code: 1001.0
Ethnicity: British
Value counts:
CEU    296520
GBR    115900
IBS     15227
TSI      1987
PJL        98
FIN        23
ASW        22
CHS        17
MXL        11
GIH         5
ITU         3
JPT         2
CDX         2
ACB         1
KHV         1
PUR         1
BEB         1
Name: pred_ancestry, dtype: int64


EB code: 1002.0
Ethnicity: Irish
Value counts:
CEU    8277
GBR    4234
IBS     145
TSI       8
PJL       3
Name: pred_ancestry, dtype: int64


EB code: 1003.0
Ethnicity: Any other white background
Value counts:
CEU    7261
IBS    4043
TSI    2131
GBR    1840
FIN     249
MXL     115
CLM      20
PJL      15
PEL       8
ASW       7
PUR       3
CDX       2
CHS       1
GIH       1
CHB       1
BEB       1
JPT       1
Name: pred_ancestry, dtype: int64


EB code: 4002.0
Ethnicity: African
Value counts:
YRI    985
ACB    977
ASW    562
LWK    368
MSL    134
TSI     73
GWD     49
PUR     19
ESN     17
IBS