In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from random import sample

In [2]:
# Read the per-participant distance table (one row per eid, with POB_*/POR_*
# coordinates and distances to assessment centres) and preview two rows.
pob_por_csv = '../00_csv_files/pob_por_ac_distances.csv'
pob_por_ac = pd.read_csv(pob_por_csv)
pob_por_ac.head(n=2)

Unnamed: 0,eid,center,distances,POB_east,POB_north,POR_east,POR_north,POB_distance,POR_distance,closest_center_POR,POR_distance_closest,POB_distance_closest,closest_center_POB
0,1136349,11004,229901.065678,68500,801500,252000.0,663000.0,234227.01893,7490.4871,11004.0,7490.4871,234227.01893,11004
1,1071275,11004,196729.509734,66500,798500,211000.0,665000.0,234138.707601,48107.415198,11004.0,48107.415198,234138.707601,11004


In [29]:
# Read the assessment-centre table (Name, Count, East/North coordinates, Code,
# ...) and preview two rows.
acs_csv = '../00_csv_files/assessmentCentreCoordinates.csv'
acs = pd.read_csv(acs_csv)
acs.head(n=2)

Unnamed: 0,Name,Count,East,North,Code,NUTS_ID,mapColors
0,Middlesborough,21287,449412,520448,11017,UKC12,25211334
1,Newcastle,37004,425606,563642,11009,UKC22,92202135


In [78]:
# Collect the eids of participants born near ANY assessment centre, i.e. whose
# POB_distance_closest is below 25000 (presumably metres, so 25 km - confirm
# the units), and report how many there are.
is_born_near = pob_por_ac['POB_distance_closest'] < 25000
born_near_any = pob_por_ac.loc[is_born_near, 'eid'].tolist()
print('born near any: ', len(born_near_any))
# Everyone else in the table (rows that did not pass the distance filter).
print(len(pob_por_ac) - len(born_near_any))

born near any:  355364
86572


In [65]:
# Build N random groups of assessment centres (ACs). Each group is grown one
# randomly ordered AC at a time until the number of participants whose
# closest_center_POB is one of the group's ACs reaches a fraction P of the
# 'born near any AC' count.
# NOTE(review): no random seed is set in the visible cells, so the groups
# differ from run to run.
P = 0.5
N = 10

target_minimum = int(P * len(born_near_any))
print('target minimum: ',target_minimum)
ac_codes = acs.Code.tolist()

# Only the head-count per AC matters for the threshold, so tally it once
# instead of re-filtering the whole frame for every AC of every group.
born_count_by_ac = pob_por_ac.closest_center_POB.value_counts()

ac_groups = []

for _ in range(N):
    # Start every group from scratch so nothing carries over from the
    # previous shuffle.
    selected_acs = []
    n_born_near_selected = 0
    for ac in sample(ac_codes, len(ac_codes)):
        selected_acs.append(ac)
        n_born_near_selected += int(born_count_by_ac.get(ac, 0))
        # Check right after adding: a group that is completed by the last AC
        # of the shuffle must still be recorded.
        if n_born_near_selected >= target_minimum:
            ac_groups.append(selected_acs)
            break
    else:
        # Every AC was used and the target was still not met; do not record a
        # group that fails the requirement.
        print('warning: target minimum not reached using all ACs; group skipped')

for group in ac_groups:
    print(group)

target minimum:  177682
[11022, 11023, 11006, 11003, 11005, 11012, 11004, 11008, 11020, 11014]
[11007, 11006, 11016, 11023, 11003, 11009, 11020, 11008, 11021]
[11017, 11022, 11007, 11023, 11009, 11010, 11004, 11001, 11011, 11020]
[11007, 11003, 11014, 11012, 11020, 11013, 11008, 11010]
[11010, 11018, 11006, 11020, 11023, 11005, 11002, 11014, 11003, 11009]
[11021, 11001, 11009, 11014, 11013, 11018, 11007, 11003, 11016]
[11002, 11010, 11001, 11008, 11020, 11004, 11006, 11013, 11005]
[11018, 11006, 11008, 11017, 11001, 11005, 11009, 11016, 11011]
[11017, 11016, 11005, 11004, 11014, 11006, 11007, 11020, 11002, 11012]
[11020, 11005, 11006, 11021, 11022, 11017, 11011, 11010, 11003]


In [75]:
# For each AC group, compare two sets of participants:
#   - those whose closest_center_POB is one of the group's ACs ("born near")
#   - those whose center is one of the group's ACs ("sampled at")
# and report the share of the born-near participants that is also sampled-at.
# Nothing is written to disk in this cell.
overlapPercentages = []
for group_number, group in enumerate(ac_groups, start=1):
    print('group: ', str(group_number))

    born_mask = pob_por_ac['closest_center_POB'].isin(group)
    born_near_these_acs = pob_por_ac.loc[born_mask, 'eid'].tolist()
    n_born_near = len(born_near_these_acs)
    print('born near these ACs:  ', n_born_near)

    sampled_mask = pob_por_ac['center'].isin(group)
    sampled_at_these_acs = pob_por_ac.loc[sampled_mask, 'eid'].tolist()
    print('sampled at these ACs: ',len(sampled_at_these_acs))

    shared_eids = set(sampled_at_these_acs).intersection(born_near_these_acs)
    overlap = len(shared_eids)
    print('overlap: ', overlap)

    # Percentage is relative to the born-near count, not the sampled-at count.
    percentOverlap = overlap / float(n_born_near) * 100
    print('percent overlap: ',percentOverlap)
    overlapPercentages.append(percentOverlap)

# Average percentage across all groups.
print(np.mean(overlapPercentages))

group:  1
born near these ACs:   195336
sampled at these ACs:  152869
overlap:  123512
percent overlap:  63.23053610189622
group:  2
born near these ACs:   195251
sampled at these ACs:  194407
overlap:  143864
percent overlap:  73.68156885240025
group:  3
born near these ACs:   185935
sampled at these ACs:  213346
overlap:  146388
percent overlap:  78.73073923683008
group:  4
born near these ACs:   205967
sampled at these ACs:  197286
overlap:  151937
percent overlap:  73.76764238931479
group:  5
born near these ACs:   195146
sampled at these ACs:  207477
overlap:  149776
percent overlap:  76.75074047123692
group:  6
born near these ACs:   203694
sampled at these ACs:  221706
overlap:  160200
percent overlap:  78.64738283896433
group:  7
born near these ACs:   186213
sampled at these ACs:  192938
overlap:  145008
percent overlap:  77.8721141918126
group:  8
born near these ACs:   198690
sampled at these ACs:  216640
overlap:  162166
percent overlap:  81.61759524888016
group:  9
born ne

In [77]:
# For every AC group, write two id lists to the working directory:
#   born_near_group_<k>.csv  - eids whose closest_center_POB is in the group
#   sampled_at_group_<k>.csv - eids whose center is in the group
# Each line is '<eid> <eid>' (the id twice, space-separated) - presumably the
# two-column sample-list layout the downstream GWAS tooling expects; confirm.
def _write_eid_pairs(path, eids):
    """Write one '<eid> <eid>' line per id in `eids` to `path`, replacing any existing file."""
    # The context manager closes the handle even if a write fails part-way.
    with open(path, 'w') as handle:
        for eid in eids:
            eid_text = str(eid)
            handle.write(eid_text + ' ' + eid_text + '\n')


for i, group in enumerate(ac_groups):
    print('group: ', str(i+1))
    born_near_file = 'born_near_group_' + str(i+1) + '.csv'
    sampled_at_file = 'sampled_at_group_' + str(i+1) + '.csv'

    born_near_these_acs = pob_por_ac[pob_por_ac.closest_center_POB.isin(group)].eid.tolist()
    _write_eid_pairs(born_near_file, born_near_these_acs)

    sampled_at_these_acs = pob_por_ac[pob_por_ac.center.isin(group)].eid.tolist()
    _write_eid_pairs(sampled_at_file, sampled_at_these_acs)

group:  1
group:  2
group:  3
group:  4
group:  5
group:  6
group:  7
group:  8
group:  9
group:  10


In [None]:
# use these files in GWAS - covariates = age, sex, chip, batch
# scp *_group_* igw9@cbsulogin.tc.cornell.edu:/bscb/bscb09/500k_ukb/ian/gwas_files/.
# covar = covar_sex_pcs_batch_chip.txt
# pheno = pheno_eduYears.csv
# dir1a /bscb/bscb09/500k_ukb/ian/gwas_imputed/eduyears_bornAt_1/
# dir1b /bscb/bscb09/500k_ukb/ian/gwas_imputed/eduyears_sampledAt_1/

In [26]:
# compare two lists

def f2list(file):
    """Return the first whitespace-separated field of each line in a text file.

    Parameters
    ----------
    file : str
        Path of the file to read.

    Returns
    -------
    list of str
        One entry per non-blank line, in file order. Blank or
        whitespace-only lines are skipped rather than raising IndexError.
    """
    # `with` guarantees the handle is closed once the lines have been read.
    with open(file, 'r') as handle:
        split_lines = (line.split() for line in handle)
        return [fields[0] for fields in split_lines if fields]

# For groups 1..10, read back the two id files and print one comma-separated
# row per group: group number, ids only in the born-near file, ids in both
# files, ids only in the sampled-at file.
for group in range(1, 11):
    born_ids = set(f2list('born_near_group_' + str(group) + '.csv'))
    sampled_ids = set(f2list('sampled_at_group_' + str(group) + '.csv'))

    only_born = born_ids.difference(sampled_ids)
    in_both = born_ids.intersection(sampled_ids)
    only_sampled = sampled_ids.difference(born_ids)

    row = (group, len(only_born), len(in_both), len(only_sampled))
    print(','.join(map(str, row)))

1,71824,123512,29357
2,51387,143864,50543
3,39547,146388,66958
4,54030,151937,45349
5,45370,149776,57701
6,43494,160200,61506
7,41205,145008,47930
8,36524,162166,54474
9,52181,152206,44912
10,38830,139565,55381
