# Data Setup

This notebook contains code to format pairwise comparisons into dataframes for analysis. 

## Notebook setup

In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

In [2]:
BASE_PATH = "/".join(os.getcwd().split("/")) # base directory level


#Wynton HPC
BIN_PATH = os.path.join(BASE_PATH, "bin")  # where my scripts live
DATA_PATH = os.path.join(BASE_PATH, "data")  # where I dump new data 
RESULTS_PATH = os.path.join(BASE_PATH, "results")  # where I analyze results

SRC_PATH = os.path.join(BASE_PATH, "src")  # where any packages needed to run analyses live. I haven't started structuring things this way yet. 

# COMP_PATH = os.path.join(DATA_PATH,"pairwise/hsmrca")
# COMP_PATH = os.path.join(DATA_PATH,"pairwise/reference")


# # # Local
# RESULTS_PATH = os.path.join(BASE_PATH, "../../../downloads")
# DATA_PATH = os.path.join(BASE_PATH, "../../../downloads")  # where I dump new data
# COMP_PATH = os.path.join(DATA_PATH,"1KGvsHSMRCA")
# FIG_PATH = '/Users/egilbertson/Library/CloudStorage/Box-Box/UCSF/CapraLab/3DGenome/figures/manuscript'

# Functions

In [3]:
def write_comp_dfs(base, g, m, s):
    g.to_csv('/wynton/group/capra/projects/modern_human_3Dgenome/results/comp_tables/%s_genomewide_averages.csv' % base)
    m.to_csv('/wynton/group/capra/projects/modern_human_3Dgenome/results/comp_tables/%s_window_mse.csv' % base)
    s.to_csv('/wynton/group/capra/projects/modern_human_3Dgenome/results/comp_tables/%s_window_spearman.csv' % base)
    return

In [22]:
def comp_data_setup(indivs, base):
    comp_dict = {}
    print('first for')
    for ind in indivs['1KG']:
        filename = '3dcomp_%s_vs_%s.txt' % (ind,base)
        if os.path.exists('%s/%s' % (COMP_PATH,filename)):
            try:
                df = pd.read_table('%s/%s' % (COMP_PATH,filename))
                df = df[df.chr != 'chrX']
                comp_dict[ind] = df
            except:
                continue     
    print('done')

    df = comp_dict[list(comp_dict.keys())[0]].set_index(['chr','windowStartPos'])
    windows = df.index
    ### Window by window
    mse = pd.DataFrame(index = windows, columns = list(indivs['1KG']))
    spear = pd.DataFrame(index = windows, columns = list(indivs['1KG']))


    ### Genome wide averages  
    base_df = pd.DataFrame()
    base_df['1KG'] = indivs
    base_df['ancestor'] = base
    base_df = base_df.set_index('1KG')      
    base_df['genome_avg_mse']=''
    base_df['genome_avg_spearman']=''
    print('second for')
    for ind in comp_dict.keys():
        df = comp_dict[ind]

        avg_mse =float(np.mean(df['mse']))
        avg_spearman = float(np.mean(df['spearman']))

        base_df.genome_avg_mse.loc[ind]=avg_mse
        base_df.genome_avg_spearman.loc[ind]=avg_spearman
        
        
        df = comp_dict[ind]
        df = df.set_index(['chr','windowStartPos'])
        mse[ind]=df['mse']
        spear[ind]=df['spearman']
    print('done')  
    

    
    base_df['temp'] = base_df.index
    new = base_df.temp.str.split("_", expand = True)
    base_df['super_pop'] = new[0]
    base_df['sub_pop'] = new[1]
    base_df['sex'] = new[2]
    base_df['id'] = new[3]
    
    genome_wide = base_df.drop(columns=['temp'])

    mse.columns = mse.columns.str.split('_', expand=True)
    spear.columns = spear.columns.str.split('_', expand=True)
    
    return genome_wide, mse, spear

## Formatting pairwise comparisons of 3D similarity for later use

In [68]:
# list of 1KG unrelated individuals
indivs = pd.read_table(os.path.join(DATA_PATH, "reference/1KG_unrelated_indivs.txt"), names=['1KG'], delimiter = ',', header=0)
windows_to_keep = pd.read_csv('%s/intermediates/windows_to_keep.csv' % DATA_PATH, index_col=0)

In [64]:
indivs

Unnamed: 0,1KG
0,AFR_ACB_female_HG01880
1,AFR_ACB_female_HG01883
2,AFR_ACB_female_HG01886
3,AFR_ACB_female_HG01889
4,AFR_ACB_female_HG01894
...,...
2452,SAS_STU_male_HG04039
2453,SAS_STU_male_HG04100
2454,SAS_STU_male_HG04107
2455,SAS_STU_male_HG04210


In [65]:
COMP_PATH = os.path.join(DATA_PATH,"pairwise/hsmrca")
anc_g, anc_m, anc_s = comp_data_setup(indivs, 'hsmrca_ancestral')
anc_g = anc_g[anc_g.genome_avg_spearman!='']
#write_comp_dfs('anc', anc_g, anc_m, anc_s)

first for
done
second for
done


In [70]:
windows_to_keep = anc_s.index.to_frame(index=False)
windows_to_keep.to_csv('%s/intermediates/windows_to_keep.csv' % DATA_PATH)

In [36]:
anc_s

Unnamed: 0_level_0,Unnamed: 1_level_0,AFR,AFR,AFR,AFR,AFR,AFR,AFR,AFR,AFR,AFR,...,SAS,SAS,SAS,SAS,SAS,SAS,SAS,SAS,SAS,SAS
Unnamed: 0_level_1,Unnamed: 1_level_1,ACB,ACB,ACB,ACB,ACB,ACB,ACB,ACB,ACB,ACB,...,STU,STU,STU,STU,STU,STU,STU,STU,STU,STU
Unnamed: 0_level_2,Unnamed: 1_level_2,female,female,female,female,female,female,female,female,female,female,...,male,male,male,male,male,male,male,male,male,male
Unnamed: 0_level_3,Unnamed: 1_level_3,HG01880,HG01883,HG01886,HG01889,HG01894,HG01896,HG01915,HG01956,HG01958,HG01985,...,HG03998,HG03999,HG04003,HG04006,HG04033,HG04039,HG04100,HG04107,HG04210,HG04229
chr,windowStartPos,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4
chr1,1048576,0.999018,0.998449,0.998391,0.998382,0.996056,0.997855,0.998783,0.999342,0.997307,0.997857,...,0.999613,0.999287,0.999081,0.999282,0.999159,0.999421,0.999303,0.999004,0.999199,0.998904
chr1,1572864,0.998555,0.998343,0.998091,0.998041,0.995728,0.997427,0.998162,0.997904,0.993232,0.997083,...,0.998952,0.999297,0.998933,0.999504,0.998792,0.999160,0.998620,0.998465,0.997599,0.998689
chr1,3145728,0.998717,0.999317,0.997376,0.997542,0.997959,0.997339,0.999479,0.997510,0.996849,0.997706,...,0.996859,0.995463,0.999480,0.999472,0.998335,0.999487,0.999605,0.998453,0.999465,0.994424
chr1,3670016,0.994881,0.995122,0.998099,0.997271,0.995126,0.998465,0.995497,0.995294,0.995938,0.993764,...,0.994139,0.993397,0.995067,0.995679,0.995572,0.995714,0.993884,0.998446,0.994561,0.994606
chr1,4194304,0.991319,0.992503,0.995498,0.995269,0.988665,0.993164,0.992012,0.989378,0.992963,0.985396,...,0.990044,0.988328,0.986996,0.994682,0.991710,0.991616,0.989252,0.996829,0.988783,0.990123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chr22,46661632,0.992855,0.998108,0.997123,0.991604,0.995611,0.997993,0.996725,0.993035,0.997142,0.993774,...,0.996772,0.997333,0.996716,0.997778,0.998710,0.996440,0.994683,0.997592,0.993857,0.997107
chr22,47185920,0.972773,0.993165,0.996915,0.964609,0.996304,0.992668,0.997102,0.982467,0.994476,0.997318,...,0.992905,0.996176,0.995671,0.996420,0.996881,0.997080,0.980990,0.850338,0.993268,0.994717
chr22,47710208,0.960697,0.954503,0.934546,0.876688,0.971779,0.959750,0.953820,0.969154,0.961180,0.966885,...,0.962701,0.954041,0.963408,0.965371,0.956124,0.963311,0.959673,0.912289,0.965896,0.954486
chr22,48234496,0.816518,0.810031,0.758989,0.674237,0.804883,0.825608,0.820127,0.801460,0.831267,0.838245,...,0.833573,0.810047,0.842591,0.839120,0.813546,0.842737,0.812601,0.828690,0.866662,0.816188


In [None]:
COMP_PATH = os.path.join(DATA_PATH,"pairwise/reference")
ref_g, ref_m, ref_s = comp_data_setup(indivs, 'hg38_reference')
ref_g = ref_g[ref_g.genome_avg_spearman!='']
write_comp_dfs('ref', ref_g, ref_m, ref_s)

## Sequence comparisons

In [72]:
base = 'hsmrca_ancestral'
#wynton
COMP_PATH = os.path.join(DATA_PATH,"pairwise/hsmrca")

base_df = anc_g[['ancestor']]
comp_dict_seq = {}
print('first for')
for row in base_df.index:
    ind = str(base_df.loc[row].name)
    filename = 'SeqComps_%s_vs_%s.txt' % (ind,base)
    if os.path.exists('%s/%s' % (COMP_PATH,filename)):
        try:
            df = pd.read_table('%s/%s' % (COMP_PATH,filename))
            df = df[df.chrm != 'chrX']
            comp_dict_seq[ind] = df
        except:
            print('broken')
            continue     
print('done')

indivs = list(comp_dict_seq.keys())
### Window by window
seq_id = pd.DataFrame(index = windows_to_keep, columns = indivs)

### Genome wide averages        
base_df['seq_id']=''
print('second for')
for ind in comp_dict_seq.keys():
    df = comp_dict_seq[ind]
    df = df.set_index(['chrm','start_loc'])
    try:
        df = df.loc[windows_to_keep]
        avg_seq_id =float(np.mean(df['seqComp_raw'].astype('float')))

        base_df.seq_id.loc[ind]=avg_seq_id

        seq_id[ind]=df['seqComp_raw'].astype(float)
    except:
        print('missing data for individual: %s' % ind)
print('done')  



base_df['temp'] = base_df.index
new = base_df.temp.str.split("_", expand = True)
base_df['super_pop'] = new[0]
base_df['sub_pop'] = new[1]
base_df['sex'] = new[2]
base_df['id'] = new[3]

genome_wide = base_df.drop(columns=['temp'])
genome_wide = genome_wide[genome_wide.seq_id != '']
genome_wide.seq_id = genome_wide.seq_id.astype('float')

seq_id.columns = seq_id.columns.str.split('_', expand=True)

first for
done
second for
done


TypeError: Cannot infer number of levels from empty list