In [104]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
from scipy.stats import permutation_test, false_discovery_control, spearmanr
import pybedtools
import gene_expression
import process as pfn

from warnings import filterwarnings
filterwarnings("ignore", category=pd.errors.DtypeWarning)
filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
%load_ext autoreload
#set up paths
#=========================
workdir = '/home/AD/rkgadde/L1IP'
annotation = f'{workdir}/hg38_data/annotations/gencode.v46.basic.annotation.sorted.genes.gff3'
celltypes = f'{workdir}/celltypes.txt'

genedir = f'{workdir}/gene_data'
figdir = f'{workdir}/results/CZI/plots'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load in data

In [2]:
#load in mes
comb_df = pd.read_csv('/cndd3/dburrows/DATA/me_polymorphisms/analysis/comb_polymorphic_me.csv', sep='\t', index_col=0)
samples = pd.read_csv(f'{workdir}/samples.txt', sep='\t', header=None)

In [4]:
#process mes into joint bed file
#process l1
me_type = 'L1'
abs_file = f'{workdir}/mC_data/CZI/type/vars/all_{me_type}_abs.tsv'
ins_file = f'{workdir}/mC_data/CZI/type/vars/all_{me_type}_ins.tsv'
abs_df = pd.read_csv(abs_file, sep='\t', usecols=[0,1,2,3,4,5,6,7,8], names=['chrom','start','end','id', 'length', 'strand', 'class', 'het', 'hom'])[1:]
abs_df['me_type'] = 'absence'
ins_df = pd.read_csv(ins_file, sep='\t', usecols=[0,1,2,3,4,5,6,7,8],  names=['chrom','start','end','id', 'length', 'strand', 'class', 'het', 'hom'])[1:]
ins_df['me_type'] = 'insertion'
l1_df = pd.concat([abs_df, ins_df])
l1_df['class'] = 'l1'

#process alu
me_type = 'Alu'
abs_file = f'{workdir}/mC_data/CZI/type/vars/all_{me_type}_abs.tsv'
ins_file = f'{workdir}/mC_data/CZI/type/vars/all_{me_type}_ins.tsv'
abs_df = pd.read_csv(abs_file, sep='\t', usecols=[0,1,2,3,4,5,6,7,8], names=['chrom','start','end','id', 'length', 'strand', 'class', 'het', 'hom'])[1:]
abs_df['me_type'] = 'absence'
ins_df = pd.read_csv(ins_file, sep='\t', usecols=[0,1,2,3,4,5,6, 7,8],  names=['chrom','start','end','id', 'length', 'strand', 'class', 'het', 'hom'])[1:]
ins_df['me_type'] = 'insertion'
alu_df = pd.concat([abs_df, ins_df])
alu_df['class'] = 'alu'

#add labels
comb_df = pd.concat([l1_df, alu_df])
comb_df['start'] = comb_df['start'].astype(int)
comb_df['end'] = comb_df['end'].astype(int)
comb_df['length'] = comb_df['end'] - comb_df['start']
comb_df['het'] = comb_df['het'].fillna('NaN').astype(str)
comb_df['hom'] = comb_df['hom'].fillna('NaN').astype(str)
comb_df.reset_index(drop=True, inplace=True)
het_bool = np.asarray([True if 'NaN' not in i else False for i in comb_df['het']])
hom_bool = np.asarray([True if 'NaN' not in i else False for i in comb_df['hom']])
het_count = np.asarray([len(i.split(',')) for i in comb_df['het']])
hom_count = np.asarray([len(i.split(',')) for i in comb_df['hom']])
ratio = het_count/hom_count
#set to NaN and inf
ratio[(het_bool == False) & (hom_bool==False)] = np.nan
ratio[(het_bool == True) & (hom_bool==False)] = np.inf
ratio[(het_bool == False) & (hom_bool==True)] = -1*np.inf
comb_df['het_over_hom'] = ratio

# label as majority het or hom, if >2x
genotype = np.empty(len(comb_df)).astype(str)
genotype[ratio >= 2] = 'het'
genotype[ratio <= 0.5] = 'hom'
genotype[(ratio <= 2) & (ratio >= 0.5)] = 'mixed'
genotype[np.isnan(ratio)] = 'NaN'
comb_df['genotype'] = genotype
#label as truncated or full length
trunc = np.empty(len(comb_df), dtype=object)
l1_mask = comb_df['class'] == 'l1'
trunc[l1_mask & (comb_df['length'] > 5500)] = 'full_length'
trunc[l1_mask & (comb_df['length'] <= 5500)] = 'truncated'
alu_mask = comb_df['class'] == 'alu'
trunc[alu_mask & (comb_df['length'] > 280)] = 'full_length'
trunc[alu_mask & (comb_df['length'] <= 280)] = 'truncated'
comb_df['insertion_category'] = trunc

comb_df.to_csv('/cndd3/dburrows/DATA/me_polymorphisms/analysis/comb_polymorphic_me.csv', sep='\t', header=True)

In [None]:
%%bash
#Get bed file with gene names
#==================================
cd /cndd3/dburrows/DATA/annotations/gencode/
#Split up 4th column into gene id and type
# awk 'BEGIN{OFS="\t"} {split($4, a, "_"); $4 = a[1]; $5 = a[2]; print}' gencode.v44.annotation.hg38.genelabels.bed > tmp_split.bed
# mv tmp_split.bed gencode.v44.annotation.hg38.genelabels.bed

#remove duplicated gene loci (exons from isoforms) that have the same chromosome, start+end position gene_label, and strand
# awk '!seen[$1,$2,$3,$5,$6]++' gencode.v44.annotation.hg38.genelabels.bed > tmp_seen.bed
# mv tmp_seen.bed gencode.v44.annotation.hg38.genelabels.unique.bed
awk -F'\t' '{
    # Extract necessary fields from the 9th column (attributes)
    match($9, /gene_id=([^;]+)/, gene_id);
    match($9, /gene_name=([^;]+)/, gene_name);
    match($9, /gene_type=([^;]+)/, gene_type);

    # Rearrange and print the columns
    print $1, $4, $5, $2, $3, $7, gene_id[1], gene_name[1], gene_type[1];
}' OFS="\t" gencode.v46.basic.annotation.sorted.genes.gff3 > gencode.v46.basic.annotation.sorted.genes.bed

In [3]:
# find intersection of mes with genes
#bedconvert and load annotations
me_bt = pybedtools.BedTool.from_dataframe(comb_df)
gene_df = pd.read_csv('/cndd3/dburrows/DATA/annotations/gencode/gencode.v46.basic.annotation.sorted.genes.bed', sep='\t',header=None)
gene_bt = pybedtools.BedTool.from_dataframe(gene_df)

#compute intersection
me_gene = gene_bt.intersect(me_bt, f=0.01, wo=True) 
me_gene = me_gene.to_dataframe(disable_auto_names=True, header=None)
me_gene.rename(columns={0: 'chr', 1: 'gene_start', 2: 'gene_end', 5: 'gene_strand',
                        6: 'gene_id', 7: 'gene_name', 8: 'gene_type', 10: 'me_start',
                       11: 'me_end', 12: 'me_id', 14: 'me_strand', 15: 'me_class', 16: 'het',
                       17: 'hom', 18: 'me_type', 19: 'het_hom_ratio', 20:'genotype', 
                       21: 'insertion_category', 22: 'length'}, inplace=True)
me_gene.drop(columns=np.asarray(me_gene.columns[(np.asarray([type(i) for i in me_gene.columns]) == int)]), inplace=True)

In [None]:
# find intersection with genomic regions

#compute intersection
l1_cgi = l1_bt.intersect(cgi_bt, f=0.3, wo=True) 
alu_cgi = alu_bt.intersect(cgi_bt, f=0.3, wo=True) 
alu_all = alu_bt.intersect(all_bt, f=0.3, wo=True) 
l1_all = all_bt.intersect(l1_bt, f=0.3, wo=True) 

l1_cgi_df = l1_cgi.to_dataframe(disable_auto_names=True, header=None)
l1_cgi_df.rename(columns={16: 'region', 9: 'me_type', 11: 'genotype', 12: 'insertion_category', 7: 'het', 8: 'hom', 4: 'length'}, inplace=True)
alu_cgi_df = alu_cgi.to_dataframe(disable_auto_names=True, header=None)
alu_cgi_df.rename(columns={16: 'region', 9: 'me_type', 11: 'genotype', 12: 'insertion_category', 7: 'het', 8: 'hom', 4: 'length'}, inplace=True)
alu_all_df = alu_all.to_dataframe(disable_auto_names=True, header=None)
alu_all_df.rename(columns={16: 'region', 9: 'me_type', 11: 'genotype', 12: 'insertion_category', 7: 'het', 8: 'hom', 4: 'length'}, inplace=True)
l1_all_df = l1_all.to_dataframe(disable_auto_names=True, header=None)
l1_all_df.rename(columns={3: 'region', 15: 'me_type', 17: 'genotype', 18: 'insertion_category', 13: 'het', 14: 'hom', 10: 'length'}, inplace=True)

# eQTL analysis

In [308]:
from tqdm import tqdm
import time
#run for MEs directly overlapping genes
#=====================================

#load in + filter cpm data
me_gene['gene_id'] = np.asarray([i.split('.')[0] for i in me_gene['gene_id']])
cpm_path = '/cndd2/jchien/project/CZI_human/pseudobulk_rna/logcpm/*_combinesample_log2cpm+1.tsv.gz'
pattern = '_combinesample_log2cpm+1.tsv.gz'
cpm = pfn.combine_genes_bycell_filter(rna = '/cndd2/jchien/project/CZI_human/pseudobulk_rna/logcpm/*_combinesample_log2cpm+1.tsv.gz'
                                      , pattern = '_combinesample_log2cpm+1.tsv.gz'
                                      , mini = 0
                                      , n_samp = 0)

#extract sample and gene values
samples = cpm['sample'].unique()
cpm_unq = cpm['gene_id'].unique()

out_df = pd.DataFrame()
it=range(len(me_gene))
#iterate over each gene
#============================
for i in tqdm(it, leave=True):
    time.sleep(0.1)
    curr = pd.DataFrame(me_gene.iloc[i]).T
    cpm_gene = cpm[cpm['gene_id'] == curr['gene_id'].values[0]]
    #skip genes that are not included in the filtered list
    if len(cpm_gene) > 0:
        assert len(cpm_gene) == len(cpm_gene['celltype'].unique())*len(samples), 'some samples missing' #check that the correct n samples are there
    
        #define design matrix for model
        #============================
        cpm_gene.set_index(['celltype', 'sample'], inplace=True)
        cpm_gene.drop(columns='gene_id', inplace=True)
        cpm_gene['age'] = [i[0] for i in cpm_gene.index.get_level_values('sample')]
        cpm_gene['sex'] = [i[1] for i in cpm_gene.index.get_level_values('sample')]
        cpm_gene['age_encoded'] = cpm_gene['age'].map({'Y': 0, 'A': 1})
        cpm_gene['sex_encoded'] = cpm_gene['sex'].map({'M': 0, 'F': 1})
        
        #define me_type values, 0,1,2
        if curr['me_type'].values == 'insertion': 
            hom_val = 2
            wt_val = 0
        if curr['me_type'].values == 'absence':
            hom_val = 0
            wt_val = 2
        vec = np.full(len(cpm_gene), np.nan)
        vec[cpm_gene.index.get_level_values('sample').str.contains(curr['het'].values[0].replace(',','|'))] = 1
        vec[cpm_gene.index.get_level_values('sample').str.contains(curr['hom'].values[0].replace(',','|'))] = hom_val
        wt = ','.join(samples[np.in1d(samples, np.append(curr['het'].values[0].split(','), 
                                                        curr['hom'].values[0].split(','))) == False]).replace(',','|')
        if len(wt) > 0: vec[cpm_gene.index.get_level_values('sample').str.contains(wt)] = wt_val
        cpm_gene['me_type'] = vec
    
        #run model
        #==========
        out = cpm_gene.groupby('celltype').apply(pfn.run_regression).reset_index()
        out['contrast'] = f"{curr['me_id'].values[0]}_{curr['gene_id'].values[0]}"
        out_df = pd.concat([out_df, out])

#get FDR corrected p values
group = out_df[out_df['level_1']=='me_type']
nanbool = np.isnan(group['p_values'])
group = group[~nanbool]
pv = group['p_values']
reject, pvals_corrected, _, _ = multipletests(pv, alpha=0.05, method='fdr_bh')
results = pd.DataFrame({
    'coefficients': group['coefficients'],
    'p_values': group['p_values'],
    'contrast' : group['contrast'],
    'padj': pvals_corrected
})
#get FDR corrected p values
#sig_df = sig_df.groupby('celltype').apply(fdr).reset_index()
group = out_df[out_df['level_1']=='me_type']
nanbool = np.isnan(group['p_values'])
group = group[~nanbool]
pv = group['p_values']
reject, pvals_corrected, _, _ = multipletests(pv, alpha=0.05, method='fdr_bh')
group['padj'] = pvals_corrected
sig_df = group[group['padj'] < 0.01].reset_index(drop=True)


100%|████████████████████████████████████████████████████████████████████████████████████████████████| 127/127 [02:03<00:00,  1.03it/s]
  sig_df = sig_df.groupby('celltype').apply(fdr).reset_index()


In [353]:
sig_df

Unnamed: 0,celltype,level_1,coefficients,p_values,contrast,padj
0,CGE_ADARB2_ADAM33,me_type,0.856286,6.541094e-06,CZI_abs_64_ENSG00000143452,0.000352
1,L6IT_THEMIS_LINC00343,me_type,0.872579,2.418175e-04,CZI_abs_64_ENSG00000143452,0.008542
2,L6IT_THEMIS_LINC00343,me_type,-0.114944,2.653709e-04,CZI_abs_66_ENSG00000226949,0.009238
3,L3-5IT_RORB_PLCH1,me_type,-0.835541,2.408572e-04,CZI_abs_193_ENSG00000203258,0.008542
4,CGE_LAMP5,me_type,2.064408,6.466298e-05,CZI_abs_302_ENSG00000110801,0.002589
...,...,...,...,...,...,...
65,L6IT_THEMIS_LINC00343,me_type,2.073876,4.136368e-07,CZI_abs_1301_ENSG00000135185,0.000050
66,L6b_TLE4_NXPH4,me_type,2.059364,9.978457e-07,CZI_abs_1301_ENSG00000135185,0.000085
67,MGE_PVALB,me_type,2.252902,4.117493e-07,CZI_abs_1301_ENSG00000135185,0.000050
68,MGE_SST,me_type,2.145040,1.223470e-08,CZI_abs_1301_ENSG00000135185,0.000005


In [365]:
cpm_gene

Unnamed: 0_level_0,Unnamed: 1_level_0,cpm,age,sex,age_encoded,sex_encoded,me_type
celltype,sample,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L4-5IT_RORB_TSHZ2,AF1,0.383286,A,F,1,1,0.0
L4-5IT_RORB_TSHZ2,AF2,1.516757,A,F,1,1,0.0
L4-5IT_RORB_TSHZ2,AF3,0.376111,A,F,1,1,0.0
L4-5IT_RORB_TSHZ2,AM1,0.783174,A,M,1,0,0.0
L4-5IT_RORB_TSHZ2,YF1,0.273898,Y,F,0,1,0.0
...,...,...,...,...,...,...,...
L56NP_TLE4_TSHZ2,YF1,0.378245,Y,F,0,1,0.0
L56NP_TLE4_TSHZ2,YF2,0.427399,Y,F,0,1,0.0
L56NP_TLE4_TSHZ2,YM1,0.481128,Y,M,0,0,0.0
L56NP_TLE4_TSHZ2,YM2,1.584667,Y,M,0,0,0.0


In [364]:
curr_sig = sig_df.iloc[0]
curr_gene_df = me_gene[me_gene['contrast'] == curr_sig['contrast']]
curr_cpm_df = cpm[cpm['gene_id'] == curr_gene_df['gene_id'].values[0]]
curr_cpm_df

Unnamed: 0,sample,gene_id,cpm,celltype
2436,AF1,ENSG00000143452,0.556982,L4-5IT_RORB_TSHZ2
48651,AF2,ENSG00000143452,1.850423,L4-5IT_RORB_TSHZ2
94866,AF3,ENSG00000143452,0.565452,L4-5IT_RORB_TSHZ2
141081,AM1,ENSG00000143452,1.547170,L4-5IT_RORB_TSHZ2
187296,YF1,ENSG00000143452,2.283271,L4-5IT_RORB_TSHZ2
...,...,...,...,...
7746859,YF1,ENSG00000143452,1.980990,L56NP_TLE4_TSHZ2
7790066,YF2,ENSG00000143452,0.000000,L56NP_TLE4_TSHZ2
7833273,YM1,ENSG00000143452,3.091000,L56NP_TLE4_TSHZ2
7876480,YM2,ENSG00000143452,0.906683,L56NP_TLE4_TSHZ2


In [362]:
me_gene[me_gene['contrast'] == curr_sig['contrast']]

Unnamed: 0,chr,gene_start,gene_end,gene_strand,gene_id,gene_name,gene_type,me_start,me_end,me_id,me_strand,me_class,het,hom,me_type,het_hom_ratio,genotype,insertion_category,length,contrast
7,chr1,150698060,150720895,-,ENSG00000143452,HORMAD1,protein_coding,150718974,150719293,CZI_abs_64,+,alu,.,.,absence,.,.,full_length,319,CZI_abs_64_ENSG00000143452


In [358]:
me_gene['contrast'] = [me_gene.iloc[i]['me_id'] + '_' + me_gene.iloc[i]['gene_id'] for i in range(len(me_gene))]


In [None]:
def celltype_designmatrix():
    

In [None]:
curr = pd.DataFrame(me_gene.iloc[i]).T

In [None]:
cpm_gene = cpm[cpm['gene_id'] == curr['gene_id'].values[0]]
#skip genes that are not included in the filtered list
if len(cpm_gene) > 0:
    assert len(cpm_gene) == len(cpm_gene['celltype'].unique())*len(samples), 'some samples missing' #check that the correct n samples are there

    #define design matrix for model
    #============================
    cpm_gene.set_index(['celltype', 'sample'], inplace=True)
    cpm_gene.drop(columns='gene_id', inplace=True)
    cpm_gene['age'] = [i[0] for i in cpm_gene.index.get_level_values('sample')]
    cpm_gene['sex'] = [i[1] for i in cpm_gene.index.get_level_values('sample')]
    cpm_gene['age_encoded'] = cpm_gene['age'].map({'Y': 0, 'A': 1})
    cpm_gene['sex_encoded'] = cpm_gene['sex'].map({'M': 0, 'F': 1})
    
    #define me_type values, 0,1,2
    if curr['me_type'].values == 'insertion': 
        hom_val = 2
        wt_val = 0
    if curr['me_type'].values == 'absence':
        hom_val = 0
        wt_val = 2
    vec = np.full(len(cpm_gene), np.nan)
    vec[cpm_gene.index.get_level_values('sample').str.contains(curr['het'].values[0].replace(',','|'))] = 1
    vec[cpm_gene.index.get_level_values('sample').str.contains(curr['hom'].values[0].replace(',','|'))] = hom_val
    wt = ','.join(samples[np.in1d(samples, np.append(curr['het'].values[0].split(','), 
                                                    curr['hom'].values[0].split(','))) == False]).replace(',','|')
    if len(wt) > 0: vec[cpm_gene.index.get_level_values('sample').str.contains(wt)] = wt_val
    cpm_gene['me_type'] = vec

    #run model
    #==========
    out = cpm_gene.groupby('celltype').apply(pfn.run_regression).reset_index()
    out['contrast'] = f"{curr['me_id'].values[0]}_{curr['gene_id'].values[0]}"
    out_df = pd.concat([out_df, out])

In [352]:
#by celltype, show each significant gene

#group genes together for each celltype (big up/down), show joint plot

Unnamed: 0_level_0,Unnamed: 1_level_0,cpm,age,sex,age_encoded,sex_encoded,me_type
celltype,sample,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L4-5IT_RORB_TSHZ2,AF1,0.383286,A,F,1,1,0.0
L4-5IT_RORB_TSHZ2,AF2,1.516757,A,F,1,1,0.0
L4-5IT_RORB_TSHZ2,AF3,0.376111,A,F,1,1,0.0
L4-5IT_RORB_TSHZ2,AM1,0.783174,A,M,1,0,0.0
L4-5IT_RORB_TSHZ2,YF1,0.273898,Y,F,0,1,0.0
...,...,...,...,...,...,...,...
L56NP_TLE4_TSHZ2,YF1,0.378245,Y,F,0,1,0.0
L56NP_TLE4_TSHZ2,YF2,0.427399,Y,F,0,1,0.0
L56NP_TLE4_TSHZ2,YM1,0.481128,Y,M,0,0,0.0
L56NP_TLE4_TSHZ2,YM2,1.584667,Y,M,0,0,0.0


In [None]:
#by gene, show all cells, identify celltype differences

In [None]:
#do stats of insertion type, coefficients by insertion location -> i.e. where in the gene; by full length; by l1 vs alu


In [347]:
sig_df['contrast'][0]

'CZI_abs_64_ENSG00000143452'

In [348]:
out_df

Unnamed: 0,celltype,level_1,coefficients,p_values,contrast
0,CGE_ADARB2_ADAM33,const,0.177444,0.772504,CZI_abs_7_ENSG00000230703
1,CGE_ADARB2_ADAM33,age_encoded,0.172562,0.658607,CZI_abs_7_ENSG00000230703
2,CGE_ADARB2_ADAM33,sex_encoded,0.172562,0.658607,CZI_abs_7_ENSG00000230703
3,CGE_ADARB2_ADAM33,me_type,-0.099603,0.869062,CZI_abs_7_ENSG00000230703
4,CGE_LAMP5,const,0.413008,0.034756,CZI_abs_7_ENSG00000230703
...,...,...,...,...,...
75,MGE_SST,me_type,0.000000,,CZI_abs_1499_ENSG00000230676
76,MGE_SST_CLMP,const,0.783821,0.011151,CZI_abs_1499_ENSG00000230676
77,MGE_SST_CLMP,age_encoded,-0.247380,0.429813,CZI_abs_1499_ENSG00000230676
78,MGE_SST_CLMP,sex_encoded,-0.493100,0.142570,CZI_abs_1499_ENSG00000230676


In [349]:
me_gene

Unnamed: 0,chr,gene_start,gene_end,gene_strand,gene_id,gene_name,gene_type,me_start,me_end,me_id,me_strand,me_class,het,hom,me_type,het_hom_ratio,genotype,insertion_category,length
0,chr1,24066774,24083565,+,ENSG00000230703,MYOM3-AS1,lncRNA,24076520,24076812,CZI_abs_7,-,alu,"AF1,AF2,AF3,AM1,YF1,YM1,YM2,YM3","AM3,YF2",absence,4.0,het,full_length,292
1,chr1,55915603,55944980,-,ENSG00000230250,LINC01753,lncRNA,55935619,55935930,CZI_abs_19,+,alu,.,AF1,absence,-inf,hom,full_length,311
2,chr1,56154545,56477687,-,ENSG00000260971,ENSG00000260971,lncRNA,56365451,56369289,CZI_abs_1543,+,l1,AF3,"AM1,AM2,AM3,YF2,YM3",absence,0.2,hom,truncated,3838
3,chr1,56173433,56524495,-,ENSG00000284686,ENSG00000284686,protein_coding,56365451,56369289,CZI_abs_1543,+,l1,AF3,"AM1,AM2,AM3,YF2,YM3",absence,0.2,hom,truncated,3838
4,chr1,57605847,57606206,-,ENSG00000236888,RPS20P5,processed_pseudogene,57606184,57606197,CZI_ins_818,-,alu,"AM3,YF1","AF1,AF2,AF3,AM1,AM2,YM1,YM3",insertion,0.2857142857142857,hom,truncated,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,chr8,143636019,143656418,+,ENSG00000183309,ZNF623,protein_coding,143641445,143641764,CZI_abs_1430,-,alu,.,"AF2,AM3,YM1,YM2",absence,-inf,hom,full_length,319
123,chr8,144772224,144787348,-,ENSG00000196378,ZNF34,protein_coding,144776325,144776913,CZI_abs_1433,+,alu,"AF1,AF2,AF3,AM1,AM2,AM3,YF2,YM2,YM3",YM1,absence,9.0,het,full_length,588
124,chr9,35791003,35809732,+,ENSG00000159899,NPR2,protein_coding,35803110,35803402,CZI_abs_1464,-,alu,"AF1,AM1,AM2,YF1,YF2,YM2,YM3","AF2,AF3,AM3",absence,2.333333333333333,het,full_length,292
125,chr9,73042745,73056528,+,ENSG00000228024,CYP1D1P,unprocessed_pseudogene,73054613,73054880,CZI_abs_1473,+,alu,.,YM1,absence,-inf,hom,truncated,267


In [350]:
cpm

Unnamed: 0,sample,gene_id,cpm,celltype
0,AF1,ENSG00000223972,0.000000,L4-5IT_RORB_TSHZ2
1,AF1,ENSG00000227232,0.729183,L4-5IT_RORB_TSHZ2
2,AF1,ENSG00000243485,0.069128,L4-5IT_RORB_TSHZ2
3,AF1,ENSG00000240361,0.147933,L4-5IT_RORB_TSHZ2
4,AF1,ENSG00000186092,0.055565,L4-5IT_RORB_TSHZ2
...,...,...,...,...
7960576,YM3,ENSG00000198695,8.790963,L56NP_TLE4_TSHZ2
7960577,YM3,ENSG00000210194,2.029100,L56NP_TLE4_TSHZ2
7960578,YM3,ENSG00000198727,10.021181,L56NP_TLE4_TSHZ2
7960579,YM3,ENSG00000210195,0.535441,L56NP_TLE4_TSHZ2


In [None]:
#subset genes -> define distance, output column with list of genes to test -> just get csv of each gene -> me interaction


#define model, based on het/hom and absence/insertion info; get full sample list

In [None]:
#visualise significant effects, by celltype, etc.

# metheQTL analysis