# Usage: python3 Linear.py <currTissue> <CHR>

In [None]:
import math
import numpy as np
import pandas as pd
import random
import statsmodels.api as sm
import statsmodels.formula.api as smf
import time 
import sys

In [None]:
# Positional command-line arguments, in the order given by the usage line at
# the top of the file: tissue name first, chromosome second.
currTissue = sys.argv[1]
CHR = sys.argv[2]
# NOTE(review): MINSAMPLE is not referenced anywhere else in the code visible
# here; LinearRegression hard-codes its own sample-count threshold.
MINSAMPLE = 3

In [None]:
# Z normalization
def ZNorm(vals):
    """
    Z-score vals (subtract the mean, divide by the population standard deviation).

    vals: a pandas DataFrame (each column is normalized on its own) or any
          1-D sequence of numbers (list, Series, array).

    Returns a pair (normalized, is_constant):
      * DataFrame input -> (DataFrame, False). A column with zero variance comes
        back as NaN (0/0). If every column has zero variance the result is
        (None, True).
      * 1-D input -> (list of z-scores, False), or (None, True) when the
        standard deviation is 0.
    """
    if isinstance(vals, pd.DataFrame):
        # Reduce column-wise with an explicit axis=0 rather than going through
        # np.mean/np.var: those call the DataFrame reductions with axis=None,
        # which pandas has not handled the same way across releases.
        m = vals.mean(axis=0)
        sd = vals.std(axis=0, ddof=0)
        # Untestable only when no column varies at all (NaN != 0, so a column
        # with undefined spread does not count as constant).
        if (sd == 0).all():
            return None, True
        return (vals - m) / sd, False

    m = np.mean(vals)
    sd = np.sqrt(np.var(vals))
    if sd == 0:
        return None, True
    return [(item - m) / sd for item in vals], False
# Linear Regression
def LinearRegression(X, Y):
    """
    Fit an ordinary-least-squares model of Y on X after z-scoring both.

    X: DataFrame of predictors. It must contain a column named "x1+x2";
       that column's coefficient is the one reported.
    Y: 1-D sequence of response values, one per row of X.

    Returns (fitted results object, slope, standard error, p-value) for the
    "x1+x2" term. Returns (None, None, None, None) when the model cannot be
    tested: X or Y has no variance, X has 3 or fewer rows, or the "x1+x2"
    column does not survive NaN filtering.
    """
    X, returnX = ZNorm(X)
    Y, returnY = ZNorm(Y)
    if returnX or returnY or len(X) <= 3:
        return None, None, None, None

    # Remove every predictor column that contains a NaN. A zero-variance
    # column comes out of ZNorm as all-NaN, so it is removed here.
    X = X.dropna(axis=1)

    # If the term of interest itself was removed (e.g. every sample carries
    # the same non-zero genotype) there is nothing to report; without this
    # guard the .loc lookups below raise KeyError.
    if "x1+x2" not in X.columns:
        return None, None, None, None

    X = sm.add_constant(X)
    # missing='drop' discards rows where Y (or X) is NaN before fitting
    res_ols = sm.OLS(Y, X, missing='drop').fit()
    slope = res_ols.params.loc["x1+x2"]
    err = res_ols.bse.loc["x1+x2"]
    pval = res_ols.pvalues.loc["x1+x2"]
    return res_ols, slope, err, pval

In [None]:
# Load the population PCs.
# read_csv runs with its default comma delimiter and the first line skipped,
# so each space-separated line is expected to land whole in column 0
# (assumes the lines contain no commas).
df = pd.read_csv(
    "/projects/ps-gymreklab/ydong/data/genotypePCA/GTEx_1KG_merged_650.pca.evec",
    header=None,
    skiprows=1,
)
# df = pd.read_csv("/gymreklab-tscc/ydong/data/genotypePCA/GTEx_1KG_merged_650.pca.evec", header = None,skiprows = 1)

# Column 1: the line split on single spaces with the empty tokens removed
df[1] = df[0].apply(lambda line: [tok for tok in line.split(" ") if tok != ""])
# Column 2: the first token of each line, used as the sample label
df[2] = df[1].apply(lambda toks: toks[0])

samples = df[2].tolist()
pcDF = pd.DataFrame({"sample": samples})

# Tokens 1..5 of each line become columns PC1..PC5
for i in range(5):
    name = "PC" + str(i + 1)
    pcDF[name] = df[1].apply(lambda toks: toks[i + 1])

# Index by sample label and convert the token strings to floats
pcDF = pcDF.set_index("sample").astype("float")


In [None]:
# Timestamp taken before any input is read
start = time.time()

# The chromosome we're interested in studying, as a string so it can be
# spliced into file paths and compared against the "probe.chr" column
CHR = str(CHR)
print("CHR: " + CHR)
# Padding added on either side of the probe.start/probe.stop interval when
# selecting STRs further down (presumably base pairs - TODO confirm)
DIST = 10000

# Input locations
PSI_Path = f"/projects/ps-gymreklab/ydong/data/allTis/{currTissue}/PSI/{CHR}_PSI.csv"
STR_Path = f"/projects/ps-gymreklab/ydong/data/STRs/{CHR}.csv"
annot_Path = "/projects/ps-gymreklab/ydong/data/gtexRNA/gencode_gene_annotations_GRCh38.csv"

# PSI_Path = "/gymreklab-tscc/ydong/data/allTis/" +currTissue+"/PSI/" + CHR + "_PSI.csv"
# STR_Path = "/gymreklab-tscc/ydong/data/STRs/" + CHR + ".csv"
# annot_Path = "/gymreklab-tscc/ydong/data/gtexRNA/gencode_gene_annotations_GRCh38.csv"

# PSI table: the first CSV column becomes the row index
PSI_df = pd.read_csv(PSI_Path, index_col=0)
# STR table: "start" is forced to int
STR_df = pd.read_table(STR_Path, dtype={"start": int})
# Annotation table, restricted to the rows of the current chromosome
annot_df = pd.read_csv(annot_Path, index_col="probe.id", dtype={"probe.chr": str})
annot_df = annot_df.loc[annot_df["probe.chr"] == CHR]

peer_df = pd.read_csv(
    f"/projects/ps-gymreklab/ydong/data/allTis/{currTissue}/" + "peerFactor.csv",
    index_col=0,
)
# peer_df = pd.read_csv("/gymreklab-tscc/ydong/data/allTis/"+currTissue + "/" + currTissue+"PEER.csv",index_col = 0)

peer_df = peer_df.drop(columns="V1")

# Find the overlapping GTEx individual for STR data and PSI data
strSamples = STR_df.columns[2:].tolist()
psiSamples = list(PSI_df.index)
# Relabel the PEER rows with the PSI row labels, position by position.
# NOTE(review): assumes peerFactor.csv lists samples in the same order as the
# PSI file - confirm against whatever produced the two files.
peer_df.index = psiSamples
overlapSamples = list(set(strSamples) & set(psiSamples))

# Restrict all three tables to the shared samples, in the same order
PSI_df = PSI_df.loc[overlapSamples]
peer_df = peer_df.loc[overlapSamples]
STR_df = STR_df[["chrom", "start", *overlapSamples]]



# Every column of PSI_df is tested in turn; its label is also the key used to
# look up probe.start / probe.stop in annot_df.
psiSamples = list(PSI_df.columns)

result_columns = ["chrom", "gene", "str.id", "str.start", "n.miss", "slope", "slope_p",
                  "error", "error_p", "pVal", "pVal_p", "start.dist", "stop.dist"]
# Result rows are gathered in a list and converted to a DataFrame once at the
# end. DataFrame.append no longer exists in pandas 2.x and re-copied the whole
# frame on every call.
result_rows = []

# Genotype strings that mean "no call for this sample"
missing_genotypes = ("None", "NA,NA")
# STR_df is laid out as "chrom", "start", then one column per sample
sample_cols = STR_df.columns[2:]
n_str_samples = len(STR_df.columns) - 2

for gene in psiSamples:

    print(gene)

    geneStart = annot_df.loc[gene, "probe.start"]
    geneStop = annot_df.loc[gene, "probe.stop"]
    # STRs whose start lies within DIST of the annotated interval
    in_window = (STR_df["start"] >= geneStart - DIST) & (STR_df["start"] <= geneStop + DIST)
    cis_df = STR_df[in_window]

    if cis_df.empty:
        continue

    # Several STRs can fall in the window of one target; test each separately
    for ind in cis_df.index:
        str_start = cis_df.loc[ind, "start"]
        test_str = "STR_" + str(str_start)
        genotypes = cis_df.loc[ind, sample_cols]

        # Keep only samples with a real "a,b" genotype string. The filter runs
        # before any parsing, so None / NaN / "None" / "NA,NA" cells are
        # skipped instead of crashing the split below.
        samples_to_keep = [
            sample for sample, call in genotypes.items()
            if isinstance(call, str) and call not in missing_genotypes
        ]
        if not samples_to_keep:
            continue

        # Predictor of interest: sum of the two comma-separated allele values
        allele_pairs = [genotypes[sample].split(",") for sample in samples_to_keep]
        str_sum = pd.Series(
            [int(pair[0]) + int(pair[1]) for pair in allele_pairs],
            index=samples_to_keep,
            name="x1+x2",
        )

        # filter out all 0 STRs
        if str_sum.sum() == 0:
            continue

        exonStartDist = str_start - geneStart
        exonStopDist = str_start - geneStop

        # Response and covariates for exactly the genotyped samples
        currPSI = PSI_df.loc[samples_to_keep, gene]
        currpcDF = pcDF.loc[samples_to_keep]
        currPeer = peer_df.loc[samples_to_keep]

        # Design matrix: STR sum + population PCs + PEER factors
        xDF = pd.concat([str_sum, currpcDF, currPeer], axis=1)

        # linear regression
        res_ols, slope, err, pval = LinearRegression(xDF, currPSI)

        # LinearRegression returns all-None when the model cannot be tested
        if res_ols is None:
            continue

        # Permutation control: same design matrix, shuffled response. The
        # shuffled values are wrapped in a Series so they are normalized the
        # same way as the observed ones; as a plain list, one NaN would make
        # the mean (and every z-score) NaN.
        currPSI_p = pd.Series(random.sample(list(currPSI), len(currPSI)), index=currPSI.index)
        res_ols_p, slope_p, err_p, pval_p = LinearRegression(xDF, currPSI_p)

        result_rows.append({
            "chrom": CHR, "gene": gene, "str.id": test_str, "str.start": str_start,
            "n.miss": n_str_samples - len(samples_to_keep),
            "slope": slope, "slope_p": slope_p,
            "error": err, "error_p": err_p,
            "pVal": pval, "pVal_p": pval_p,
            "start.dist": exonStartDist, "stop.dist": exonStopDist,
        })

# Passing columns= keeps the header (and column order) even when no row was produced
res_df = pd.DataFrame(result_rows, columns=result_columns)
res_df.to_csv("/projects/ps-gymreklab/ydong/data/allTis/" +currTissue+"/result/chr" + str(CHR) + "Res.csv")

end = time.time()

In [None]:
# Elapsed time in seconds between the two time.time() readings stored in
# `start` and `end`
print(end - start)