In [None]:
import os
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import numpy as np
from itertools import combinations
from scipy.stats import zscore
from IPython import display as disp

In [None]:
# Folder that receives the "<time course>_pairwise_corrs.csv" files written by the final cell.
outfolder = "/Users/fekeann/Documents/Lab/RNAseq/renormalized_data/correlations/newSynt/"
# makedirs also creates any missing parent folders, and exist_ok=True makes
# re-running this cell a no-op when the folder is already there.
os.makedirs(outfolder, exist_ok=True)

In [None]:
# Sample names kept for the "tissue" analysis.
#   * "ColdLeafControl" was chosen at random as the leaf control, since I believe
#     it's a more mature leaf than "YL".
#   * All/most samples were taken at 10am, which is why "YL10am" was picked.
tissue_simple = [
    "ClosedFlower",
    "HookedStolon",
    "ImmFruit",
    "MatureFruit",
    "OpenFlower",
    "Sprout",
    "StemControl",
    "SwollenStolon",
    "TuberS1",
    "TuberS2",
    "TuberS3",
    "TuberS4",
    "TuberS5",
    "YL10am",
    "ColdLeafControl",
    "RootControl",
]

# Sample names kept for the "stress" analysis: each treatment plus its control.
stress_with_control = [
    "BTH",
    "BTHControl",
    "ColdLeaf",
    "ColdLeafControl",
    "DroughtLeaf",
    "DroughtRoot",
    "Heat",
    "HeatControl",
    "Meja",
    "MejaControl",
    "RootControl",
    "SaltLeaf",
    "SaltLeafControl",
    "SaltRoot",
]

In [None]:
# rlog trace files for the time-course datasets (one per condition / tissue).
shd_trace_file = (
    "/Users/fekeann/Documents/Lab/RNAseq/renormalized_data/rlogs/trace_format/"
    "full_diel_norm_atl_shd_leaf.csv"
)
shd_tuber_trace_file = (
    "/Users/fekeann/Documents/Lab/RNAseq/renormalized_data/rlogs/trace_format/"
    "full_diel_norm_atl_shd_tuber.csv"
)
lgd_trace_file = (
    "/Users/fekeann/Documents/Lab/RNAseq/renormalized_data/rlogs/trace_format/"
    "full_diel_norm_atl_lgd_leaf.csv"
)

# DESeq output covering all tissue samples; the tissue and stress analyses both read it.
tissue_rlog_all = (
    "/Users/fekeann/Documents/Lab/RNAseq/Tissues/deseq_outputs/"
    "all_tissues_outfile.csv"
)

# Table of syntelog groupings; it is read with pandas and filtered by geneID below.
synt_file = (
    "/Users/fekeann/Documents/Lab/RNAseq/Syntelogs/final_selections/publication_output/"
    "syntelogs_atl_only.csv"
)

In [None]:
# Maps the short dataset key used by the driver loop to the file it should read.
tc_dict = dict(
    shd=shd_trace_file,
    lgd=lgd_trace_file,
    tuber=shd_tuber_trace_file,
    tissue=tissue_rlog_all,
)

In [None]:
# Load the syntelog table and clean it in one pass:
#   1. keep only rows whose geneID contains "Atl"
#   2. derive the haplotype from the first character of the last "_"-separated term
#   3. get rid of "S" haplotypes; we only want those with known chromosomes
#   4. remove isoform information (everything after the last ".") from the geneIDs
syntelogs = (
    pd.read_csv(synt_file)
    .loc[lambda df: df["geneID"].str.contains("Atl")]
    .assign(Haplotype=lambda df: df["geneID"].map(lambda gene_id: gene_id.split("_")[-1][0]))
    .loc[lambda df: df["Haplotype"].isin(["0", "1", "2", "3", "4"])]
    .assign(geneID=lambda df: df["geneID"].map(lambda gene_id: ".".join(gene_id.split(".")[:-1])))
)

In [None]:
def avg_zscore_format(path, dtype="timecourse", use_cols = None):
    """
    Converts trace files with replication to the z-scored version, averaged by time
    INPUTS:
        path:     str, the filepath to the rlog trace file
        dtype:    str, refers to what type of dataset is found in path
                  options: "timecourse" or "tissue";
                  "timecourse" is used for LgD or ShD (leaf or tuber)
                  "tissue" is used for stress or tissue datasets
        use_cols: str or None, refers to which columns to use for "tissue" dtype
                  options: "tissue" or "stress"
                  references the lists tissue_simple or stress_with_control in cell 3
                  any other value (including None) keeps every column

    OUTPUTS:
        A pandas df containing the z-scored, time-averaged traces:
        one row per gene (indexed by "geneID" with the isoform suffix removed)
        and one column per time point / sample type
    """

    data = pd.read_csv(path)

    # the tissue file doesn't have "geneID" as the column label
    if dtype == "tissue":
        data = data.rename({"Unnamed: 0": "geneID"}, axis=1)

    data = data.set_index("geneID")

    if dtype == "tissue":
        # we only care about the first two terms of the header;
        # the replicate information is required since we can't have two identically named columns
        data = data.rename({col: "_".join(col.split("_")[:2]) for col in data.columns}, axis=1)

        # filter the dataset to just having the appropriate columns for the analysis being done
        if use_cols == "stress":
            wanted_samples = stress_with_control
        elif use_cols == "tissue":
            wanted_samples = tissue_simple
        else:
            wanted_samples = None
        if wanted_samples is not None:
            data = data[[col for col in data.columns if col.split("_")[0] in wanted_samples]]

    # convert from wide to long; sample type is now "time" even if it's a tissue/stress.
    data = data.stack().reset_index().rename({"level_1":"time", 0:"mean_expression"}, axis=1)

    # get rid of replicate number from the sample info
    if dtype == "tissue":
        data["time"] = data["time"].apply(lambda x: x.split("_")[0])
    else:
        # time-course headers look like "T<hours>_<replicate>"; keep the integer after the "T"
        data["time"] = data["time"].apply(lambda x: int(x.split("_")[0].split("T")[-1]))

    # average the expression within gene and condition
    data = data.groupby(['geneID', 'time']).mean().reset_index().set_index("geneID")

    # calculate the z-score for the averaged dataset.
    # zscore is applied once to the whole genes x time matrix (axis=1 = within each gene)
    # instead of row by row through DataFrame.apply: the row-wise version only produced a
    # DataFrame when zscore happened to hand back a pandas Series, which scipy does not promise.
    wide = data.pivot(columns="time")
    scores = zscore(wide.to_numpy(dtype=float), axis=1, ddof=1)
    # pivot leaves a ("mean_expression", time) column MultiIndex; keep only the time level
    zscores = pd.DataFrame(scores, index=wide.index, columns=wide.columns.droplevel(0))

    # remove isoform information from geneID
    zscores.index = zscores.index.map(lambda x: ".".join(x.split(".")[:-1])).rename("geneID")

    return zscores

In [None]:
def calculate_pair_corr(zscores, pair):
    """
    Calculate the correlation of two expression patterns; returning None if
    either gene is absent from zscores
    INPUTS:
        zscores: a df containing the z-scored expression patterns, indexed by "geneID"
        pair:    a sequence of strs; the first two entries are the geneIDs of the
                 genes to be correlated

    OUTPUT:
        a float or None, representing the correlation between the two patterns
        (NaN when a correlation cannot be computed, e.g. an all-NaN pattern)

    RAISES:
        ValueError if either geneID labels more than one row of zscores,
        since it is then ambiguous which pattern should be correlated
    """

    gene_ids = (pair[0], pair[1])

    # index membership is a hash lookup, so we avoid scanning the whole index
    # twice for every pair that gets correlated
    if any(gene_id not in zscores.index for gene_id in gene_ids):
        return None

    patterns = []
    for gene_id in gene_ids:
        # list-style .loc always returns a DataFrame, whatever the number of matches
        matches = zscores.loc[[gene_id]]
        if matches.shape[0] != 1:
            raise ValueError(
                f"geneID {gene_id!r} labels {matches.shape[0]} rows in zscores; expected exactly 1"
            )
        # .iloc[0] yields a Series over the columns even when there is a single column
        patterns.append(matches.iloc[0])

    return patterns[0].corr(patterns[1])

In [None]:
def correlation_df(tc_zscores, gene_groups=syntelogs):
    """
    Generate a pandas df containing the pairwise correlations of
    expression patterns of genes as indicated in gene_groups

    INPUTS:
        tc_zscores:  a df containing the z-scored expression patterns of all genes, indexed by "geneID"
        gene_groups: a df containing one column of "geneID"s and one of groupings ("Syntelog")

    OUTPUT:
        a pandas df with the columns Syntelog, Haplotype_1, Haplotype_2, Correlation;
        one row per within-group gene pair for which both genes are present in tc_zscores
    """

    column_names = ["Syntelog", "Haplotype_1", "Haplotype_2", "Correlation"]
    records = []

    # one groupby pass replaces a full boolean filter of gene_groups per syntelog;
    # sort=False keeps the groups in order of first appearance
    grouped = gene_groups.groupby("Syntelog", sort=False)
    n = grouped.ngroups

    # iterate over all the groupings
    for count, (synt, cur_synt) in enumerate(grouped, start=1):

        # update status
        print(f"Correlating Syntelog {count} of {n}", end="\r")

        # the distinct geneIDs of the current syntelog group
        cur_haps = cur_synt.geneID.unique()

        # use combinations to get all pairwise combinations, regardless of orientation
        for cur_pair in combinations(cur_haps, 2):
            # pass the pair itself: re-deriving the two IDs by filtering the group can
            # return a geneID more than once (several rows may share an ID), which
            # would correlate a gene with itself
            cur_corr = calculate_pair_corr(tc_zscores, list(cur_pair))
            # we only want to record the correlation if they're both present in the file
            if cur_corr is not None:
                records.append((synt, cur_pair[0], cur_pair[1], cur_corr))

    # convert the collected rows to a dataframe
    return pd.DataFrame(records, columns=column_names)

In [None]:
# NOTE(review): `broken` and `done` are not read anywhere in this cell; they are
# kept in case a cell outside this view relies on them — confirm and delete if not.
broken = False
time_courses = ["shd", "lgd", "tuber", "tissue_rlog_tissue", "tissue_rlog_stress"]
# NOTE(review): the `mixer.music.load(attention_song)` call that used to sit here was
# removed: neither `mixer` nor `attention_song` is imported or defined in the notebook
# as shown, so the cell raised NameError on a fresh kernel.
done = False
for tc in time_courses:
    # figure out which files and types and columns to use based on which tc is current
    print(f"On {tc}")
    if "tissue" in tc:
        # both "tissue_rlog_*" runs read the same file and differ only in the columns kept
        use_tc = "tissue"
        dtype = "tissue"
        use_cols = "stress" if "stress" in tc else "tissue"
    else:
        use_tc = tc
        dtype = "timecourse"
        use_cols = None

    # make the z-score df
    print("Calculating z-scores", end="\r")
    tc_zscores = avg_zscore_format(tc_dict[use_tc], dtype, use_cols)

    # make the correlation df and save it to a csv
    output = correlation_df(tc_zscores)
    # os.path.join gives the same path whether or not outfolder ends with a separator
    output.to_csv(os.path.join(outfolder, f"{tc}_pairwise_corrs.csv"), index=False)
    disp.clear_output(wait=True)