In [1]:
import pandas as pd
import numpy as np
import math
import openpyxl
import itertools
from more_itertools import roundrobin
import os

In [14]:
chart = pd.read_csv('test_csv_data/DE_Results_SampleType_DCT.csv')
chart

Unnamed: 0,FC,Std..Error,df,t.value,Pval,gene,Significance,test,fdr,fwer,score,pos_score,neg_score,List
0,0.088911,0.984257,1.996746,0.090333,0.936266,A2M,0.028601,normal vs Chronic@DCT,0.99997,1.0,0.008447,0.008447,0.000000,DE_Results_SampleType_DCT
1,-0.226913,0.499025,1.981445,-0.454713,0.694270,A4GALT,0.158472,normal vs Chronic@DCT,0.99997,1.0,0.119454,0.000000,0.119454,DE_Results_SampleType_DCT
2,-0.347173,0.238154,17.000000,-1.457766,0.163131,AAAS,0.787463,normal vs Chronic@DCT,0.99997,1.0,0.908167,0.000000,0.908167,DE_Results_SampleType_DCT
3,0.179220,0.846135,1.981646,0.211810,0.852040,AACS,0.069540,normal vs Chronic@DCT,0.99997,1.0,0.041401,0.041401,0.000000,DE_Results_SampleType_DCT
4,-0.453333,0.451216,1.129613,-1.004691,0.482729,AADAT,0.316297,normal vs Chronic@DCT,0.99997,1.0,0.476324,0.000000,0.476324,DE_Results_SampleType_DCT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11154,0.557034,1.048075,1.966344,0.531483,0.649005,ZXDA,0.187752,normal vs Chronic@DCT,0.99997,1.0,0.347422,0.347422,0.000000,DE_Results_SampleType_DCT
11155,0.200685,0.398145,17.000000,0.504049,0.620697,ZXDB,0.207121,normal vs Chronic@DCT,0.99997,1.0,0.138079,0.138079,0.000000,DE_Results_SampleType_DCT
11156,-0.335209,0.161136,1.153702,-2.080283,0.258250,ZXDC,0.587959,normal vs Chronic@DCT,0.99997,1.0,0.654717,0.000000,0.654717,DE_Results_SampleType_DCT
11157,0.761105,0.413255,1.837370,1.841735,0.217911,ZYX,0.661722,normal vs Chronic@DCT,0.99997,1.0,1.673055,1.673055,0.000000,DE_Results_SampleType_DCT


In [3]:
os.listdir('data')

['All Data WTA.xlsx',
 'DE_Results_SampleType_PCT.xlsx',
 'DE_Results_TissueType_Chronic.xlsx',
 'DE_Results_SampleType_DCT.xlsx',
 'DE_Results_SampleType_glomeruli.xlsx',
 'DE_Results_TissueType_normal.xlsx']

In [42]:
def gene_score(df):
    df['score'] = abs(df['FC'] * np.log2(df['Pval']))

    return df

def pos_neg_score(df):
    # Instances where the score is 0?
    pos = []
    neg = []

    for index, value in df.iterrows():
        pos.append(value['score'] if value['FC'] > 0 else 0)
        neg.append(value['score'] if value['FC'] < 0 else 0)
    
    df['pos_score'] = pos
    df['neg_score'] = neg

    return df

def rr_list_prep(df):
    dict = set()

    for index, value in df.iterrows():
        dict.add((value['gene'], value['test'], value['Pval'], value['FC'], value['score'], value['pos_score'], value['neg_score'], value['List']))

    return list(dict)

def rr_lists(is_pos, dir):
    lists = []
    
    if is_pos:
        for file in os.listdir(dir):
            lists.append(rr_list_prep(pd.read_csv(dir + file).sort_values(by='pos_score', ascending=False)))
    else:
        for file in os.listdir(dir):
            lists.append(rr_list_prep(pd.read_csv(dir + file).sort_values(by='neg_score', ascending=False)))

    return lists

def rr_choose(size, pval_max, score_min, *lists):
    output = set()

    pc = 0
    nc = 0

    for gene in roundrobin(*lists):
        # If pval < 0.05 and If score > 0.5
        if gene[2] < 0.5 and gene[4] > 0.5:
            if gene[3] < 0:
                if nc <= (size/2):
                    output.add(gene)
                    nc += 0
            else:
                if pc <= (size/2):
                    output.add(gene)
                    pc += 0

        if len(output) == size:
            break

    df = pd.DataFrame(columns=['gene', 'test', 'pval', 'fc', 'score', 'pos_score', 'neg_score', 'List'])

    for each in list(output):
        df.loc[len(df)] = [each[0], each[1], each[2], each[3], each[4], each[5], each[6], each[7]]

    return df

def create_csv(in_dir, out_dir):
    tables = {}

    # Reads in xl files from directory and adds them to dict
    # With xl name as key
    for item in os.listdir(in_dir)[1:]:
        tables[item] = pd.read_excel(in_dir + item)

    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    for key, value in zip(tables.keys(), tables.values()):
        key = key[:-5]

        df = pos_neg_score(gene_score(value))
        df['List'] = [key] * len(df)
        df.to_csv(out_dir + key + '.csv', index=False)


In [43]:
p = [0.05, 0.025, 0.01]
s = [0.5, 0.75, 0.9]

for pv in p:
    for sc in s:
        out_dir = 'pval' + str(pv).replace('.', '')  + '_score' + str(sc).replace('.', '') + '/'

        create_csv('data/', out_dir)

        pos_lists = rr_lists(True, out_dir)
        neg_lists = rr_lists(False, out_dir)

        output = rr_choose(100, pv, sc, pos_lists[0], pos_lists[1], pos_lists[2], pos_lists[3], pos_lists[4], neg_lists[0], neg_lists[1], neg_lists[2], neg_lists[3], neg_lists[4])

        output.to_csv(out_dir[:-1] + '.csv', index=False)


In [23]:
output

Unnamed: 0,gene,test,pval,fc,score,pos_score,neg_score,List
0,VEGFB,PCT vs DCT@normal,0.001563,0.650502,6.063890,6.063890,0.000000,DE_Results_TissueType_normal
1,SNCA,DCT vs glomeruli@Chronic,0.018808,1.008470,5.781077,5.781077,0.000000,DE_Results_TissueType_Chronic
2,BTBD10,normal vs Chronic@DCT,0.219617,0.439390,0.960918,0.960918,0.000000,DE_Results_SampleType_DCT
3,CARD19,normal vs Chronic@DCT,0.121028,-0.428438,1.305276,0.000000,1.305276,DE_Results_SampleType_DCT
4,C7orf31,PCT vs DCT@normal,0.097399,-0.605838,2.035585,0.000000,2.035585,DE_Results_TissueType_normal
...,...,...,...,...,...,...,...,...
95,JAG1,normal vs Chronic@glomeruli,0.162257,-0.702606,1.843390,0.000000,1.843390,DE_Results_SampleType_glomeruli
96,FYN,normal vs Chronic@PCT,0.139352,-0.525604,1.494393,0.000000,1.494393,DE_Results_SampleType_PCT
97,PLPP1,DCT vs glomeruli@Chronic,0.000925,2.373146,23.916945,23.916945,0.000000,DE_Results_TissueType_Chronic
98,AAMP,DCT vs glomeruli@Chronic,0.139760,0.335517,0.952526,0.952526,0.000000,DE_Results_TissueType_Chronic


In [29]:
int(0.5)

0

In [45]:
pd.read_csv('pval0025_score075.csv').sort_values(by=['pos_score'], ascending=False)

Unnamed: 0,gene,test,pval,fc,score,pos_score,neg_score,List
67,CAPN2,PCT vs DCT@normal,0.000079,1.347353,18.351172,18.351172,0.000000,DE_Results_TissueType_normal
28,FAU,PCT vs DCT@normal,0.000008,0.908270,15.329515,15.329515,0.000000,DE_Results_TissueType_normal
56,ARHGEF26,DCT vs glomeruli@Chronic,0.000434,1.249898,13.959674,13.959674,0.000000,DE_Results_TissueType_Chronic
0,VEGFB,PCT vs DCT@normal,0.001563,0.650502,6.063890,6.063890,0.000000,DE_Results_TissueType_normal
1,SNCA,DCT vs glomeruli@Chronic,0.018808,1.008470,5.781077,5.781077,0.000000,DE_Results_TissueType_Chronic
...,...,...,...,...,...,...,...,...
43,CCDC91,DCT vs glomeruli@Chronic,0.049804,-0.627050,2.713614,0.000000,2.713614,DE_Results_TissueType_Chronic
42,IGFBP5,normal vs Chronic@glomeruli,0.006942,-1.677519,12.028459,0.000000,12.028459,DE_Results_SampleType_glomeruli
41,CWC27,normal vs Chronic@DCT,0.236108,-0.758921,1.580440,0.000000,1.580440,DE_Results_SampleType_DCT
39,CCDC14,normal vs Chronic@glomeruli,0.006545,-1.318088,9.563307,0.000000,9.563307,DE_Results_SampleType_glomeruli
