In [1]:
import os
import sys
import pandas as pd
import re
import numpy as np
from collections import defaultdict
from scipy import stats
import mitosheet


In [2]:
#Load molecule/fluorescence file
name = 'PC6'
year = '2020'
moleculeFilePath = r'/home/erika/Downloads/CorrInput/FT_EEMs_'+year+'_' + name +'.csv'
moleculesAsDF = pd.read_csv(moleculeFilePath)

  moleculesAsDF = pd.read_csv(moleculeFilePath)


In [3]:
# Deleted columns Unnamed: 0.1, Sample, Unnamed: 0, S350_400, bix_parafac_meta, a254, a300, E2_E3, S275_295, SR, order_RI_raw, year, year_dom, Month_RI_raw, fi_RI_raw, hix_RI_raw, mhix_RI_raw, bix_RI_raw, TOC_conc, TN_conc, sample, Site, Position, Depth, date.collected, Replicate, Month_parafac_meta, order_parafac_meta, date, fi_parafac_meta, hix_parafac_meta, mhix_parafac_meta
if year == '2020':
    moleculesAsDF.drop(['Unnamed: 0.1', 'Sample', 'Unnamed: 0', 'S350_400', 'bix_parafac_meta', 'a254', 'a300', 'E2_E3', 'S275_295', 'SR', 'order_RI_raw', 'year', 'year_dom', 'Month_RI_raw', 'fi_RI_raw', 'hix_RI_raw', 'mhix_RI_raw', 'bix_RI_raw', 'TOC_conc', 'TN_conc', 'sample', 'Site', 'Position', 'Depth', 'date.collected', 'Replicate', 'Month_parafac_meta', 'order_parafac_meta', 'date', 'fi_parafac_meta', 'hix_parafac_meta', 'mhix_parafac_meta'], axis=1, inplace=True)
    moleculesAsDF.drop(columns=moleculesAsDF.columns[moleculesAsDF.sum()==0], inplace=True)
    moleculesAsDF=moleculesAsDF.dropna(axis='rows')
    moleculesAsDF.set_index('ICBM_Code')

In [4]:
#Get names of relevant columns
fluRegex = 'Comp.' + '[0-9]+'
molRegex = '([A-Z]+[a-z]?[0-9]*)+'
allColNames = [x for x in moleculesAsDF.columns]
allFluStrings = [x for x in allColNames if re.fullmatch(fluRegex,x)!=None]
allMolStrings = [x for x in allColNames if re.fullmatch(molRegex,x)!=None]

In [5]:
#Returns a dictionary mapping strings of fluorescence names to a tuple of (molecule, correlation, p value)
def getCorrDict(molDF, pThreshold):
    toReturn = defaultdict(lambda : [])
  # Correlate each molecule with each fluorescence signal
    for fluString in allFluStrings:
        print("\nComputing correlation for", fluString, '\n\tDone with: ', end = '')
        for molInd, molString in enumerate(allMolStrings):
            if molInd % 2000 == 0:
                print(str(molInd) +', ', end='')
        # print(len(allMolStrings))
            fluData = np.array(molDF[fluString])
            molData = np.array(molDF[molString])
      # The p value is for the null hypothesis that the inputs ARE NOT correlate
            rho, pVal = stats.spearmanr(fluData, molData)
            if pVal <= pThreshold:
                toReturn[fluString].append((molString, rho, pVal))
        # print(rho)
        print( end='')
        print(' all', molInd, 'molecules; Found', len(toReturn[fluString]), 'correlated molecules', end = '')
      
    return toReturn


In [6]:
#P value to threshold on null hypothesis that things ARE NOT correlated; will take everything with p value LESS than this
pThresh = .00001
corrDict = getCorrDict(moleculesAsDF, pThresh)
print(corrDict)



Computing correlation for Comp.1 
	Done with: 0, 2000, 4000, 6000,  all 7393 molecules; Found 284 correlated molecules
Computing correlation for Comp.2 
	Done with: 0, 2000, 4000, 6000,  all 7393 molecules; Found 3 correlated molecules
Computing correlation for Comp.3 
	Done with: 0, 2000, 4000, 6000,  all 7393 molecules; Found 1804 correlated molecules
Computing correlation for Comp.4 
	Done with: 0, 2000, 4000, 6000,  all 7393 molecules; Found 1346 correlated molecules
Computing correlation for Comp.5 
	Done with: 0, 2000, 4000, 6000,  all 7393 molecules; Found 1162 correlated molecules
Computing correlation for Comp.6 
	Done with: 0, 2000, 4000, 6000,  all 7393 molecules; Found 0 correlated molecules
Computing correlation for Comp.7 
	Done with: 0, 2000, 4000, 6000,  all 7393 molecules; Found 9 correlated moleculesdefaultdict(<function getCorrDict.<locals>.<lambda> at 0x7efe7cce0310>, {'Comp.1': [('C10H4O4', 0.352376219319548, 2.147412385403279e-08), ('C10H4O5', 0.37766368100518405

In [7]:
allDFsDict = dict()

# Code to format correlations as data frames
for fluString in allFluStrings:
    allTuples = corrDict[fluString]
    allMoleculeStrings = [x[0] for x in allTuples]
    allCorStrings = [x[1] for x in allTuples]
    allPStrings = [x[2] for x in allTuples]

    d = {'molecule': allMoleculeStrings, 'rho': allCorStrings, 'p_val': allPStrings}
    flurDF = pd.DataFrame(data=d)
    allDFsDict['correlationsFor' + fluString + name+ year+'.csv'] = flurDF

display(allDFsDict)

{'correlationsForComp.1PC7o2020.csv':     molecule       rho         p_val
 0    C10H4O4  0.352376  2.147412e-08
 1    C10H4O5  0.377664  1.611160e-09
 2    C10H4O6  0.362300  7.982373e-09
 3    C10H4O7  0.349528  2.834998e-08
 4    C10H4O8  0.299802  2.363880e-06
 ..       ...       ...           ...
 279   C9H6O7  0.327693  2.180257e-07
 280   C9H8O4  0.298510  2.624372e-06
 281   C9H8O5  0.310863  9.455558e-07
 282   C9H8O6  0.322373  3.500389e-07
 283   C9H8O7  0.315837  6.185823e-07
 
 [284 rows x 3 columns],
 'correlationsForComp.2PC7o2020.csv':      molecule       rho     p_val
 0    C18H33NO  0.291076  0.000005
 1  C22H25N3O8  0.298289  0.000003
 2    C9H17NO3  0.298142  0.000003,
 'correlationsForComp.3PC7o2020.csv':        molecule       rho         p_val
 0      C10H10O6  0.323693  3.115174e-07
 1      C10H10O7  0.319648  4.445675e-07
 2      C10H10O8  0.363617  6.981985e-09
 3      C10H10O9  0.283200  8.724789e-06
 4     C10H12O5S -0.284624  7.825542e-06
 ...         ...   

In [8]:
# Write DFs to file
outputDirectory = r'/home/erika/Downloads/CorrInput/CorrOutput/'
for fileString, df in allDFsDict.items(): 
    display(df)
    df.to_csv(outputDirectory + fileString, encoding='utf-8', index=False)


Unnamed: 0,molecule,rho,p_val
0,C10H4O4,0.352376,2.147412e-08
1,C10H4O5,0.377664,1.611160e-09
2,C10H4O6,0.362300,7.982373e-09
3,C10H4O7,0.349528,2.834998e-08
4,C10H4O8,0.299802,2.363880e-06
...,...,...,...
279,C9H6O7,0.327693,2.180257e-07
280,C9H8O4,0.298510,2.624372e-06
281,C9H8O5,0.310863,9.455558e-07
282,C9H8O6,0.322373,3.500389e-07


Unnamed: 0,molecule,rho,p_val
0,C18H33NO,0.291076,5e-06
1,C22H25N3O8,0.298289,3e-06
2,C9H17NO3,0.298142,3e-06


Unnamed: 0,molecule,rho,p_val
0,C10H10O6,0.323693,3.115174e-07
1,C10H10O7,0.319648,4.445675e-07
2,C10H10O8,0.363617,6.981985e-09
3,C10H10O9,0.283200,8.724789e-06
4,C10H12O5S,-0.284624,7.825542e-06
...,...,...,...
1799,C9H8O4,0.405606,7.028360e-11
1800,C9H8O5,0.428605,4.249907e-12
1801,C9H8O6,0.446510,4.110211e-13
1802,C9H8O7,0.424067,7.518853e-12


Unnamed: 0,molecule,rho,p_val
0,C10H10O3,0.341988,5.837535e-08
1,C10H10O4,0.410833,3.784427e-11
2,C10H10O5,0.444695,5.241313e-13
3,C10H10O6,0.472485,1.077606e-14
4,C10H10O7,0.464921,3.212529e-14
...,...,...,...
1341,C9H8O4,0.514523,1.484161e-17
1342,C9H8O5,0.537871,2.532307e-19
1343,C9H8O6,0.529622,1.106087e-18
1344,C9H8O7,0.508390,4.109308e-17


Unnamed: 0,molecule,rho,p_val
0,C10H10O5,-0.301113,2.124670e-06
1,C10H10O6,-0.340838,6.506824e-08
2,C10H10O7,-0.341688,6.005211e-08
3,C10H10O8,-0.391615,3.498804e-10
4,C10H10O9,-0.343077,5.265700e-08
...,...,...,...
1157,C9H8O4,-0.389782,4.293778e-10
1158,C9H8O5,-0.407203,5.823903e-11
1159,C9H8O6,-0.412980,2.925580e-11
1160,C9H8O7,-0.407487,5.632238e-11


Unnamed: 0,molecule,rho,p_val


Unnamed: 0,molecule,rho,p_val
0,C11H22O6,0.291904,4.444924e-06
1,C13H16O7S,0.327508,2.216659e-07
2,C13H18O5S,0.28998,5.16938e-06
3,C14H20O5S,0.281884,9.642462e-06
4,C14H22O5S,0.31254,8.202105e-07
5,C15H22O5S,0.291726,4.50752e-06
6,C17H34O9,0.291927,4.436878e-06
7,C5H9NO2S,0.331169,1.592244e-07
8,C7H5NOS,0.333574,1.278156e-07
