/
correlation_calcs.py
executable file
·195 lines (164 loc) · 10 KB
/
correlation_calcs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import numpy as np
import pandas as pd
import math as math
import csv
import graphviz
import os
import shutil
def correlation_calcs(flag_values):
# Open prescence/abscence matrix from /01_PA_matrix/ directory
with open(os.path.join(flag_values['output'],'01_PA_matrix','pa_matrix.csv'), 'r') as infile:
# Create dataframe from BLAST results, header = row# w/ column names
pa_df = pd.read_csv(infile, sep=',', header=0)
#clean up indexing and names in our pa_df (presence/absence dataframe)
first_col = pa_df.columns[0]
pa_df = pa_df.rename(columns={first_col : "species"})
#an index of gene_names. Do not include indexer row '0' or 'species' column '1' of pa_df (presence/abscence dataframe)
gene_names = pa_df.dtypes.index[1:]
#this function calculates all the terms needed for pearson correlation
#FUNCTION: calc_pearson_terms(pa_df, gene_names), returns E_i (dict gene:# species containing gene) N is total # of species
#Cij_df contains number of species containing both genes i and j
E_i, N, Cij_df = calc_pearson_terms(flag_values, pa_df, gene_names)
#this function calculates pearson correlation values
#FUNCTION: pearson_corr_calc(flag_values, pa_df, gene_names, E_i, N, Cij_df), returns Rij_inverse_df (inverse matrix of pearson correlations)
Rij_inverse_df = pearson_corr_calc(flag_values, pa_df, gene_names, E_i, N, Cij_df)
# Wij = Partial correlation
Wij_df = pd.DataFrame(index=gene_names, columns=gene_names)
max_rel_df = pd.DataFrame(columns=['source', 'target', 'Wij_value', 'correlogy_group', 'correlogy_dir', 'directed'])
# Create empty maximum relationship dataframe
max_rel_df.source = gene_names
for i, name in enumerate(gene_names):
max_rel_df.loc[i, 'directed']='TRUE'
# Calculate all Wij permutations.
for column_index, column_name in enumerate(gene_names):
# Loop through all columns to the right of current column (column_index), so column_index + 1 = column to right
for column_to_right_index, column_to_right_name in enumerate(gene_names[column_index+1:]):
#print(column_name)
Pij = Rij_inverse_df.loc[column_name, column_to_right_name]
Pii = Rij_inverse_df.loc[column_name, column_name]
Pjj = Rij_inverse_df.loc[column_to_right_name, column_to_right_name]
#print("Pij=", Pij, "Pii=", Pii, "Pjj=", Pjj)
Wij_df.at[column_name, column_to_right_name] = -((Pij) / (math.sqrt(Pii * Pjj)))
Wij_df.to_csv(os.path.join(flag_values['output'],'02_correlation','Wij_df_triangle.csv'))
print("->Wrote 'Wij_df_triangle.csv' to '{}".format(os.path.join(flag_values['output']),"02_correlation..") + '\n')
# Convert Wij_df from a triangle matrix into a symmetrical matrix
for column in gene_names:
for row in gene_names:
# Create main diagonal of 1
if column == row:
Wij_df.loc[column, row]=1
# Transpose rows/columns with columns/rows to fill in bottom triangles
else:
Wij_df.loc[row, column] = Wij_df.loc[column, row]
Wij_df.to_csv(os.path.join(flag_values['output'],'02_correlation','Wij_df.csv'))
print("->Wrote 'Wij_df.csv' to '{}".format(os.path.join(flag_values['output']),"02_correlation..") + '\n')
#need print statement here to console
return Wij_df
def calc_mrs(Wij_df, flag_values):
Wij_outerDict = {}
gene_names = Wij_df.dtypes.index
f = open(os.path.join(flag_values['output'],'02_correlation',"Wij_matrix.csv"), "w")
writer = csv.writer(f)
network_list=[]
for row in gene_names:
#this portion of loop writes Wij_matrix.csv file
#sort columns by a given row highest to lowest
Wij_df = Wij_df.sort_values(by=row, ascending=False, axis=1, na_position='last')
current_columns = Wij_df.columns.tolist()
current_columns.remove(row)
current_columns.insert(0,row)
writer.writerow(current_columns)
current_row = Wij_df.loc[row,:].tolist()
current_row.remove(1)
current_row.insert(0,"")
writer.writerow(current_row)
Wij_outerDict[row] = dict(zip(Wij_df.columns, Wij_df.loc[row,:]))
#this data is for creating visual graphics for maximum related networks
temp_list=[]
temp_list.append(current_columns[0])
temp_list.append(current_columns[1])
temp_list.append(current_row[1])
network_list.append(temp_list)
f.close()
#make copy in 03_visual_ouput folder
shutil.copy(os.path.join(flag_values['output'],'02_correlation',"Wij_matrix.csv"),os.path.join(flag_values['output'],'03_visual_output',"co-occurrence.csv"))
return network_list
########################################################################################################################
#this function calculates all the terms needed for pearson correlation
#FUNCTION: calc_pearson_terms(pa_df, gene_names), returns E_i (dict gene:# species containing gene) N is total # of species
#Cij_df contains number of species containing both genes i and j
def calc_pearson_terms(flag_values, pa_df, gene_names):
# E_i is dictionary(gene:number species containing gene)
E_i = {}
for i in gene_names: #do not include indexer row '0' or 'species' column '1'
E_i.update({i:len(pa_df.loc[pa_df[i] == 1, ['species']])})
N = len(pa_df.species)
# Cij = Number of species containing both i and j.
Cij_df = pd.DataFrame(index=gene_names, columns=gene_names)
# Calculates all Cij permutations.Move through columns L to R, stopping at each column to compare
# current columns gene presence in species (rows) to the following columns gene presence.
# Loop through all columns except last
for column_index, column_name in enumerate(gene_names[0:-1]):
print(column_name)
# Loop through all columns to the right of current column (column_index), so column_index + 1 = column to right
for column_to_right_index, column_to_right_name in enumerate(gene_names[column_index+1:]):
# Cij = # species containing both genes i and j, zeroes out everytime you compare new columns
Cij = 0
# Loop through current column and column to right (provided by above 'for' loop) row by row
# to check if they both have '1' values ie gene is present in both organisms
for current_index, current_item in enumerate(pa_df.loc[:, column_name]):
#print(item1) #testing purposes
if current_item == 1 and pa_df.loc[current_index, column_to_right_name] == 1:
Cij += 1
print(" --->", column_to_right_name, Cij)
Cij_df.at[column_name, column_to_right_name] = Cij
# write Cij_df to disk
cij_dir=os.path.join(flag_values['output'],'02_correlation','Cij_df.csv')
Cij_df.to_csv(cij_dir)
print("Wrote 'Cij_df.csv' aka # species with both gene 'i' & gene 'j' to '{}".format(cij_dir + '\n'))
# Debug print(Cij, index, df.loc[index, "species"])
return E_i, N, Cij_df
########################################################################################################################
#this function calculates pearson correlation values
#FUNCTION: pearson_corr_calc(flag_values, pa_df, gene_names, E_i, N, Cij_df), returns Cij_df (matrix of pearson correlations)
def pearson_corr_calc(flag_values, pa_df, gene_names, E_i, N, Cij_df):
#Rij = Pearson Correlation Values of gene i to j. stored in Rij_df
Rij_df = pd.DataFrame(index=gene_names, columns=gene_names, dtype = float)
# Calculate all Rij permutations. Move through Cij _df columns L to R, stopping at each column to
# calculate current column in relationship to following columns.
# Loop through all columns except last
for column_index, column_name in enumerate(gene_names[0:-1]):
print(column_name)
# Loop through all columns to the right of current column (column_index), so column_index + 1 = column to right
for column_to_right_index, column_to_right_name in enumerate(gene_names[column_index+1:]):
current_Cij = Cij_df.loc[column_name, column_to_right_name]
current_E_i = E_i[column_name]
current_E_j = E_i[column_to_right_name]
Rij_df.at[column_name, column_to_right_name] = ((current_Cij * N) - (current_E_i * current_E_j)) / (
math.sqrt(current_E_i) * math.sqrt(current_E_j) * math.sqrt(N-current_E_i) * math.sqrt(N-current_E_j))
# write Rij_df to disk
rij_triangle_dir=os.path.join(flag_values['output'],'02_correlation','Rij_df_triangle.csv')
Rij_df.to_csv(rij_triangle_dir)
print("->Wrote 'Rij_df_triangle.csv' aka Pearson Correlations to '{}".format(rij_triangle_dir + '\n'))
# Convert Rij_df from a triangle matrix into a symmetrical matrix. We need this to create our inverse matrix later.
for column in gene_names:
for row in gene_names:
# Create main diagonal of 0's
if column == row:
Rij_df.loc[column, row]=1
# Transpose rows/columns with columns/rows to fill in bottom triangles
else:
Rij_df.loc[row, column] = Rij_df.loc[column, row]
# write Rij_df symmetrical to disk
rij_sym_dir=os.path.join(flag_values['output'], '02_correlation', 'Rij_df_symmetrical.csv')
Rij_df.to_csv(rij_sym_dir)
print("->Wrote 'Rij_df_symmetrical.csv' to '{}".format(rij_sym_dir + '\n'))
# Create inverse matrix from symmetrical Rij matrix
Rij_inverse_df = pd.DataFrame(np.linalg.pinv(Rij_df.values), Rij_df.columns, Rij_df.index)
# write Rij_inverse_df to disk
Rij_inverse_dir=os.path.join(flag_values['output'],'02_correlation','Rij_inverse_df.csv')
Rij_inverse_df.to_csv(Rij_inverse_dir)
print("->Wrote 'Rij_inverse_df.csv' to '{}".format(Rij_inverse_dir + '\n'))
# Verify matrix multiplication results in the identity matrix.
# identify_matrix_test = Rij_inverse_df.dot(Rij_df)
return Rij_inverse_df