-
Notifications
You must be signed in to change notification settings - Fork 2
/
create_opticgroups_from_mdoc.py
executable file
·175 lines (138 loc) · 5.84 KB
/
create_opticgroups_from_mdoc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#!/usr/bin/env python3
import sys
import os
import numpy as np
import argparse
from copy import deepcopy
from os import listdir
from metadata import MetaData
from os.path import isfile, join
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
parser = argparse.ArgumentParser(
description="Clusters beam-shifts extracted from mdoc files into opticgroups.")
add = parser.add_argument
add('--i', help="Input mdoc directory.")
add('--istar', default="", help="Input particles star file.")
add('--o', default="", help="Output star file. If empty no file generated generated.")
add('--o_shifts', default="",
help="Output file with extracted beam-shifts and cluster numbers. If empty no file generated generated.")
add('--clusters', type=str, default="1",
help="Number of clusters the beam-shifts should be divided in. (default: 1)")
add('--elbow', type=str, default="0",
help="Number of max clusters used in Elbow method optimal cluster number determination. (default: 0)")
add('--max_iter', type=str, default="300",
help="Expert option: Maximum number of iterations of the k-means algorithm for a single run. (default: 300)")
add('--n_init', type=str, default="10",
help="Expert option: Number of time the k-means algorithm will be run with different centroid seeds. (default: 10)")
args = parser.parse_args()
try:
clusters = int(args.clusters)
elbow = int(args.elbow)
max_iter = int(args.max_iter)
n_init = int(args.n_init)
except ValueError:
print("--clusters, --elbow, --max_iter and --n_init require integer values for comparison.")
sys.exit(2)
if len(sys.argv) == 1:
parser.print_help()
sys.exit(2)
if not args.i:
print("No input file given.")
parser.print_help()
sys.exit(2)
if not args.o:
print("No output file given. No star file will be saved.")
if not os.path.exists(args.i):
print("Input directory '%s' not found."
% args.i)
sys.exit(2)
def elbowMethod(maxClusters, inputArray, maxIter, nInit):
wcss = []
for i in range(1, maxClusters):
kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=maxIter, n_init=nInit, random_state=0)
kmeans.fit(inputArray)
wcss.append(kmeans.inertia_)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.plot(range(1, maxClusters), wcss)
plt.show()
def kmeansClustering(nClusters, inputArray, maxIter, nInit):
kmeans = KMeans(n_clusters=nClusters, init='k-means++', max_iter=maxIter, n_init=nInit, random_state=0)
pred_y = kmeans.fit_predict(inputArray)
# make cluster numbering start form 1 not 0
pred_y_1 = [x + 1 for x in pred_y]
# plot the clustering
plt.title('Beam-shifts distribution clustering')
plt.xlabel('Beam-shift X')
plt.ylabel('Beam-shift Y')
plt.scatter(inputArray[:, 0], inputArray[:, 1])
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red')
plt.show()
return pred_y_1
def saveClusteredShifts(fileName, inputArray, clusterIDs):
clustered_array = np.append(inputArray, np.reshape(np.array(clusterIDs), (-1, 1)), axis=1)
np.savetxt(fileName, clustered_array, delimiter=',', header="beamShiftX, beamShiftY, clusterNr")
def saveStarFile(starFileName, mrcFileNames, pred_y):
with open(starFileName, 'w') as starFile:
starFile.write("data_\n\nloop_\n_rlnMicrographName #1\n_rlnBeamTiltClass #2\n")
for mrcFileName, pred_y_val in zip(mrcFileNames, pred_y):
starFile.write("%s %d\n" % (mrcFileName, pred_y_val))
mdocFiles = [f for f in listdir(args.i) if isfile(join(args.i, f))]
beamShifts = []
mrcFileNames = []
for mdocFileName in mdocFiles:
if ".mdoc" in mdocFileName and "mic" in mdocFileName:
mrcFileNames.append("%smrc" % mdocFileName[0:-8])
mdocFile = open(args.i + "/" + mdocFileName, 'r')
for line in mdocFile:
if "ImageShift" in line:
shiftx = line.split(" ")[2]
shifty = line.split(" ")[3]
mdocFile.close()
beamShifts.append([float(shiftx), float(shifty)])
beamShiftArray = np.array(beamShifts)
if elbow > 0:
elbowMethod(elbow, beamShiftArray, max_iter, n_init)
else:
pred_y = kmeansClustering(clusters, beamShiftArray, max_iter, n_init)
micDic = dict(zip(mrcFileNames, pred_y))
if (not args.o == "") and elbow == 0:
if not args.istar:
print("No particle star input file given. Please, define one by --istar")
sys.exit(2)
# read in particle star file
md = MetaData(args.istar)
particles = []
for particle in md:
particles.append(particle)
# create output star file
mdOut = MetaData()
mdOut.version = "3.1"
mdOut.addDataTable("data_optics", True)
mdOut.addLabels("data_optics", md.getLabels("data_optics"))
mdOut.addDataTable("data_particles", True)
mdOut.addLabels("data_particles", md.getLabels("data_particles"))
# create optics groups
opticGroup = md.data_optics[0]
opticsGroups = []
for opticGroupNr in range(max(pred_y)):
newOpticGroup = deepcopy(opticGroup)
newOpticGroup.rlnOpticsGroup = opticGroupNr+1
newOpticGroup.rlnOpticsGroupName = "opticsGroup"+str(opticGroupNr+1)
opticsGroups.append(newOpticGroup)
mdOut.addData("data_optics", opticsGroups)
# create particles table
new_particles = []
for particle in particles:
try:
particle.rlnOpticsGroup = micDic[particle.rlnMicrographName.split("/")[-1]]
except:
print("Warning: No mdoc file found for micrograph: %s. Removing particle from the output star." % particle.rlnMicrographName.split("/")[-1])
else:
new_particles.append(particle)
mdOut.addData("data_particles", new_particles)
mdOut.write(args.o)
if (not args.o_shifts == "") and elbow == 0:
saveClusteredShifts(args.o_shifts, beamShiftArray, pred_y)