From 06d6fe7e83d042495a2f0034b369882b4d54e569 Mon Sep 17 00:00:00 2001 From: David Dotson Date: Sun, 4 Jan 2015 12:00:02 -0700 Subject: [PATCH] Finished Sim docs for now. Moving on to Group. --- MDSynthesis/Containers.py | 7 +- MDSynthesis/Core/Aggregators.py | 2 +- docs/Group.rst | 10 +- docs/Sim.rst | 159 ++++++++++++++++++++++++-------- docs/data.rst | 154 +++++++++++++++++++++++++++++++ docs/getting_started.rst | 61 ------------ docs/index.rst | 4 +- 7 files changed, 287 insertions(+), 110 deletions(-) create mode 100644 docs/data.rst delete mode 100644 docs/getting_started.rst diff --git a/MDSynthesis/Containers.py b/MDSynthesis/Containers.py index ff296e6..cb5d357 100644 --- a/MDSynthesis/Containers.py +++ b/MDSynthesis/Containers.py @@ -401,10 +401,11 @@ def __init__(self, group, members=None, location='.', coordinator=None, categori directory to place Group object; default is current directory *coordinator* directory of the Coordinator to associate with this object; if the - Coordinator does not exist, it is created [``None``] + Coordinator does not exist, it is created; if ``None``, the Group + will not associate with any Coordinator *categories* - dictionary with user-defined keys and values; basically used to - give Groups distinguishing characteristics + dictionary with user-defined keys and values; used to give + Groups distinguishing characteristics *tags* list with user-defined values; like categories, but useful for adding many distinguishing descriptors diff --git a/MDSynthesis/Core/Aggregators.py b/MDSynthesis/Core/Aggregators.py index f04b30d..dfc7a94 100644 --- a/MDSynthesis/Core/Aggregators.py +++ b/MDSynthesis/Core/Aggregators.py @@ -541,7 +541,7 @@ def define(self, handle): return self._containerfile.get_selection(self._container._uname, handle) def copy(self, universe): - """Copy defined selections of another universe to the attached universe. + """Copy defined selections of another universe to the active universe. 
:Arguments: *universe* diff --git a/docs/Group.rst b/docs/Group.rst index f34ad07..656bd5c 100644 --- a/docs/Group.rst +++ b/docs/Group.rst @@ -31,8 +31,14 @@ directory where the state file lives :: This Group instance will give access to its members and stored data as before. -Reference -========= +Reference: Group +================ .. autoclass:: MDSynthesis.Group :members: :inherited-members: + +Reference: Members +================== +.. autoclass:: MDSynthesis.Core.Aggregators.Members + :members: + :inherited-members: diff --git a/docs/Sim.rst b/docs/Sim.rst index f5adb2c..22391d4 100644 --- a/docs/Sim.rst +++ b/docs/Sim.rst @@ -1,54 +1,131 @@ -=========================== -Using Sims to organize data -=========================== +================================== +Using Sims to dissect trajectories +================================== -A Sim is a Container with all the machinery required to handle trajectories and -the data generated from them in an organized fashion. +**Sim** objects are designed to store datasets that were obtained from a single +simulation, and they give a direct interface to trajectory data by way of the +`MDAnalysis <https://www.mdanalysis.org>`__ **Universe** object. -To generate a Sim from scratch, we need only give it a name. This will be used -to distinguish the Sim from other Sims, though it need not be unique. We can +To generate a **Sim** from scratch, we need only give it a name. This will be used +to distinguish the **Sim** from others, though it need not be unique. We can also give it a topology and/or trajectory files as we would to an MDAnalysis -Universe :: +**Universe** :: - s = Sim('fluffy', universe=[topology, trajectory]) + >>> s = Sim('fluffy', universe=['path/to/topology', 'path/to/trajectory']) -This will create a directory ``name`` that contains a single file (``Sim.h5``). -That file is a persistent representation of the Sim on disk. 
We can access -trajectory data by way of an MDAnalysis Universe :: +This will create a directory ``fluffy`` that contains a single file +(``Sim.h5``). That file is a persistent representation of the **Sim** on disk. +We can access trajectory data by way of :: - s.universe + >>> s.universe + -It can also store selections by giving the usual inputs to +The **Sim** can also store selections by giving the usual inputs to ``Universe.selectAtoms`` :: - s.selections.add('backbone', ['name CA', 'name C', 'name O1', 'name O2']) - -And the AtomGroup can be conveniently obtained with :: - - s.selections['backbone'] - -The Sim can also store custom data structures. These can be pandas objects -(e.g. Series, DataFrame, Panel), numpy arrays, or other python objects :: - - a = np.random.randn(100, 100) - s.data.add('randomdata', a) - -This can be recalled later with :: - - s.data['randomdata'] - -The real strength of the Sim is how it stores its information. Generating an -object from scratch stores the information needed to re-generate it in the -filesystem. To generate another instance of the same Sim, simply give the -directory where the state file lives :: - - s2 = Sim('fluffy/') + >>> s.selections.add('backbone', 'name CA', 'name N', 'name C') + +And the **AtomGroup** can be conveniently obtained with :: + + >>> s.selections['backbone'] + + +.. note:: Only selection strings are stored, not the resulting atoms of those + selections. This means that if the topology on disk is replaced + or altered, the results of particular selections may change. + +Multiple Universes +================== +Often it is necessary to post-process a simulation trajectory to get it into a +useful form for analysis. This may involve coordinate transformations that +center on a particular set of atoms or fit to a structure, removal of water, +skipping of frames, etc. This can mean that for a given simulation, multiple +versions of the raw trajectory may be needed. 
+ +For this reason, a **Sim** can store multiple **Universe** definitions. To add +a definition, we need a topology and a trajectory file :: + + >>> s.universes.add('anotherU', 'path/to/topology', 'path/to/trajectory') + >>> s.universes + + +and we can make this the active **Universe** with :: + + >>> s.universes['anotherU'] + >>> s + + +Only a single **Universe** may be active at a time. Atom selections that are +stored correspond to the currently active **Universe**, since different +selection strings may be required to achieve the same selection under a +different **Universe** definition. For convenience, we can copy the selections +corresponding to another **Universe** to the active **Universe** with :: + + >>> s.selections.copy('main') + +Need two **Universe** definitions to be active at the same time? Re-generate a +second **Sim** instance from its representation on disk and activate the desired +**Universe**. + +Resnums can also be stored +========================== +Depending on the simulation package used, it may not be possible to have the +resids of the protein match those given in, say, the canonical PDB structure. +This can make selections by resid cumbersome at best. For this reason, residues +can also be assigned resnums. + +For example, say the resids for the protein in our **Universe** range from 1 to 214, +but they should actually go from 10 to 223. 
If we can't change the topology to reflect +this, we could set the resnums for these residues to the canonical values :: + + >>> prot = s.universe.selectAtoms('protein') + >>> prot.residues.set_resnum(prot.residues.resids() + 9) + >>> prot.residues.resnums() + array([ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, + 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, + 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, + 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, + 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, + 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, + 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, + 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, + 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, + 218, 219, 220, 221, 222, 223]) + +We can now select residue 95 from the PDB structure with :: + + >>> s.universe.selectAtoms('protein and resnum 95') + +and we might save selections using resnums as well. However, resnums aren't +stored in the topology, so to avoid having to reset resnums manually each time +we load the **Universe**, we can just store the resnum definition with :: + + >>> s.universes.resnums('main', s.universe.residues.resnums()) + +and the resnum definition will be applied to the **Universe** both now and every +time it is activated. + +Reference: Sim +============== +.. 
autoclass:: MDSynthesis.Sim + :members: + :inherited-members: -This Sim instance will give access to the universe, stored selections, and -stored data as before. +Reference: Universes +==================== +.. autoclass:: MDSynthesis.Core.Aggregators.Universes + :members: + :inherited-members: -Reference -========= -.. autoclass:: MDSynthesis.Sim +Reference: Selections +===================== +.. autoclass:: MDSynthesis.Core.Aggregators.Selections :members: :inherited-members: diff --git a/docs/data.rst b/docs/data.rst new file mode 100644 index 0000000..1e8a3ca --- /dev/null +++ b/docs/data.rst @@ -0,0 +1,154 @@ +======================= +Datasets and Containers +======================= +MDSynthesis is not an analysis code. On its own, it does not produce output +data given raw simulation data as input. Its scope is limited to the boring +and tedious task of data management and storage. It is intended to bring +value to analysis results by making them easily accessible now and later. + +As such, the basic functionality of MDSynthesis is condensed into only two +objects, sometimes referred to as *Containers* in the documentation. These are +the :doc:`Sim <Sim>` and :doc:`Group <Group>` objects. + +In brief, a **Sim** is designed to manage and give access to the data corresponding +to a single simulation (the raw trajectory(s), as well as analysis results); a +**Group** gives access to any number of **Sim** or other **Group** objects +it has as members (including perhaps itself), and can store analysis results +that pertain to these members collectively. Both types of Container store +their underlying data persistently to disk on the fly. The file locking needed +for each transaction is handled automatically, so more than one python process +can be working with any number of instances of the same Container at the same +time. + +.. warning:: As usual, file locking is process-safe, but not thread-safe. Don't + use multiple threads to modify Container elements. 
+ +Persistence as a feature +======================== +Containers store their data as directory structures in the file system. Generating +a new **Sim**, for example, with the following :: + + >>> # python session 1 + >>> import MDSynthesis as mds + >>> s = mds.Sim('marklar') + +creates a directory called ``marklar`` in the current working directory. It contains +a single file at the moment :: + + > # shell + > ls marklar + Sim.h5 + +This is the state file containing all the information needed to regenerate an +identical instance of this **Sim**. In fact, we can open a separate python +session (go ahead!) and regenerate this **Sim** immediately there :: + + >>> # python session 2 + >>> import MDSynthesis as mds + >>> s = mds.Sim('marklar') + +Making a modification to the **Sim** in one session, perhaps by adding a tag, +will be reflected in the **Sim** in the other session :: + + >>> # python session 1 + >>> s.tags.add('TIP4P') + + >>> # python session 2 + >>> s.tags + + +This is because both objects pull their identifying information from the same +file on disk; they store almost nothing in memory. + +Storing arbitrary datasets +========================== +More on things like tags later, but we really care about storing (potentially +large and time consuming to produce) datasets. 
Using our **Sim** ``marklar`` +as the example here, say we have generated a numpy array of shape +(10^6, 3) that gives the minimum distance between the sidechains of three +residues and those of a fourth for each frame in a trajectory :: + + >>> a.shape + (1000000, 3) + +We can store this easily :: + + >>> s.data.add('distances', a) + >>> s.data + + +and recall it :: + + >>> s.data['distances'].shape + (1000000, 3) + +Looking at the contents of the directory ``marklar``, we see it has a new +subdirectory corresponding to the name of our stored dataset :: + + > # shell + > ls marklar + distances Sim.h5 + +which has its own contents :: + + > ls marklar/distances + npData.h5 + +This is the data we stored, serialized to disk in the efficient `HDF5 +<https://www.hdfgroup.org/solutions/hdf5/>`__ data format. Containers will also +store `pandas <https://pandas.pydata.org/>`__ objects using this format. +For other data structures, the Container will pickle them if it can. + +Datasets can be nested however you like. For example, say we had several +pandas **DataFrames** each giving the distance with time of each cation in the +simulation with respect to some residue of interest on our protein. We +could just as well make it clear to ourselves that these are all similar +datasets by grouping them together :: + + >>> s.data.add('cations/residue1', df1) + >>> s.data.add('cations/residue2', df2) + >>> # we can also use setitem syntax + >>> s.data['cations/residue3'] = df3 + >>> s.data + + +and their locations in the filesystem reflect this structure. + +Minimal blobs +============= +Individual datasets get their own place in the filesystem instead of all being +shoved into a single file on disk. This is by design, as it generally means +better performance since individual data files mean less waiting for file +locks to release from other Container instances. Also, it gives a space to put +other files related to the dataset itself, such as figures produced from it. 
+ +You can get the location on disk of a dataset with :: + + >>> s.data.locate('cations/residue1') + '/home/bob/marklar/cations/residue1' + +which is particularly useful for outputting figures. + +Another advantage of organizing Containers at the filesystem level is that +datasets can be handled at the filesystem level. Removing a dataset with a :: + + > # shell + > rm -r marklar/cations/residue2 + +is immediately reflected by the Container :: + + >>> s.data + + +Datasets can likewise be moved and they will still be found by the Container. + +Reference: Data +=============== +The class :class:`MDSynthesis.Core.Aggregators.Data` is the interface used +by Containers to access their stored datasets. It is not intended to be used +on its own, but is shown here to give a detailed view of its methods. + +.. autoclass:: MDSynthesis.Core.Aggregators.Data + :members: + :inherited-members: diff --git a/docs/getting_started.rst b/docs/getting_started.rst deleted file mode 100644 index 04fccf2..0000000 --- a/docs/getting_started.rst +++ /dev/null @@ -1,61 +0,0 @@ -=============== -Getting started -=============== -MDSynthesis is not an analysis code. On its own, it does not produce output -data given raw simulation data as input. Its scope is limited to the boring -but tedious task of data management and storage. It is intended to bring -value to analysis results by making them easily accessible now and later. - -As such, the basic functionality of MDSynthesis is condensed into only two -objects, sometimes referred to as *Containers* in the documentation. These are -the :doc:`Sim ` and :doc:`Group ` objects. - -In brief, a **Sim** is designed to manage and give access to the data corresponding -to a single simulation (the raw trajectory(s), as well as analysis results); a -**Group** gives access to any number of **Sim** or other **Group** objects -it has as members (including perhaps itself), and can store analysis results -that pertain to these members collectively. 
Both types of Container store -their underlying data persistently to disk on the fly. The file locking needed -for each transaction is handled automatically, so more than one python session -can be working with any number of instances of the same Container at the same -time. - -Persistence as a feature -======================== - -Containers store their data as directory structures in the file system. Generating -a new **Sim**, for example, with the following :: - - >>> # python session 1 - >>> import MDSynthesis as mds - >>> s = mds.Sim('marklar') - -creates a directory called ``marklar`` in the current working directory. It contains -a single file at the moment :: - - > # shell - > ls marklar - Sim.h5 - -This is the state file containing all the information needed to regenerate an -identical instance of this **Sim**. In fact, we can open a separate python -session (go ahead!) and regenerate this **Sim** immediately there :: - - >>> # python session 2 - >>> import MDSynthesis as mds - >>> s = mds.Sim('marklar') - -Making a modification to the **Sim** in one session, perhaps by adding a tag, -will be reflected in the **Sim** in the other session :: - - >>> # python session 1 - >>> s.tags.add('TIP3P') - - >>> # python session 2 - >>> s.tags - - -This is because both objects pull their identifying information from the same -file on disk; they store almost nothing in memory. - - diff --git a/docs/index.rst b/docs/index.rst index cb6811f..37274c4 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -82,10 +82,10 @@ Documentation .. toctree:: :maxdepth: 1 - getting_started + data Sim Group - Containers + tags-categories Coordinator Misc