Use SHIFT+ENTER on each cell to execute it!

# Field Guide to Databases

December 2017

# Key:value stores

In [None]:
import dbm
import re

## grab some data

UNIX-based machines generally have a [`words` file](https://en.wikipedia.org/wiki/Words_(Unix)) (used for spellchecking) in `/usr/share/dict/words` or `/usr/dict/words`.  We'll use that as a source of data.

In this environment, we've copied `words` into the local directory so that you can access it online.

In [2]:
# Peek at the first ten lines of the local copy of the word list
# (`head` prints ten lines by default).
# On your own machine you can look at the system copy instead:
# !head /usr/share/dict/words
!head words

A
a
aa
aal
aalii
aam
Aani
aardvark
aardwolf
Aaron


In [3]:
import re

# Lowercase vowels only: capital letters are not matched, so 'A' yields [].
vowel_pattern = re.compile('[aeiou]')
words_path = 'words'  # '/usr/share/dict/words' or '/usr/dict/words' on most machines

# Map each word to the list of its vowels, in order of appearance.
raw_data = {}
with open(words_path) as infile:
    # Iterate over the file object directly so lines are streamed, rather
    # than first being copied into one big list by readlines().
    for line in infile:
        word = line.rstrip()  # drop the trailing newline before using it as a key
        raw_data[word] = vowel_pattern.findall(word)

In [4]:
# Spot-check one entry: the vowels of 'helicopter', in order of appearance.
raw_data['helicopter']

['e', 'i', 'o', 'e']

In [5]:
import dbm
# Flag 'n' always creates a new, empty database (replacing any existing
# 'vowels.dbm') and opens it for reading and writing.
db = dbm.open('vowels.dbm', 'n')

In [6]:
# dbm values must be str or bytes, so storing the raw list of vowels fails on
# the very first item. Catch the error and print it: the failure is still
# demonstrated, but "Restart & Run All" can continue past this cell.
try:
    for (k, v) in raw_data.items():
        db[k] = v
except TypeError as err:
    print('TypeError:', err)

TypeError: gdbm mappings have byte or string elements only

In [7]:
# dbm accepts only str/bytes values, so store each vowel list as its str() form.
for key, vowel_list in raw_data.items():
    db[key] = str(vowel_list)

In [8]:
# Values come back from dbm as bytes: here, the encoded str() form of the list.
db['miasma']

b"['i', 'a', 'a']"

In [9]:
import ast

# The stored value is the bytes form of a list's repr. ast.literal_eval parses
# Python literals only, so unlike eval() it cannot execute code that found its
# way into the database file. It needs str input, hence the decode().
ast.literal_eval(db['miasma'].decode())

['i', 'a', 'a']

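See also: [shelve](https://docs.python.org/3/library/shelve.html), which stores picklable Python objects in a dbm file directly, so no `str`/`eval` round trip is needed.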

# Document db

In [10]:
def word_info(word):
    """Summarise a word as a dict, ready to be stored as a document.

    The returned dict has four keys, in this order:
      'raw'      -- the word exactly as passed in
      'length'   -- len(word)
      'vowels'   -- every match of the notebook-level ``vowel_pattern``,
                    in order of appearance
      'n_vowels' -- the number of those matches
    """
    found = vowel_pattern.findall(word)
    return dict(raw=word, length=len(word), vowels=found, n_vowels=len(found))

In [11]:
# Try the helper on a single word.
word_info('platypus')

{'length': 8, 'n_vowels': 2, 'raw': 'platypus', 'vowels': ['a', 'u']}

# Warning: won't work online

The rest of these cells depend on having MongoDB and neo4j servers running on your machine - in the Binder online environment, those services aren't present, so you'll get errors.  Download this notebook and run it locally to experiment with them!

## MongoDB

Install MongoDB and create a directory for it to save databases to - by default, `/data/db`.

    % sudo mkdir -p /data/db
    Password:
    ~/Dropbox/arguments % whoami
    catherine
    ~/Dropbox/arguments % sudo chown catherine:everyone /data/db
    ~/Dropbox/arguments % mongod


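Start the Mongo server daemon. (The sample log below comes from a run where `/data/db` did not exist yet: `mongod` reports `Data directory /data/db not found` and exits with code 100. If you see this, create the directory as shown above and start `mongod` again.)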

    % mongod
    2017-12-12T17:44:13.727-0500 I CONTROL  [initandlisten] MongoDB starting : pid=89611 port=27017 dbpath=/data/db 64-bit host=I05EDJ-2LGFH05.netgear.com
    2017-12-12T17:44:13.728-0500 I CONTROL  [initandlisten] db version v3.4.10
    2017-12-12T17:44:13.728-0500 I CONTROL  [initandlisten] git version: 078f28920cb24de0dd479b5ea6c66c644f6326e9
    2017-12-12T17:44:13.728-0500 I CONTROL  [initandlisten] OpenSSL version: OpenSSL 1.0.2m  2 Nov 2017
    2017-12-12T17:44:13.728-0500 I CONTROL  [initandlisten] allocator: system
    2017-12-12T17:44:13.728-0500 I CONTROL  [initandlisten] modules: none
    2017-12-12T17:44:13.728-0500 I CONTROL  [initandlisten] build environment:
    2017-12-12T17:44:13.729-0500 I CONTROL  [initandlisten]     distarch: x86_64
    2017-12-12T17:44:13.729-0500 I CONTROL  [initandlisten]     target_arch: x86_64
    2017-12-12T17:44:13.729-0500 I CONTROL  [initandlisten] options: {}
    2017-12-12T17:44:13.732-0500 I STORAGE  [initandlisten] exception in initAndListen: 29 Data directory /data/db not found., terminating
    2017-12-12T17:44:13.732-0500 I NETWORK  [initandlisten] shutdown: going to close listening sockets...
    2017-12-12T17:44:13.732-0500 I NETWORK  [initandlisten] shutdown: going to flush diaglog...
    2017-12-12T17:44:13.733-0500 I CONTROL  [initandlisten] now exiting
    2017-12-12T17:44:13.733-0500 I CONTROL  [initandlisten] shutting down with code:100


In [12]:
!pip install pymongo



In [13]:
import pymongo

In [14]:
from pymongo import MongoClient

# With no arguments, MongoClient targets a server on localhost at the default
# port 27017 (the port mongod reports in the startup log above).
client = MongoClient()

In [15]:
# From here on `db` is the MongoDB database `word_db`; until now the name
# held the dbm handle opened for 'vowels.dbm'.
# NOTE(review): no db.close() for that dbm handle is visible before this
# rebinding - confirm whether it should be closed first.
db = client.word_db

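### Everything is created lazily

Neither the `word_db` database nor `word_collection` has to be created up front: MongoDB creates both the first time a document is inserted.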

In [28]:
# Collection.insert() is deprecated (and removed in PyMongo 4); insert_many()
# is its replacement for bulk inserts. `.inserted_ids` keeps `results` a list
# of the new documents' _id values, which is what insert() returned.
# Note: re-running this cell inserts every word a second time.
results = db.word_collection.insert_many(
    word_info(word) for word in raw_data
).inserted_ids

  """Entry point for launching an IPython kernel.


In [29]:
# With no filter, find_one() returns a single document from the collection.
db.word_collection.find_one()

{'_id': ObjectId('5a3410c01ce5370fea3bcd00'),
 'length': 1,
 'n_vowels': 0,
 'raw': 'A',
 'vowels': []}

In [30]:
# Ask the server for only ten matches instead of fetching every match and
# slicing the resulting list. A list-valued filter such as ['u'] matches
# documents whose 'vowels' array is exactly that list.
list(db.word_collection.find({'vowels': ['u']}).limit(10))

[{'_id': ObjectId('5a3410c01ce5370fea3bcede'),
  'length': 5,
  'n_vowels': 1,
  'raw': 'Abrus',
  'vowels': ['u']},
 {'_id': ObjectId('5a3410c01ce5370fea3bcf71'),
  'length': 8,
  'n_vowels': 1,
  'raw': 'Absyrtus',
  'vowels': ['u']},
 {'_id': ObjectId('5a3410c01ce5370fea3bcf77'),
  'length': 3,
  'n_vowels': 1,
  'raw': 'Abu',
  'vowels': ['u']},
 {'_id': ObjectId('5a3410c01ce5370fea3bd480'),
  'length': 5,
  'n_vowels': 1,
  'raw': 'Acrux',
  'vowels': ['u']},
 {'_id': ObjectId('5a3410c01ce5370fea3bd60a'),
  'length': 4,
  'n_vowels': 1,
  'raw': 'Addu',
  'vowels': ['u']},
 {'_id': ObjectId('5a3410c01ce5370fea3be066'),
  'length': 5,
  'n_vowels': 1,
  'raw': 'Aldus',
  'vowels': ['u']},
 {'_id': ObjectId('5a3410c01ce5370fea3be0ff'),
  'length': 5,
  'n_vowels': 1,
  'raw': 'Alfur',
  'vowels': ['u']},
 {'_id': ObjectId('5a3410c01ce5370fea3be3d9'),
  'length': 5,
  'n_vowels': 1,
  'raw': 'Alnus',
  'vowels': ['u']},
 {'_id': ObjectId('5a3410c01ce5370fea3be518'),
  'length': 4,
  

In [31]:
# Filter on a scalar field: the first document whose 'n_vowels' equals 0.
db.word_collection.find_one({'n_vowels': 0})

{'_id': ObjectId('5a3410c01ce5370fea3bcd00'),
 'length': 1,
 'n_vowels': 0,
 'raw': 'A',
 'vowels': []}

In [32]:
# Filter on the whole array: one word with exactly this vowel sequence.
db.word_collection.find_one({'vowels': ['i', 'o', 'e', 'a', 'a', 'e']})

{'_id': ObjectId('5a3410c11ce5370fea3d3850'),
 'length': 13,
 'n_vowels': 6,
 'raw': 'inconcealable',
 'vowels': ['i', 'o', 'e', 'a', 'a', 'e']}

In [33]:
# Same filter with find(): every word sharing that vowel sequence.
list(db.word_collection.find({'vowels': ['i', 'o', 'e', 'a', 'a', 'e']}))

[{'_id': ObjectId('5a3410c11ce5370fea3d3850'),
  'length': 13,
  'n_vowels': 6,
  'raw': 'inconcealable',
  'vowels': ['i', 'o', 'e', 'a', 'a', 'e']},
 {'_id': ObjectId('5a3410c11ce5370fea3d3873'),
  'length': 13,
  'n_vowels': 6,
  'raw': 'incongealable',
  'vowels': ['i', 'o', 'e', 'a', 'a', 'e']}]

In [34]:
# Collection.count() is deprecated (and removed in PyMongo 4). For an
# unfiltered count its replacement is estimated_document_count(), which
# reads the total from collection metadata.
db.word_collection.estimated_document_count()

235886

In [35]:
# Cursor.count() is deprecated (and removed in PyMongo 4); count_documents()
# with an empty filter counts every document in the collection.
db.word_collection.count_documents({})

235886

In [36]:
# '$gt' is MongoDB's greater-than operator: words with more than ten vowels.
list(db.word_collection.find({'n_vowels': {'$gt': 10}}))

[{'_id': ObjectId('5a3410c01ce5370fea3be757'),
  'length': 21,
  'n_vowels': 11,
  'raw': 'aminoacetophenetidine',
  'vowels': ['a', 'i', 'o', 'a', 'e', 'o', 'e', 'e', 'i', 'i', 'e']},
 {'_id': ObjectId('5a3410c11ce5370fea3de4fe'),
  'length': 20,
  'n_vowels': 11,
  'raw': 'palaeometeorological',
  'vowels': ['a', 'a', 'e', 'o', 'e', 'e', 'o', 'o', 'o', 'i', 'a']},
 {'_id': ObjectId('5a3410c11ce5370fea3df976'),
  'length': 23,
  'n_vowels': 11,
  'raw': 'pericardiomediastinitis',
  'vowels': ['e', 'i', 'a', 'i', 'o', 'e', 'i', 'a', 'i', 'i', 'i']},
 {'_id': ObjectId('5a3410c31ce5370fea3f6512'),
  'length': 22,
  'n_vowels': 11,
  'raw': 'zoologicoarchaeologist',
  'vowels': ['o', 'o', 'o', 'i', 'o', 'a', 'a', 'e', 'o', 'o', 'i']}]

In [37]:
%time
db.word_collection.find({'vowels': ['u', 'o']}).count()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs


517

In [38]:
# Build the index on `word_collection`, the collection the surrounding
# queries run against, so the 'vowels' lookup in the next cell can use it.
result = db.word_collection.create_index([('vowels', pymongo.ASCENDING)],
                                         unique=False)

In [39]:
%time
db.word_collection.find({'vowels': ['e', 'e']}).count()

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


2020

In [None]:
# Clean up: drop the collection that the word documents were inserted into.
db.drop_collection('word_collection')

# Graph databases: Neo4j

    % brew install neo4j
    % neo4j start
    % open http://localhost:7474