In [1]:
from math import pi
import pandas as pd
import pandas.io.sql as psql
import numpy as np
import sqlite3 as sql
import blaze as bz
from odo import odo
from scipy.signal import savgol_filter
from bokeh.plotting import figure, output_notebook, show, ColumnDataSource, vplot, output_server, hplot
from bokeh.models import CustomJS, VBox, HBox, Select, MultiSelect
from bokeh.io import output_file, show, vform, curdoc
from bokeh.charts import BoxPlot
from bokeh.palettes import RdPu9
output_notebook()

# Inserting records into MongoDB collection

In [115]:
import json
from pprint import pprint

# Parse the whole BrainSpan JSON export into memory. `data` is what the
# record-by-record insert loop further down iterates over.
with open('../brainspan-data/brainspan.json') as data_file:    
    data = json.load(data_file)

In [2]:
from pymongo import MongoClient
# Connect to the MongoDB server at localhost:27017 and select the
# `brainspan` database; the queries below all go through `db`.
client = MongoClient("mongodb://localhost:27017")
db = client.brainspan

In [3]:
# Count every document in the `brainspan2` collection (unfiltered find()).
# NOTE(review): Cursor.count() is deprecated in newer PyMongo releases in
# favour of Collection.count_documents({}) - confirm the installed version
# before changing it.
num_records = db.brainspan2.find().count()
num_records

27445024

In [None]:
# Build an ascending index on `gene` in the `brainspan2` collection.
# create_index with a list of (key, direction) pairs works on every PyMongo
# release; ensure_index is removed in PyMongo 4 and rejects a dict key spec
# on older releases.
# NOTE(review): the queries below read from `db.brainspan`, not `brainspan2`
# - confirm this is the collection that should be indexed.
db.brainspan2.create_index([("gene", 1)])

In [None]:
# Load the records into MongoDB.
# Option 1: insert the parsed JSON documents one at a time through PyMongo
# (one insert call per document).
for r in data:
    db.brainspan.insert_one(r)
# or, much faster
# Option 2: bulk-load the JSON array with the mongoimport command-line tool.
# NOTE(review): the loop above writes to the `brainspan` collection, whereas
# mongoimport targets `brainspan2` and reads `brainspan.json` from the
# current directory rather than ../brainspan-data/ - confirm which
# collection and path are intended, and presumably run only one of the two.
! mongoimport -d brainspan -c brainspan2 --jsonArray --file brainspan.json 

In [None]:
# Re-display the count stored by the earlier counting cell. This is not a
# fresh query, so it does not reflect any documents inserted since then.
num_records

In [42]:
# Cursor over every document in the `brainspan` collection whose gene is MARCH7.
cursor = db.brainspan.find({"gene":"MARCH7"})

In [48]:
# Peek at the matching document at 0-based position 523.
# NOTE(review): the recorded output for this cell is a bare structure-name
# string rather than a whole document, so the cell was presumably edited
# after it was last run - confirm and re-run.
cursor[523]

u'primary somatosensory cortex (area S1, areas 3,1,2)'

In [49]:
# Materialise every document from the cursor into a DataFrame
# (one row per document).
df = pd.DataFrame(list(cursor))

In [93]:
# Fetch a single document with age "13 pcw" whose gene is either WDR4 or WDR7.
db.brainspan.find_one({"age": "13 pcw", "gene": {"$in": ["WDR4","WDR7"]}})

{u'_id': ObjectId('56de2aa2c5e57333912619e4'),
 u'age': u'13 pcw',
 u'gene': u'WDR7',
 u'rpkm': 10.008071,
 u'structure_name': u'striatum'}

In [99]:
# Count the MARCH7 documents restricted to the brain structures of interest.
# NOTE(review): `structures` is only defined in a later cell, so on a fresh
# kernel run top-to-bottom this cell raises NameError; the defining cell has
# to be executed first.
# NOTE(review): Cursor.count() is deprecated in newer PyMongo releases -
# confirm the installed version.
gene = "MARCH7"
query = {"gene":gene, "structure_name": {"$in": structures}}
cursor = db.brainspan.find(query)
cursor.count()

82

In [108]:
# Brain structures used to filter the expression queries (`structure_name`).
structures = [
    u'occipital neocortex',
    u'primary motor-sensory cortex (samples)',
    u'amygdaloid complex',
    u'medial ganglionic eminence',
    u'posterior (caudal) superior temporal cortex (area 22c)',
    u'upper (rostral) rhombic lip',
    u'caudal ganglionic eminence',
    u'dorsal thalamus',
]

In [110]:
# Age labels in chronological order (post-conception weeks, months, years).
categories = [u'8 pcw', u'9 pcw', u'12 pcw', u'13 pcw', u'16 pcw', u'17 pcw',
              u'19 pcw', u'21 pcw', u'24 pcw', u'25 pcw', u'26 pcw', u'35 pcw',
              u'37 pcw', u'4 mos', u'10 mos', u'1 yrs', u'2 yrs', u'3 yrs',
              u'4 yrs', u'8 yrs', u'11 yrs', u'13 yrs', u'15 yrs', u'18 yrs',
              u'19 yrs', u'21 yrs', u'23 yrs', u'30 yrs', u'36 yrs', u'37 yrs',
              u'40 yrs']
# Maps each age label to its chronological position; used as a sort key
# because the labels do not sort correctly as plain strings.
sorterIndex = {age: position for position, age in enumerate(categories)}

# Polynomial order used by the Savitzky-Golay smoother.
_SMOOTH_POLYORDER = 3


def _smoothing_window(requested, n_points, polyorder=_SMOOTH_POLYORDER):
    """Return a valid odd Savitzky-Golay window, or None if none exists.

    The window must be odd, strictly greater than `polyorder`, and no larger
    than the number of points being smoothed.
    """
    window = requested if requested % 2 else requested + 1
    smallest = polyorder + 1 if polyorder % 2 == 0 else polyorder + 2
    largest = n_points if n_points % 2 else n_points - 1
    window = min(max(window, smallest), largest)
    return window if window > polyorder else None


def get_dataframes(gene, structures):
    """Fetch expression records for one gene and summarise them by age.

    Parameters
    ----------
    gene : str
        Gene symbol matched against the `gene` field.
    structures : iterable of str
        Structure names matched against the `structure_name` field.

    Returns
    -------
    df : pandas.DataFrame
        One row per matching document from `db.brainspan`.
    df_line : pandas.DataFrame
        One row per age, ordered chronologically via `sorterIndex`, with
        columns `age`, `rpkm` (mean over the matching documents) and
        `rpkm_smooth` (Savitzky-Golay smoothed `rpkm`).

    Raises
    ------
    KeyError
        If the query matches no documents.
    """
    structures = list(structures)
    query = {"gene": gene, "structure_name": {"$in": structures}}
    df = pd.DataFrame(list(db.brainspan.find(query)))
    if df.empty:
        raise KeyError(
            "no brainspan records for gene %r in the requested structures"
            % (gene,))

    # groupby keeps working on every pandas version, whereas
    # pivot_table(...).to_frame() breaks once pivot_table returns a DataFrame.
    # sort=False keeps ages in order of first appearance before ranking.
    df_line = df.groupby('age', sort=False)['rpkm'].mean().reset_index()
    df_line['rank'] = df_line['age'].map(sorterIndex)
    df_line = df_line.sort_values(by='rank', ascending=True).drop('rank', axis=1)

    # The requested window follows the number of structures (rounded up to an
    # odd value), clamped to what savgol_filter accepts for this many ages.
    window = _smoothing_window(len(structures), len(df_line))
    raw = df_line['rpkm'].values.astype(float)
    if window is None:
        # Too few ages for a cubic fit: leave the series unsmoothed.
        df_line['rpkm_smooth'] = raw
    else:
        df_line['rpkm_smooth'] = savgol_filter(raw, window, _SMOOTH_POLYORDER)

    return df, df_line

In [111]:
# get_dataframes takes both the gene and the list of structures to query;
# calling it with the gene alone raises TypeError.
df, df_line = get_dataframes(gene, structures)

In [112]:
df_line

Unnamed: 0,age,rpkm,rpkm_smooth
0,8 pcw,24.131882,23.186397
1,9 pcw,23.190164,26.342021
2,12 pcw,28.487062,25.782786
3,13 pcw,23.622567,22.827994
4,16 pcw,17.211717,18.796946
5,17 pcw,16.686467,15.522231
6,19 pcw,8.467642,15.105746
7,21 pcw,17.679492,16.434608
8,24 pcw,16.986846,17.964237
9,26 pcw,25.998723,17.131682
