In [1]:
import random
import pandas as pd
import numpy as np
import sys
sys.path.append('..')
import doctable as dt

In [2]:
# Table schema: one (column name, type[, column options]) tuple per column.
schema = (
    ('id','integer',dict(primary_key=True, autoincrement=True)),
    ('name','string', dict(nullable=False)),
    # NOTE(review): declared 'integer', yet the insert cell stores random.random()
    # floats and the selects return them as floats - confirm whether a float
    # column type was intended.
    ('age','integer'),
)
db = dt.DocTable2(schema, tabname='mydocuments', verbose=True)
# author's note on constructor defaults: fname=':memory:', engine='sqlite', persistent_conn=True, new_db=True
# fname=':memory:' is special - it loads database into memory
# verbose=True appears to be what echoes each query ("DocTable2 Query: ...") in the cell outputs
print(db)

DocTable2 Query: SELECT count() AS count_1 
FROM mydocuments
 LIMIT :param_1
<DocTable2::mydocuments ct: 0>


In [3]:
# Populate the table with N demo rows named user_0 .. user_{N-1}.
# Fixed seed so that Restart & Run All produces the same ages on every run.
random.seed(0)
N = 10
for i in range(N):
    # verbose=False silences the per-insert query echo (the table itself was
    # created with verbose=True)
    db.insert({'name':'user_'+str(i), 'age':random.random()}, verbose=False)
print(db)

DocTable2 Query: SELECT count() AS count_1 
FROM mydocuments
 LIMIT :param_1
<DocTable2::mydocuments ct: 10>


In [None]:
## Notes on DB Interface
Just a few notes before looking at selects.

In [92]:
# subscripting the table by column name returns the underlying SQLAlchemy Column object
db['id']

Column('id', Integer(), table=<mydocuments>, primary_key=True, nullable=False)

In [93]:
# conditionals are applied directly to the column objects (as we'll see with "where" clause);
# the comparison is not evaluated to True/False - it builds a SQLAlchemy expression object
db['id'] < 3

<sqlalchemy.sql.elements.BinaryExpression object at 0x7f3dc74a8f60>

In [95]:
# can also access the same column using the .col() method
db.col('id')

Column('id', Integer(), table=<mydocuments>, primary_key=True, nullable=False)

In [96]:
# to access all column objects at once (only useful for working directly with sql info)
db.columns

<sqlalchemy.sql.base.ImmutableColumnCollection at 0x7f3dc772f828>

In [48]:
# to access more detailed schema information (displayed as a dict keyed by column name)
db.schemainfo

{'id': {'name': 'id',
  'type': Integer(),
  'comment': None,
  'constraints': set(),
  'expression': Column('id', Integer(), table=<mydocuments>, primary_key=True, nullable=False),
  'foreign_keys': set(),
  'index': None,
  'nullable': False,
  'primary_key': True,
  'onupdate': None,
  'default': None},
 'name': {'name': 'name',
  'type': String(),
  'comment': None,
  'constraints': set(),
  'expression': Column('name', String(), table=<mydocuments>, nullable=False),
  'foreign_keys': set(),
  'index': None,
  'nullable': False,
  'primary_key': False,
  'onupdate': None,
  'default': None},
 'age': {'name': 'age',
  'type': Integer(),
  'comment': None,
  'constraints': set(),
  'expression': Column('age', Integer(), table=<mydocuments>),
  'foreign_keys': set(),
  'index': None,
  'nullable': True,
  'primary_key': False,
  'onupdate': None,
  'default': None}}

## Regular Selects
These functions all return lists of ResultProxy objects. As such, they can be accessed using numerical indices or keyword indices. For instance, if one select output row is ```row=(1, 'user_0')``` (after selecting "id" and "name"), it can be accessed such that ```row[0]==row['id']``` and ```row[1]==row['name']```.

In [4]:
# with no column argument every column is selected; limit caps the number of rows
db.select(limit=2)

DocTable2 Query: SELECT mydocuments.id, mydocuments.name, mydocuments.age 
FROM mydocuments
 LIMIT :param_1


[(1, 'user_0', 0.28558273804205103), (2, 'user_1', 0.34999448378909426)]

In [5]:
# pass a list of column objects to select only those columns
db.select([db['id'],db['name']], limit=1)

DocTable2 Query: SELECT mydocuments.id, mydocuments.name 
FROM mydocuments
 LIMIT :param_1


[(1, 'user_0')]

In [6]:
# select_first returns a single row rather than a list of rows
db.select_first()

DocTable2 Query: SELECT mydocuments.id, mydocuments.name, mydocuments.age 
FROM mydocuments
 LIMIT :param_1


(1, 'user_0', 0.28558273804205103)

In [7]:
# a single column passed without a list yields a flat list of values instead of rows
db.select(db['name'],limit=5)

DocTable2 Query: SELECT mydocuments.name 
FROM mydocuments
 LIMIT :param_1


['user_0', 'user_1', 'user_2', 'user_3', 'user_4']

In [8]:
# single column + select_first: the result is one bare value
db.select_first(db['age'])

DocTable2 Query: SELECT mydocuments.age 
FROM mydocuments
 LIMIT :param_1


0.28558273804205103

## Conditional Selects

In [9]:
# a comparison on a column object becomes the WHERE clause; the value is passed as a bound parameter
db.select(where=db['id']==2)

DocTable2 Query: SELECT mydocuments.id, mydocuments.name, mydocuments.age 
FROM mydocuments 
WHERE mydocuments.id = :id_1


[(2, 'user_1', 0.34999448378909426)]

In [10]:
# other comparison operators work the same way
db.select(where=db['id']<3)

DocTable2 Query: SELECT mydocuments.id, mydocuments.name, mydocuments.age 
FROM mydocuments 
WHERE mydocuments.id < :id_1


[(1, 'user_0', 0.28558273804205103), (2, 'user_1', 0.34999448378909426)]

In [11]:
# note the parentheses: in Python "&" binds tighter than comparison operators,
# so each condition must be wrapped before combining with the overloaded bitwise ops
db.select(where= (db['id']>=2) & (db['id']<=4) & (db['name']!='user_2'))

DocTable2 Query: SELECT mydocuments.id, mydocuments.name, mydocuments.age 
FROM mydocuments 
WHERE mydocuments.id >= :id_1 AND mydocuments.id <= :id_2 AND mydocuments.name != :name_1


[(2, 'user_1', 0.34999448378909426), (4, 'user_3', 0.8064246411425373)]

In [12]:
# .in_() tests membership in a collection of values (rendered as SQL "IN")
db.select(where=db['name'].in_(('user_2','user_3')))

DocTable2 Query: SELECT mydocuments.id, mydocuments.name, mydocuments.age 
FROM mydocuments 
WHERE mydocuments.name IN (:name_1, :name_2)


[(3, 'user_2', 0.8164593883769352), (4, 'user_3', 0.8064246411425373)]

In [18]:
# .between() is inclusive on both ends: ids 2, 3 and 4 are returned
db.select(where=db['id'].between(2,4))

DocTable2 Query: SELECT mydocuments.id, mydocuments.name, mydocuments.age 
FROM mydocuments 
WHERE mydocuments.id BETWEEN :id_1 AND :id_2


[(2, 'user_1', 0.34999448378909426),
 (3, 'user_2', 0.8164593883769352),
 (4, 'user_3', 0.8064246411425373)]

In [19]:
# use of logical not operator "~": here it turns the IN condition into NOT IN
db.select(where= ~(db['name'].in_(('user_2','user_3'))) & (db['id'] < 4))

DocTable2 Query: SELECT mydocuments.id, mydocuments.name, mydocuments.age 
FROM mydocuments 
WHERE mydocuments.name NOT IN (:name_1, :name_2) AND mydocuments.id < :id_1


[(1, 'user_0', 0.28558273804205103), (2, 'user_1', 0.34999448378909426)]

In [20]:
# more verbose operators .and_, .or_, and .not_ are bound to the doctable package.
# and_/or_ take the conditions as separate arguments, so no extra parentheses
# or bitwise operators are needed; this is equivalent to ~(id==4) & (id<=2)
db.select(where= dt.and_(dt.not_(db['id']==4), db['id'] <= 2))

DocTable2 Query: SELECT mydocuments.id, mydocuments.name, mydocuments.age 
FROM mydocuments 
WHERE mydocuments.id != :id_1 AND mydocuments.id <= :id_2


[(1, 'user_0', 0.28558273804205103), (2, 'user_1', 0.34999448378909426)]

In [21]:
# now with simple computation:
# first query pulls every age as a flat list so the mean can be computed in Python,
ages = db.select(db['age'])
mean_age = sum(ages)/len(ages)
# second query uses that Python value as the bound parameter of the WHERE clause
db.select(db['name'], where=db['age']>mean_age, limit=2)

DocTable2 Query: SELECT mydocuments.age 
FROM mydocuments
DocTable2 Query: SELECT mydocuments.name 
FROM mydocuments 
WHERE mydocuments.age > :age_1
 LIMIT :param_1


['user_2', 'user_3']

In [29]:
# apply .label() method to columns: renames them in the result (SQL "AS"),
# and dict() on the returned row then uses the labels as keys
dict(db.select_first([db['age'].label('myage'), db['name'].label('myname')]))

DocTable2 Query: SELECT mydocuments.age AS myage, mydocuments.name AS myname 
FROM mydocuments
 LIMIT :param_1


{'myage': 0.28558273804205103, 'myname': 'user_0'}

## Column Operators
I bind the .min, .max, .count, .sum, and .mode methods to the column objects. Additionally, I move the .count method to a separate DocTable2 method.

In [44]:
# .sum and .count are accessed as attributes (no call parentheses) and can be
# mixed with plain columns in the same select
db.select_first([db['age'].sum, db['age'].count, db['age']])

DocTable2 Query: SELECT sum(mydocuments.age) AS sum_1, count(mydocuments.age) AS count_1, mydocuments.age 
FROM mydocuments
 LIMIT :param_1


(4.904853004729304, 10, 0.28558273804205103)

In [45]:
# with labels now: replaces the auto-generated names (sum_1, count_1) with readable keys
dict(db.select_first([db['age'].sum.label('sum'), db['age'].count.label('ct')]))

DocTable2 Query: SELECT sum(mydocuments.age) AS sum, count(mydocuments.age) AS ct 
FROM mydocuments
 LIMIT :param_1


{'sum': 4.904853004729304, 'ct': 10}

## Select as Pandas Series and DataFrame
These are especially useful when working with metadata because Pandas provides more robust descriptive and plotting features than SQL alone. Good for generating sample information.

In [54]:
# select_series: must provide only a single column; the result supports Series methods such as .head()
db.select_series(db['age']).head(2)

DocTable2 Query: SELECT mydocuments.age 
FROM mydocuments


0    0.285583
1    0.349994
dtype: float64

In [59]:
# quantiles computed by pandas on the selected column
# NOTE(review): 0.025 / 0.985 is asymmetric - 0.975 may have been intended
# for a central 95% range; confirm
db.select_series(db['age']).quantile([0.025, 0.985])

DocTable2 Query: SELECT mydocuments.age 
FROM mydocuments


0.025    0.167779
0.985    0.921761
dtype: float64

In [60]:
# select_df: the selected columns become the DataFrame columns
db.select_df([db['id'],db['age']]).head(2)

DocTable2 Query: SELECT mydocuments.id, mydocuments.age 
FROM mydocuments


Unnamed: 0,id,age
0,1,0.285583
1,2,0.349994


In [64]:
# select_df must be given a list of cols (even for one col);
# .corr() then gives the pairwise correlation between the selected columns
db.select_df([db['id'],db['age']]).corr()

DocTable2 Query: SELECT mydocuments.id, mydocuments.age 
FROM mydocuments


Unnamed: 0,id,age
id,1.0,0.074848
age,0.074848,1.0


In [63]:
# summary statistics per column; .T puts one column per row for easier reading
db.select_df([db['id'],db['age']]).describe().T

DocTable2 Query: SELECT mydocuments.id, mydocuments.age 
FROM mydocuments


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,10.0,5.5,3.02765,1.0,3.25,5.5,7.75,10.0
age,10.0,0.490485,0.267332,0.133578,0.327476,0.402144,0.717409,0.938195


In [90]:
# split the rows into "above mean age" / "not above mean age" and describe each group
mean_age = db.select_series(db['age']).mean()
df = db.select_df([db['id'], db['age']])
# boolean flag column: True where the row's age exceeds the overall mean
df = df.assign(old_grp=df['age'].gt(mean_age))
df.groupby('old_grp').describe()

DocTable2 Query: SELECT mydocuments.age 
FROM mydocuments
DocTable2 Query: SELECT mydocuments.id, mydocuments.age 
FROM mydocuments


Unnamed: 0_level_0,id,id,id,id,id,id,id,id,age,age,age,age,age,age,age,age
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
old_grp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
False,7.0,5.714286,3.352327,1.0,3.5,6.0,8.0,10.0,7.0,0.334825,0.105723,0.133578,0.302777,0.349994,0.402144,0.450361
True,3.0,5.0,2.645751,3.0,3.5,4.0,6.0,8.0,3.0,0.853693,0.073353,0.806425,0.811442,0.816459,0.877327,0.938195


In [89]:
# more complicated groupby aggregation.
# calculates the variance of age separately for entries above and below the mean age
mean_age = db.select_series(db['age']).mean()
df = db.select_df([db['name'],db['age']])
df['old_grp'] = df['age']>mean_age
df.groupby('old_grp').agg(**{
    'first_name':pd.NamedAgg(column='name', aggfunc='first'),
    # the string 'var' selects pandas' groupby variance (sample variance, ddof=1)
    # explicitly; passing np.var relied on pandas silently substituting it, which
    # newer pandas warns about and which would otherwise mean ddof=0
    'var_age':pd.NamedAgg(column='age', aggfunc='var'),
})

DocTable2 Query: SELECT mydocuments.age 
FROM mydocuments
DocTable2 Query: SELECT mydocuments.name, mydocuments.age 
FROM mydocuments


Unnamed: 0_level_0,first_name,var_age
old_grp,Unnamed: 1_level_1,Unnamed: 2_level_1
False,user_0,0.011177
True,user_2,0.005381


## Count Method and Get Next ID
```.count()``` is a convenience method. Mostly the same could be accomplished by ```db.select_first(db['id'].count)```, but this requires no reference to a specific column.

```.next_id()``` is especially useful if one hopes to enter the id (or any primary key column) into new rows manually, because SQL engines don't provide new ids except when a single insert is performed.

In [46]:
# number of rows in the table; no column reference needed
db.count()

DocTable2 Query: SELECT count() AS count_1 
FROM mydocuments
 LIMIT :param_1


10

In [47]:
# an optional condition restricts the count (rendered as a WHERE clause)
db.count(db['age'] < 0.5)

DocTable2 Query: SELECT count() AS count_1 
FROM mydocuments 
WHERE mydocuments.age < :age_1
 LIMIT :param_1


7

In [49]:
# queries max(id); with ids 1..10 present the next id is 11
db.next_id()

DocTable2 Query: SELECT max(mydocuments.id) AS max_1 
FROM mydocuments
 LIMIT :param_1


11

In [51]:
# weird (but possible) with 'age' because it's not an actual primary key;
# the result is simply max(age) + 1
db.next_id(idcol='age')

DocTable2 Query: SELECT max(mydocuments.age) AS max_1 
FROM mydocuments
 LIMIT :param_1


1.9381948101512547