In [1]:
# Treating a pandas dataframe as a SQL table
#
# Objectives: 
#    Use the pandasql module to run queries on pandas dataframes

In [2]:
import pandas as pd
import pandasql
from pandasql import sqldf

In [4]:
def pysqldf(q):
    """Run the SQL statement ``q`` via ``sqldf`` against this notebook's globals.

    A short helper so you don't have to specify locals or globals every time
    you run a query: the module-level namespace is always passed as the
    environment in which ``sqldf`` looks up table names.

    Parameters
    ----------
    q : str
        The SQL statement to execute.

    Returns
    -------
    Whatever ``sqldf(q, globals())`` returns (in this notebook, a dataframe).
    """
    # A named function (rather than a lambda bound to a name) shows up as
    # ``pysqldf`` in tracebacks and can carry this docstring.
    return sqldf(q, globals())

In [6]:
# Read data/species.csv into a dataframe; the variable name doubles as the
# table name in the SQL queries below.
species_df = pd.read_csv(filepath_or_buffer="data/species.csv")

In [7]:
# You can now run SQL statements that use a dataframe's variable name as the table name, without having to create a database.

In [8]:
# Preview: every column of species_df, limited to the first 10 rows.
pysqldf(
    q="SELECT * FROM species_df LIMIT 10"
)

Unnamed: 0,species_id,genus,species,taxa
0,AB,Amphispiza,bilineata,Bird
1,AH,Ammospermophilus,harrisi,Rodent
2,AS,Ammodramus,savannarum,Bird
3,BA,Baiomys,taylori,Rodent
4,CB,Campylorhynchus,brunneicapillus,Bird
5,CM,Calamospiza,melanocorys,Bird
6,CQ,Callipepla,squamata,Bird
7,CS,Crotalus,scutalatus,Reptile
8,CT,Cnemidophorus,tigris,Reptile
9,CU,Cnemidophorus,uniparens,Reptile


In [None]:
# Read data/surveys.csv into a second dataframe.
surveys_df = pd.read_csv(filepath_or_buffer="data/surveys.csv")

In [9]:
# The result of a pandasql query on a dataframe is itself a dataframe,
# so you can switch back and forth between SQL queries and pandas operations.

In [15]:
# Keep only the rows of species_df whose taxa column equals 'Bird'.
df_taxa = pysqldf(
    q="SELECT * FROM species_df WHERE taxa = 'Bird'"
)

In [17]:
# You may want to replace the default integer index with a more meaningful one (here, the species_id column).

In [21]:
# Use species_id as the index instead of the default integer one.
# Rebinding the name (rather than inplace=True) together with the guard keeps
# this cell safe to re-run: a second in-place call would raise KeyError
# because species_id is no longer a column once it has become the index.
if df_taxa.index.name != 'species_id':
    df_taxa = df_taxa.set_index('species_id')

In [22]:
# Show the filtered frame (taxa = 'Bird'), now indexed by species_id.
df_taxa

Unnamed: 0_level_0,genus,species,taxa
species_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AB,Amphispiza,bilineata,Bird
AS,Ammodramus,savannarum,Bird
CB,Campylorhynchus,brunneicapillus,Bird
CM,Calamospiza,melanocorys,Bird
CQ,Callipepla,squamata,Bird
PC,Pipilo,chlorurus,Bird
PG,Pooecetes,gramineus,Bird
PU,Pipilo,fuscus,Bird
SB,Spizella,breweri,Bird
UP,Pipilo,sp.,Bird
