In [18]:
import pandas as pd
import pandasql

In [19]:
from pandasql import sqldf
def pysqldf(q):
    """Run the SQL query string ``q`` through ``sqldf`` and return its result.

    The query is resolved against this module's ``globals()``, so a DataFrame
    bound to a global name (e.g. ``species_df``) can be used as a table name.
    """
    return sqldf(q, globals())

In [20]:
# Load the species table from a CSV file (path is relative to the working directory).
species_df = pd.read_csv("data/species.csv")

In [22]:
# Preview the first 10 rows of species_df through SQL.
pysqldf("SELECT * FROM species_df LIMIT 10")

Unnamed: 0,species_id,genus,species,taxa
0,AB,Amphispiza,bilineata,Bird
1,AH,Ammospermophilus,harrisi,Rodent
2,AS,Ammodramus,savannarum,Bird
3,BA,Baiomys,taylori,Rodent
4,CB,Campylorhynchus,brunneicapillus,Bird
5,CM,Calamospiza,melanocorys,Bird
6,CQ,Callipepla,squamata,Bird
7,CS,Crotalus,scutalatus,Reptile
8,CT,Cnemidophorus,tigris,Reptile
9,CU,Cnemidophorus,uniparens,Reptile


In [23]:
# Load the survey records from a CSV file (path is relative to the working directory).
surveys_df = pd.read_csv('data/surveys.csv')

In [24]:
# SQL filter: every row of species_df whose taxa column equals 'Bird'.
pysqldf("SELECT * FROM species_df WHERE taxa = 'Bird'")

Unnamed: 0,species_id,genus,species,taxa
0,AB,Amphispiza,bilineata,Bird
1,AS,Ammodramus,savannarum,Bird
2,CB,Campylorhynchus,brunneicapillus,Bird
3,CM,Calamospiza,melanocorys,Bird
4,CQ,Callipepla,squamata,Bird
5,PC,Pipilo,chlorurus,Bird
6,PG,Pooecetes,gramineus,Bird
7,PU,Pipilo,fuscus,Bird
8,SB,Spizella,breweri,Bird
9,UP,Pipilo,sp.,Bird


In [29]:
# pandas version of the 'Bird' filter using a boolean mask.
# reset_index() renumbers the rows from 0 and keeps the old row labels in a
# new 'index' column; pass drop=True to discard them instead.
species_df[species_df['taxa'] == 'Bird'].reset_index()

Unnamed: 0,index,species_id,genus,species,taxa
0,0,AB,Amphispiza,bilineata,Bird
1,2,AS,Ammodramus,savannarum,Bird
2,4,CB,Campylorhynchus,brunneicapillus,Bird
3,5,CM,Calamospiza,melanocorys,Bird
4,6,CQ,Callipepla,squamata,Bird
5,23,PC,Pipilo,chlorurus,Bird
6,26,PG,Pooecetes,gramineus,Bird
7,32,PU,Pipilo,fuscus,Bird
8,39,SB,Spizella,breweri,Bird
9,49,UP,Pipilo,sp.,Bird


In [37]:
# rows of species_df whose taxa is 'Rabbit' (this filters; it does not list the distinct taxa)
species_df[species_df['taxa'] == 'Rabbit']

Unnamed: 0,species_id,genus,species,taxa
38,SA,Sylvilagus,audubonii,Rabbit


In [41]:
# Distinct species_id values among survey rows with hindfoot_length above 40.
pysqldf("SELECT DISTINCT species_id FROM surveys_df WHERE hindfoot_length > 40")

Unnamed: 0,species_id
0,DS
1,DM
2,DO
3,NL
4,PP
5,OT
6,PB


In [48]:
# pandas version: take the species_id column, keep the entries whose row has
# hindfoot_length above 40, then de-duplicate.
surveys_df['species_id'][surveys_df['hindfoot_length'] > 40].unique()

array(['DS', 'DM', 'DO', 'NL', 'PP', 'OT', 'PB'], dtype=object)

In [60]:
# Mean hindfoot_length per species_id, computed on that one column directly
# rather than running describe() over every column and keeping a single value.
# rename('mean') keeps the Series name the describe()-based lookup produced.
df_mean = surveys_df.groupby('species_id')['hindfoot_length'].mean().rename('mean')
# Keep only the species whose mean exceeds 20.
df_mean[df_mean > 20]

species_id
AH    33.000000
DM    35.982351
DO    35.607551
DS    49.948874
NL    32.294227
OL    20.532609
OT    20.267415
PB    26.115922
PE    20.195545
PH    25.774194
PI    22.222222
PL    20.027778
PM    20.426210
PP    21.751569
SF    26.707317
SH    28.549618
SO    25.658537
Name: mean, dtype: float64

In [67]:
# Per-species average hindfoot_length via SQL; HAVING filters on the
# aggregated value, so only averages above 20 are returned.
pysqldf(
    "SELECT species_id, AVG(hindfoot_length) as avg_len "
    "    FROM surveys_df "
    "    GROUP BY species_id "
    "    HAVING avg_len > 20"
)

Unnamed: 0,species_id,avg_len
0,AH,33.0
1,DM,35.982351
2,DO,35.607551
3,DS,49.948874
4,NL,32.294227
5,OL,20.532609
6,OT,20.267415
7,PB,26.115922
8,PE,20.195545
9,PH,25.774194


In [70]:
# Group the survey rows by the value of their 'year' column; the resulting
# GroupBy object is reused by the cells below.
survey_by_year = surveys_df.groupby('year')

In [74]:
# You can convert the GroupBy to a list to view the grouping: each element is a
# (year, DataFrame of that year's rows) tuple. This materializes every group,
# so the displayed output is long.
list(survey_by_year)

[(1977,
       record_id  month  day  year  plot_id species_id  sex  hindfoot_length  \
  0            1      7   16  1977        2         NL    M             32.0   
  1            2      7   16  1977        3         NL    M             33.0   
  2            3      7   16  1977        2         DM    F             37.0   
  3            4      7   16  1977        7         DM    M             36.0   
  4            5      7   16  1977        3         DM    M             35.0   
  5            6      7   16  1977        1         PF    M             14.0   
  6            7      7   16  1977        2         PE    F              NaN   
  7            8      7   16  1977        1         DM    M             37.0   
  8            9      7   16  1977        1         DM    F             34.0   
  9           10      7   16  1977        6         PF    F             20.0   
  10          11      7   16  1977        5         DS    F             53.0   
  11          12      7   16  19

In [81]:
# Are hindfoot lengths growing overall over time?
# Select the column before aggregating: calling median() on the whole group
# also aggregates the string columns (species_id, sex), which is wasted work
# and raises TypeError on pandas >= 2.0, where numeric_only defaults to False.
survey_by_year['hindfoot_length'].median()

year
1977    36.0
1978    37.0
1979    36.0
1980    36.0
1981    36.0
1982    35.0
1983    35.0
1984    35.0
1985    35.0
1986    35.0
1987    34.0
1988    27.0
1989    21.0
1990    34.0
1991    22.0
1992    23.0
1993    25.0
1994    36.0
1995    23.0
1996    23.0
1997    23.0
1998    26.0
1999    26.0
2000    25.0
2001    26.0
2002    26.0
Name: hindfoot_length, dtype: float64

In [82]:
# Mean hindfoot_length per year. The column is selected before aggregating so
# the string columns (species_id, sex) are never averaged; on pandas >= 2.0
# that would raise TypeError because numeric_only defaults to False.
survey_by_year['hindfoot_length'].mean()

year
1977    36.276961
1978    37.278880
1979    35.167219
1980    35.082035
1981    35.538830
1982    31.710897
1983    32.390635
1984    31.929385
1985    32.617844
1986    33.057444
1987    29.553377
1988    28.190691
1989    25.792094
1990    28.408658
1991    25.953202
1992    26.859956
1993    27.285935
1994    29.757194
1995    26.915146
1996    26.328422
1997    26.177299
1998    28.202515
1999    28.322160
2000    26.130375
2001    27.140363
2002    27.242353
Name: hindfoot_length, dtype: float64

In [83]:
# SQL version: average hindfoot_length per year. The expression has no alias,
# so the result column is named "AVG(hindfoot_length)".
pysqldf("SELECT year, AVG(hindfoot_length) FROM surveys_df GROUP BY year")

Unnamed: 0,year,AVG(hindfoot_length)
0,1977,36.276961
1,1978,37.27888
2,1979,35.167219
3,1980,35.082035
4,1981,35.53883
5,1982,31.710897
6,1983,32.390635
7,1984,31.929385
8,1985,32.617844
9,1986,33.057444


In [85]:
# Let's do the same thing to track worldwide GDP growth. Can you do it by year, and by continent?

In [None]:
# TODO: work the GDP exercise here (a stray, undefined name `d` was removed; it raised NameError).