In [2]:
# Merging, sorting, and querying using Pandas
#
# Objectives: 
#    Use pandas operations to perform common SQL operations

In [6]:
import pandas as pd

In [21]:
# Load the two CSV tables used throughout: the species lookup table and
# the survey records that reference it through species_id.
species_df = pd.read_csv(filepath_or_buffer="data/species.csv")
surveys_df = pd.read_csv(filepath_or_buffer="data/surveys.csv")

In [8]:
# you can get the head and tail, similar to the LIMIT clause in SQL

In [37]:
# First five rows (n=5 is the default), comparable to SQL's LIMIT 5.
surveys_df.head(n=5)

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
0,1,7,16,1977,2,NL,M,32.0,
1,2,7,16,1977,3,NL,M,33.0,
2,3,7,16,1977,2,DM,F,37.0,
3,4,7,16,1977,7,DM,M,36.0,
4,5,7,16,1977,3,DM,M,35.0,


In [38]:
# Last five rows (n=5 is the default).
surveys_df.tail(n=5)

Unnamed: 0,record_id,month,day,year,plot_id,species_id,sex,hindfoot_length,weight
35544,35545,12,31,2002,15,AH,,,
35545,35546,12,31,2002,15,AH,,,
35546,35547,12,31,2002,10,RM,F,15.0,14.0
35547,35548,12,31,2002,7,DO,M,36.0,51.0
35548,35549,12,31,2002,5,,,,


In [9]:
# to select only certain columns
# similar to SELECT species_id, weight FROM...

In [10]:
# Keep every row but only the two named columns, then preview the first five.
surveys_df.loc[:, ['species_id', 'weight']].head(n=5)

Unnamed: 0,species_id,weight
0,NL,
1,NL,
2,DM,
3,DM,
4,DM,


In [None]:
# pandas offers a unique() method similar to SELECT DISTINCT species_id...

In [11]:
# Distinct species_id values in order of first appearance (NaN is included).
surveys_df['species_id'].unique()

array(['NL', 'DM', 'PF', 'PE', 'DS', 'PP', 'SH', 'OT', 'DO', 'OX', 'SS',
       'OL', 'RM', nan, 'SA', 'PM', 'AH', 'DX', 'AB', 'CB', 'CM', 'CQ',
       'RF', 'PC', 'PG', 'PH', 'PU', 'CV', 'UR', 'UP', 'ZL', 'UL', 'CS',
       'SC', 'BA', 'SF', 'RO', 'AS', 'SO', 'PI', 'ST', 'CU', 'SU', 'RX',
       'PB', 'PL', 'PX', 'CT', 'US'], dtype=object)

In [12]:
# describe() gives you a lot of summary statistics that would take a long time to write with SQL aggregations

In [42]:
# Summary statistics (count, mean, std, min, quartiles, max) for the weight column.
surveys_df.loc[:, 'weight'].describe()

count    32283.000000
mean        42.672428
std         36.631259
min          4.000000
25%         20.000000
50%         37.000000
75%         48.000000
max        280.000000
Name: weight, dtype: float64

In [None]:
# to select just a single statistic, call its method directly (here, the standard deviation)

In [43]:
# Sample standard deviation of weight; ddof=1 is the pandas default, spelled out here.
surveys_df['weight'].std(ddof=1)

36.63125947458399

In [14]:
# pandas groupby() provides aggregations similar to the SQL GROUP BY clause

In [13]:
# Build a lazy GroupBy object keyed on species_id; nothing is computed until
# an aggregation or summary method is called on it.
grouped_data = surveys_df.groupby(by='species_id')

In [17]:
# once grouped, you'll need to run summary statistics or another method to see the output

In [16]:
# Select the column before calling describe() so the per-species statistics
# are computed for hindfoot_length only, rather than for every numeric
# column and then discarded. The resulting table is the same.
grouped_data['hindfoot_length'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
species_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AB,0.0,,,,,,,
AH,2.0,33.0,2.828427,31.0,32.0,33.0,34.0,35.0
AS,0.0,,,,,,,
BA,45.0,13.0,1.718879,6.0,12.0,13.0,14.0,16.0
CB,0.0,,,,,,,
CM,0.0,,,,,,,
CQ,0.0,,,,,,,
CS,0.0,,,,,,,
CT,0.0,,,,,,,
CU,0.0,,,,,,,


In [19]:
# JOINs
# you can do inner and left SQL joins using pandas

In [23]:
# Inner join (the merge default, made explicit): keep only rows whose
# species_id is present in both species_df and surveys_df.
inner_join_df = species_df.merge(surveys_df, on='species_id', how='inner')

In [24]:
# Display the inner-join result; the notebook renders a truncated view
# (leading and trailing rows separated by "...").
inner_join_df

Unnamed: 0,species_id,genus,species,taxa,record_id,month,day,year,plot_id,sex,hindfoot_length,weight
0,AB,Amphispiza,bilineata,Bird,3126,7,21,1980,8,,,
1,AB,Amphispiza,bilineata,Bird,3146,7,21,1980,24,,,
2,AB,Amphispiza,bilineata,Bird,3152,7,21,1980,19,,,
3,AB,Amphispiza,bilineata,Bird,3153,7,21,1980,22,,,
4,AB,Amphispiza,bilineata,Bird,3586,12,15,1980,16,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
34781,US,Sparrow,sp.,Bird,35513,12,31,2002,11,,,
34782,US,Sparrow,sp.,Bird,35528,12,31,2002,13,,,
34783,US,Sparrow,sp.,Bird,35544,12,31,2002,15,,,
34784,ZL,Zonotrichia,leucophrys,Bird,14250,3,20,1988,18,,,


In [32]:
# Left join: keep every row of species_df (the left table), attaching the
# matching survey records where they exist.
left_join_df = species_df.merge(surveys_df, on='species_id', how='left')

In [30]:
# Display the left-join result (truncated view). Note that the survey columns
# such as record_id are shown as floats here (e.g. 3126.0), unlike in the
# inner join -- presumably because unmatched species rows introduce NaN.
left_join_df

Unnamed: 0,species_id,genus,species,taxa,record_id,month,day,year,plot_id,sex,hindfoot_length,weight
0,AB,Amphispiza,bilineata,Bird,3126.0,7.0,21.0,1980.0,8.0,,,
1,AB,Amphispiza,bilineata,Bird,3146.0,7.0,21.0,1980.0,24.0,,,
2,AB,Amphispiza,bilineata,Bird,3152.0,7.0,21.0,1980.0,19.0,,,
3,AB,Amphispiza,bilineata,Bird,3153.0,7.0,21.0,1980.0,22.0,,,
4,AB,Amphispiza,bilineata,Bird,3586.0,12.0,15.0,1980.0,16.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
34787,US,Sparrow,sp.,Bird,35528.0,12.0,31.0,2002.0,13.0,,,
34788,US,Sparrow,sp.,Bird,35544.0,12.0,31.0,2002.0,15.0,,,
34789,ZL,Zonotrichia,leucophrys,Bird,14250.0,3.0,20.0,1988.0,18.0,,,
34790,ZL,Zonotrichia,leucophrys,Bird,14351.0,4.0,17.0,1988.0,23.0,,,
