In [7]:
# taken from https://pandas.pydata.org/pandas-docs/stable/comparison_with_sql.html
import pandas as pd
import numpy as np

In [5]:
url = 'https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv'
tips = pd.read_csv(url)
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [13]:
# select
tips[['total_bill', 'tip', 'smoker', 'time']].head(5)

Unnamed: 0,total_bill,tip,smoker,time
0,16.99,1.01,No,Dinner
1,10.34,1.66,No,Dinner
2,21.01,3.5,No,Dinner
3,23.68,3.31,No,Dinner
4,24.59,3.61,No,Dinner


In [18]:
# WHERE
is_dinner = tips['time'] == 'Dinner'
#is_dinner.value_counts()
tips[is_dinner].head(5)

#short:
#tips[tips['time'] == 'Dinner'].head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [26]:
# WHERE with muliple conditions, & := and, | := or
tips[(tips['tip'] > 5.00) & (tips['time'] == 'Dinner')].head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
23,39.42,7.58,Male,No,Sat,Dinner,4
44,30.4,5.6,Male,No,Sun,Dinner,4
47,32.4,6.0,Male,No,Sun,Dinner,4
52,34.81,5.2,Female,No,Sun,Dinner,4
59,48.27,6.73,Male,No,Sat,Dinner,4


In [36]:
# data frame sizes
print( 'all entries:', tips.size, 'shape:', tips.shape, 'len:', len(tips))

all entries: 1708 shape: (244, 7) len: 244


In [37]:
# group by
tips.groupby('sex').size()

sex
Female     87
Male      157
dtype: int64

In [40]:
# count() returns non null entries for each column
tips.groupby('sex').count()
# but can be applied to single column
tips.groupby('sex')['total_bill'].count()

sex
Female     87
Male      157
Name: total_bill, dtype: int64

In [42]:
# group by multiple columns and use multiple aggregations
tips.groupby(['smoker','day']).agg({'tip': [np.mean, np.size]})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,tip
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,size
smoker,day,Unnamed: 2_level_2,Unnamed: 3_level_2
No,Fri,2.8125,4.0
No,Sat,3.102889,45.0
No,Sun,3.167895,57.0
No,Thur,2.673778,45.0
Yes,Fri,2.714,15.0
Yes,Sat,2.875476,42.0
Yes,Sun,3.516842,19.0
Yes,Thur,3.03,17.0


In [51]:
# joins/merge
df1 = pd.DataFrame({'key': ['A','B','C','D'], 'value': np.random.randn(4)})
df2 = pd.DataFrame({'key': ['B','D','D','E'], 'value': np.random.randn(4)})
df1.head()

Unnamed: 0,key,value
0,A,-0.65535
1,B,-1.049038
2,C,0.598969
3,D,0.366271


In [52]:
# inner join
pd.merge(df1, df2, on='key')

Unnamed: 0,key,value_x,value_y
0,B,-1.049038,-0.814473
1,D,0.366271,-0.636082
2,D,0.366271,-0.472861


In [53]:
# left join, right join analog
pd.merge(df1, df2, on='key', how='left')

Unnamed: 0,key,value_x,value_y
0,A,-0.65535,
1,B,-1.049038,-0.814473
2,C,0.598969,
3,D,0.366271,-0.636082
4,D,0.366271,-0.472861


In [58]:
# union all
df3 = pd.DataFrame({'city': ['Chicago', 'San Francisco', 'New York City'], 'rank': range(1, 4)})
df4 = pd.DataFrame({'city': ['Chicago', 'Boston', 'Los Angeles'], 'rank': [1, 4, 5]})
pd.concat([df3, df4])

Unnamed: 0,city,rank
0,Chicago,1
1,San Francisco,2
2,New York City,3
0,Chicago,1
1,Boston,4
2,Los Angeles,5


In [59]:
pd.concat([df3, df4]).drop_duplicates()

Unnamed: 0,city,rank
0,Chicago,1
1,San Francisco,2
2,New York City,3
1,Boston,4
2,Los Angeles,5


In [61]:
tips.nlargest(10, columns='tip')

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
170,50.81,10.0,Male,Yes,Sat,Dinner,3
212,48.33,9.0,Male,No,Sat,Dinner,4
23,39.42,7.58,Male,No,Sat,Dinner,4
59,48.27,6.73,Male,No,Sat,Dinner,4
141,34.3,6.7,Male,No,Thur,Lunch,6
183,23.17,6.5,Male,Yes,Sun,Dinner,4
214,28.17,6.5,Female,Yes,Sat,Dinner,3
47,32.4,6.0,Male,No,Sun,Dinner,4
239,29.03,5.92,Male,No,Sat,Dinner,3
88,24.71,5.85,Male,No,Thur,Lunch,2


In [63]:
# partition
(tips.assign(rn=tips.sort_values(['total_bill'], ascending = False).groupby(['day']).cumcount() + 1)
    .query('rn<3')
    .sort_values(['day', 'rn'])
)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,rn
95,40.17,4.73,Male,Yes,Fri,Dinner,4,1
90,28.97,3.0,Male,Yes,Fri,Dinner,2,2
170,50.81,10.0,Male,Yes,Sat,Dinner,3,1
212,48.33,9.0,Male,No,Sat,Dinner,4,2
156,48.17,5.0,Male,No,Sun,Dinner,6,1
182,45.35,3.5,Male,Yes,Sun,Dinner,3,2
197,43.11,5.0,Female,Yes,Thur,Lunch,4,1
142,41.19,5.0,Male,No,Thur,Lunch,5,2
