# Idomatic pandas
* "a group of words established by usage as having a meaning not deducible from those of the individual words (e.g., rain cats and dogs, see the light )." -- some dictionary
* What does it mean to be idomatic with respect to a programming language?

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('datasets/census.csv')
df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
0,40,3,6,1,0,Alabama,Alabama,4779736,4780127,4785161,...,0.002295,-0.193196,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861


## Pandas Idiom 1: Method Chaining

In [4]:
(df.where(df['SUMLEV']==50)
    .dropna()
    .set_index(['STNAME','CTYNAME'])
    .rename(columns={'ESTIMATESBASE2010': 'Estimates Base 2010'})).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SUMLEV,REGION,DIVISION,STATE,COUNTY,CENSUS2010POP,Estimates Base 2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alabama,Autauga County,50.0,3.0,6.0,1.0,1.0,54571.0,54571.0,54660.0,55253.0,55175.0,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
Alabama,Baldwin County,50.0,3.0,6.0,1.0,3.0,182265.0,182265.0,183193.0,186659.0,190396.0,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
Alabama,Barbour County,50.0,3.0,6.0,1.0,5.0,27457.0,27457.0,27341.0,27226.0,27159.0,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
Alabama,Bibb County,50.0,3.0,6.0,1.0,7.0,22915.0,22919.0,22861.0,22733.0,22642.0,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
Alabama,Blount County,50.0,3.0,6.0,1.0,9.0,57322.0,57322.0,57373.0,57711.0,57776.0,...,1.807375,-1.177622,-1.748766,-2.062535,-1.36997,1.859511,-0.84858,-1.402476,-1.577232,-0.884411


## Pandas Idiom 2: Functional Programming
* We've talked about this at length! Broadcasting! Vectorization!

In [6]:
rows = ['POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012', 'POPESTIMATE2013','POPESTIMATE2014', 
        'POPESTIMATE2015']

df['abs'] = df.apply(lambda x: np.max(x[rows]) - np.min(x[rows]), axis=1)

## Documentation, IDEs, and Testing

- General functions vs. specific ones: https://pandas.pydata.org/pandas-docs/stable/reference/index.html
- Deepnote Example

In [7]:
df['CENSUS2010POP'].apply

<bound method Series.apply of 0       4779736
1         54571
2        182265
3         27457
4         22915
         ...   
3188      43806
3189      21294
3190      21118
3191       8533
3192       7208
Name: CENSUS2010POP, Length: 3193, dtype: int64>

In [8]:
df['CENSUS2010POP'].groupby

<bound method NDFrame.groupby of 0       4779736
1         54571
2        182265
3         27457
4         22915
         ...   
3188      43806
3189      21294
3190      21118
3191       8533
3192       7208
Name: CENSUS2010POP, Length: 3193, dtype: int64>

In [9]:
pd.cut

<function pandas.core.reshape.tile.cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False, duplicates='raise')>

In [10]:
pd.notnull(np.nan)

False

In [11]:
pd.notnull([np.nan, 1, 2])

array([False,  True,  True])

In [12]:
### REPRODUCED FOR REFERENCE
def energy(m, c): ## E = mc^2
    if m >= 0 and c >= 0:
        return m*c**2
    return None

def test_energy():
    assert isinstance(energy(1,1), int), 'should return an int!'
    assert energy(1,1) == 1
    assert energy(2,1) == 2
    assert energy(1,2) == 4
    assert energy(-1, -1) == None
    
test_energy()

 - Example: https://github.com/Liwmo/qwizard
 - Pandas testing: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.testing.assert_frame_equal.html

In [13]:
import pandas as pd
df = pd.DataFrame({'item':['banana', 'apple', 'starfruit', 'broccoli', 'cauliflower'], 'price': [0.50, 1.00, 2.00, 1.00, 2.00], 
                   'type':['fruit', 'fruit', 'fruit', 'veg', 'veg']})
df

Unnamed: 0,item,price,type
0,banana,0.5,fruit
1,apple,1.0,fruit
2,starfruit,2.0,fruit
3,broccoli,1.0,veg
4,cauliflower,2.0,veg


In [19]:
def half_off_price(df):
    df['price'] = df['price'] * 0.5
    return df


ans = pd.DataFrame({'item':['banana', 'apple', 'starfruit', 'broccoli', 'cauliflower'], 'price': [0.25, 0.50, 1.00, 0.50, 1.00], 
                   'type':['fruit', 'fruit', 'fruit', 'veg', 'veg']}) 
from pandas import testing
testing.assert_frame_equal(half_off_price(df), ans)

AssertionError: DataFrame.iloc[:, 1] are different

DataFrame.iloc[:, 1] values are different (100.0 %)
[left]:  [0.00625, 0.0125, 0.025, 0.0125, 0.025]
[right]: [0.25, 0.5, 1.0, 0.5, 1.0]

### Refactoring

In [22]:
df = pd.DataFrame({'item':['banana', 'apple', 'starfruit', 'broccoli', 'cauliflower'], 
                   'price': [0.50, 1.00, 2.00, 1.00, 2.00], 
                   'type':['fruit', 'fruit', 'fruit', 'veg', 'veg']})

def get_total_by_type(df):
    return df.groupby('type')['price'].sum()

def get_total_by_type(df):
    fruit_total = 0 
    veg_total = 0
    for i in range(len(df)):
        if df.iloc[i,2] == 'fruit':
            fruit_total += df.iloc[i,1]
        elif df.iloc[i,2] == 'veg':
            veg_total += df.iloc[i,1]
    return pd.Series({'fruit': fruit_total, 'veg':veg_total})

get_total_by_type(df)

fruit    3.5
veg      3.0
dtype: float64

In [None]:
company_df = pd.read_csv('EXAM_DATA_COMPANY.csv')
person_df = pd.read_csv('EXAM_DATA_PERSON.csv')
transact_df = pd.read_csv('EXAM_DATA_TRANSACT.csv')