# Your code doesn't need comments, it needs refactoring!

## Load invoices

In [None]:
import pandas as pd

# Read the raw invoice export into a DataFrame.
invoices = pd.read_csv(
    filepath_or_buffer='~/Desktop/gen/historic/invoices.csv',
)

In [None]:
# Columns used by the analysis, mapped to the dtype each one is cast to.
# Insertion order is also the column order of the selected frame.
required_columns = dict(
    id='string',
    buyer_supplier_id='string',
    gross_amount='float64',
    issue_date='datetime64[ns]',
)

In [None]:
# Keep only the required columns. The dict's keys are passed as a list because
# pandas 2.0+ raises TypeError when a dict itself is used as an indexer.
invoices = invoices[list(required_columns)]

In [None]:
# Cast every required column to the dtype declared for it.
invoices = invoices.astype(dtype=required_columns)

In [None]:
# Preview the first five rows.
invoices.head(n=5)

## Calculate monthly maximum per supplier

In [None]:
# NOTE: this binds a second name to the same DataFrame object; no copy is made.
max_monthly = invoices
# With inplace=True the shared object itself is re-indexed, so `invoices`
# ends up indexed by issue_date as well.
max_monthly.set_index('issue_date', inplace=True)  
max_monthly.head()

In [None]:
# Sum gross amounts per supplier in month-end bins.
# A MonthEnd offset object replaces the 'M' string alias: newer pandas releases
# deprecate 'M' in favour of 'ME', while the offset object works on old and new
# versions alike and produces the same bins.
max_monthly = (
    max_monthly
    .groupby('buyer_supplier_id')
    .resample(pd.offsets.MonthEnd())['gross_amount']
    .sum()
)
max_monthly.head()

In [None]:
# Collapse the per-month totals to one value per supplier: the largest month.
max_monthly = max_monthly.groupby(level='buyer_supplier_id').max()
max_monthly.head(n=5)

In [None]:
# Turn the Series into a one-column frame and give that column its final name.
max_monthly = (
    pd.DataFrame(max_monthly)
    .rename(columns={'gross_amount': 'max_monthly'})
)
max_monthly.head(n=5)

## Combine invoices and monthly maxima

In [None]:
# Attach each supplier's monthly maximum to every one of its invoices.
combined = pd.merge(
    invoices,
    max_monthly,
    left_on='buyer_supplier_id',
    right_index=True,
)
combined.sample(n=5)

In [None]:
# Size of each invoice relative to its supplier's largest monthly total.
combined['relative_size'] = combined['gross_amount'].div(combined['max_monthly'])
combined.sample(n=5)

## What went wrong?

**Why is the index `issue_date`??**

The culprit:

```python
max_monthly = invoices
max_monthly.set_index('issue_date', inplace=True)    
```

# Enter the Zen of Python!

* Mutability can be surprising
* Surprises are bad
* Implementation comments are failures

# Refactor!
* Don't use `inplace` or direct assignments
* Encapsulate logic in small, pure functions
* Name your functions precisely

In [None]:
def load_invoices(path='~/Desktop/gen/historic/invoices.csv'):
    """Read the invoices CSV and return only the required, typed columns.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the path that was previously
        hard-coded, so existing ``load_invoices()`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The columns named by the keys of ``required_columns`` (in that order),
        each cast to the dtype given by the corresponding value.

    Raises
    ------
    KeyError
        If a required column is missing from the file.
    """
    # pandas 2.0+ rejects a dict as an indexer, so select by a list of its keys.
    # Plain indexing (instead of ``.get``) also reports missing columns by name
    # rather than silently yielding ``None``.
    columns = list(required_columns)
    return pd.read_csv(path)[columns].astype(required_columns)

In [None]:
def aggregate_monthly_by_supplier(df):
    """Return each supplier's largest monthly total of ``gross_amount``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``issue_date`` (datetime-like), ``buyer_supplier_id``
        and ``gross_amount``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``buyer_supplier_id`` with a single column ``max_monthly``.
    """
    monthly_totals = (
        df
        .set_index('issue_date')
        .groupby('buyer_supplier_id')
        # A MonthEnd offset object instead of the 'M' alias: newer pandas
        # deprecates 'M' in favour of 'ME', the object works on both.
        .resample(pd.offsets.MonthEnd())['gross_amount']
        .sum()
    )
    return (
        pd.DataFrame(monthly_totals.groupby('buyer_supplier_id').max())
        .rename(columns={'gross_amount': 'max_monthly'})
    )


In [None]:
def combine_invoices_and_aggregates(invoices, aggregates):
    """Join invoices with their supplier's aggregates and add ``relative_size``.

    ``aggregates`` is matched on its index against the ``buyer_supplier_id``
    column of ``invoices``. The result gains a ``relative_size`` column
    (``gross_amount`` divided by ``max_monthly``) and is indexed by ``id``.
    Neither input frame is modified.
    """
    joined = invoices.merge(aggregates,
                            left_on='buyer_supplier_id',
                            right_index=True)
    with_ratio = joined.assign(
        relative_size=lambda frame: frame['gross_amount'] / frame['max_monthly'],
    )
    return with_ratio.set_index('id')

In [None]:
# Load once, then feed the same frame to both the aggregation and the join.
invoices = load_invoices()

combine_invoices_and_aggregates(
    invoices=invoices,
    aggregates=aggregate_monthly_by_supplier(df=invoices),
)

# Good code is self-documenting!