# Pandas essentials

- hide: true
- toc: true
- comments: true
- categories: [python, pandas]

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns

Load a sample dataset

In [41]:
cols = {
    'user_id': 'user',
    'transaction_date': 'date',
    'amount': 'amount',
    'transaction_description': 'desc',
    'merchant_name': 'merchant',
    'gender': 'gender',
    'year_of_birth': 'yob',
    'salary_range': 'salary',
}

fp = './data/sample.parquet'
df = pd.read_parquet(fp, columns=cols).rename(columns=cols)
print(df.shape)
df.head(3)

(157287, 8)


Unnamed: 0,user,date,amount,desc,merchant,gender,yob,salary
0,777,2012-01-03,3.03,aviva pa - d/d,aviva,m,1969.0,20k to 30k
1,777,2012-01-03,6.68,"9572 31dec11 , tesco stores 3345 , warrington ...",tesco,m,1969.0,20k to 30k
2,777,2012-01-03,10.27,"9572 30dec11 , mcdonalds , restaurant , winwic...",mcdonalds,m,1969.0,20k to 30k


In [31]:
df.date.dt.year[:5] + np.random.randint(0, 3, size=5)

0    2012
1    2013
2    2012
3    2013
4    2014
Name: date, dtype: int64

In [44]:
def randomise_date(series):
    
    return pd.to_datetime({
        'year': series.dt.year - np.random.randint(0, 1, size=len(series)),
        'month': series.dt.month,
        'day': series.dt.day
    })

randomise_date(df.date)

0        2012-01-03
1        2012-01-03
2        2012-01-03
3        2012-01-03
4        2012-01-03
            ...    
157282   2020-07-01
157283   2020-07-01
157284   2020-07-01
157285   2020-07-02
157286   2020-07-02
Length: 157287, dtype: datetime64[ns]

In [38]:
year = [2000, 2000, 2000]
month = [1, 2, 3]
day = [4, 5, 6]

pd.to_datetime({
    'year': year,
    'month': month,
    'day': day
})

0   2000-01-04
1   2000-02-05
2   2000-03-06
dtype: datetime64[ns]

array([0, 2, 1, 2, 2, 0, 1, 1, 1, 2])

## Grouping

Create a dictionary from groups based on column types:

In [10]:
df = sns.load_dataset('iris')
pieces = dict(list(df.groupby('species')))
pieces['setosa'].head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


## Topics

## Sources

- [Fluent Python](https://www.oreilly.com/library/view/fluent-python/9781491946237/)
- [Python Cookbook](https://www.oreilly.com/library/view/python-cookbook-3rd/9781449357337/)
- [Learning Python](https://www.oreilly.com/library/view/learning-python-5th/9781449355722/)
- [The Hitchhiker's Guide to Python](https://docs.python-guide.org/writing/structure/)
- [Effective Python](https://effectivepython.com)
- [Python for Data Analysis](https://www.oreilly.com/library/view/python-for-data/9781491957653/)
- [Python Data Science Handbook](https://www.oreilly.com/library/view/python-data-science/9781491912126/)