# Pandas essentials

- hide: false
- toc: true
- comments: true
- categories: [python, pandas]

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

Load a sample dataset

In [137]:
# Source column name -> short name used in this notebook.
cols = dict(
    user_id='user',
    transaction_date='date',
    amount='amount',
    transaction_description='desc',
    merchant_name='merchant',
    gender='gender',
    year_of_birth='yob',
    salary_range='salary',
)

def randomise_date(series, max_shift=5, rng=None):
    """Add noise to years for additional anonymisation.

    Each date is moved back by a random whole number of years drawn from
    ``[0, max_shift)``; month and day are kept unchanged.

    Parameters
    ----------
    series : pd.Series
        Datetime-like series (must support the ``.dt`` accessor).
    max_shift : int, default 5
        Exclusive upper bound on the number of years subtracted.
    rng : np.random.Generator, optional
        Source of randomness. When omitted, NumPy's global random state is
        used, so results only repeat if that state has been seeded.

    Returns
    -------
    pd.Series
        Shifted dates, indexed like the rows that were kept. Rows falling on
        29 February are dropped (presumably because the shifted year may not
        be a leap year), so the result can be shorter than the input and
        assigning it back to a DataFrame column leaves NaT on those rows.
    """
    is_leap_day = series.dt.month.eq(2) & series.dt.day.eq(29)
    kept = series[~is_leap_day]
    if rng is None:
        # Legacy global state: keeps np.random.seed(...) workflows working.
        shift = np.random.randint(0, max_shift, size=len(kept))
    else:
        shift = rng.integers(0, max_shift, size=len(kept))
    return pd.to_datetime({
        'year': kept.dt.year - shift,
        'month': kept.dt.month,
        'day': kept.dt.day,
    })

# Read only the columns named in `cols`, then shorten their names.
fp = './data/sample.parquet'
# read_parquet documents `columns` as a list, so pass the dict's keys explicitly.
df = pd.read_parquet(fp, columns=list(cols)).rename(columns=cols)
# randomise_date drops 29 Feb rows; index alignment leaves NaT on those rows.
df['date'] = randomise_date(df.date)
print(df.shape)
df.head(3)

(157287, 8)


Unnamed: 0,user,date,amount,desc,merchant,gender,yob,salary
0,777,2012-01-03,3.03,aviva pa - d/d,aviva,m,1969.0,20k to 30k
1,777,2009-01-03,6.68,"9572 31dec11 , tesco stores 3345 , warrington ...",tesco,m,1969.0,20k to 30k
2,777,2011-01-03,10.27,"9572 30dec11 , mcdonalds , restaurant , winwic...",mcdonalds,m,1969.0,20k to 30k


# Categories

In [34]:
# load dataset (fetched over HTTPS rather than plain HTTP)
path = 'https://bit.ly/drinksbycountry'
drinks = pd.read_csv(path)
print(drinks.shape)
drinks.head()

(193, 6)


Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [36]:
# inspect memory; 'deep' asks pandas to measure the actual contents of
# object columns instead of estimating from dtype and row count
drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 30.5 KB


In [35]:
# inspect memory by column
# `deep` is a boolean flag; the string 'true' only worked because any
# non-empty string is truthy (so would 'false').
drinks.memory_usage(deep=True)

Index                             128
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                       12332
dtype: int64

Convert continent to category type

In [61]:
# store continent as a categorical column
drinks['continent'] = drinks['continent'].astype('category')

In [62]:
# `deep` is a boolean flag; the string 'true' only worked because it is truthy.
drinks.memory_usage(deep=True)

Index                             128
country                         18094
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         744
dtype: int64

In [65]:
# list the distinct category labels of the continent column
drinks['continent'].cat.categories

Index(['Africa', 'Asia', 'Europe', 'North America', 'Oceania',
       'South America'],
      dtype='object')

In [67]:
# small example frame: one quality rating per id
df = pd.DataFrame({
    'id': [1, 2, 3, 4, 5],
    'quality': ['good', 'excellent', 'very good', 'excellent', 'good'],
})
df

Unnamed: 0,id,quality
0,1,good
1,2,excellent
2,3,very good
3,4,excellent
4,5,good


In [68]:
# plain strings sort alphabetically, not by how good the rating is
df.sort_values(by='quality')

Unnamed: 0,id,quality
1,2,excellent
3,4,excellent
0,1,good
4,5,good
2,3,very good


In [79]:
from pandas.api.types import CategoricalDtype

# ordered dtype: good < very good < excellent
quality_cat = CategoricalDtype(
    categories=['good', 'very good', 'excellent'],
    ordered=True,
)
df['quality'] = df['quality'].astype(quality_cat)
df

Unnamed: 0,id,quality
0,1,good
1,2,excellent
2,3,very good
3,4,excellent
4,5,good


In [80]:
# show the column together with its dtype
df['quality']

0         good
1    excellent
2    very good
3    excellent
4         good
Name: quality, dtype: category
Categories (3, object): [good < very good < excellent]

In [85]:
# One-hot encode quality. dtype=int gives 0/1 columns on every pandas
# version (pandas >= 2.0 would otherwise return booleans).
dummies = pd.get_dummies(df.quality, dtype=int)
# Drop dummy columns left over from a previous run of this cell, so that
# re-executing it does not append duplicate columns.
df = pd.concat(
    [df.drop(columns=list(dummies.columns), errors='ignore'), dummies],
    axis=1,
)
df

Unnamed: 0,id,quality,good,very good,excellent
0,1,good,1,0,0
1,2,excellent,0,0,1
2,3,very good,0,1,0
3,4,excellent,0,0,1
4,5,good,1,0,0


# Dates and times

## Parsing string dates

Using `dateutil`

In [19]:
from dateutil.parser import parse

date = '1 Nov 2020'
# parse once, then reuse the resulting datetime
parsed = parse(date)
print(parsed)
parsed.month

2020-11-01 00:00:00


11

Inside `Pandas`

In [21]:
# build the Timestamp once, then reuse it
ts = pd.Timestamp(date)
print(ts)
ts.month

2020-11-01 00:00:00


11

## Date and period ranges

In [33]:
# create quarterly date and change frequency to standard date
idx = pd.period_range('2018-1', '2019-1', freq='Q-DEC')
s = pd.Series(np.random.randn(len(idx)), index=idx)
print(s)
s.asfreq('d', how='start')

2018Q1   -0.484997
2018Q2   -0.817007
2018Q3    2.018879
2018Q4   -0.176754
2019Q1   -1.085844
Freq: Q-DEC, dtype: float64


2018-01-01   -0.484997
2018-04-01   -0.817007
2018-07-01    2.018879
2018-10-01   -0.176754
2019-01-01   -1.085844
Freq: D, dtype: float64

In [35]:
# create 100-day series and resample to monthly
idx = pd.date_range('2000', periods=100)
s = pd.Series(np.random.randn(len(idx)), index=idx)
s.resample('M', kind='period').mean()

2000-01   -0.129504
2000-02   -0.040099
2000-03    0.210304
2000-04   -0.038681
Freq: M, dtype: float64

In [40]:
# create hourly series, convert to daily open-high-low-close
idx = pd.date_range('2000', freq='H', periods=100)
s = pd.Series(np.random.randn(len(idx)), index=idx)
s.resample('d').ohlc()

Unnamed: 0,open,high,low,close
2000-01-01,1.478093,2.355484,-1.320692,-1.036503
2000-01-02,0.736884,1.764789,-2.652206,-0.965161
2000-01-03,-0.308438,2.355778,-1.502893,1.16236
2000-01-04,1.10993,1.786124,-1.424925,1.269311
2000-01-05,0.474575,0.487767,0.116583,0.369565


# Grouping

Create a dictionary of groups, keyed by the values of the grouping column:

In [10]:
df = sns.load_dataset('iris')
# map each species name to the sub-frame holding its rows
pieces = {species: group for species, group in df.groupby('species')}
pieces['setosa'].head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


# Fluency exercises

## Create a table that shows the number of planets discovered by each method in each decade

In [16]:
# load the seaborn planets dataset and preview the first rows
df = sns.load_dataset(name='planets')
print(df.shape)
df.head(n=3)

(1035, 6)


Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011


In [29]:
# floor each year to its decade and label it, e.g. 2006 -> '2000s'
decades = ((df['year'] // 10 * 10).astype(str) + 's').rename('decade')

In [26]:
# using pivot table: methods as rows, decades as columns, summing `number`
(
    df
    .pivot_table(values='number', index='method', columns=decades, aggfunc='sum')
    .fillna(0)
)

decade,1980s,1990s,2000s,2010s
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Astrometry,0.0,0.0,0.0,2.0
Eclipse Timing Variations,0.0,0.0,5.0,10.0
Imaging,0.0,0.0,29.0,21.0
Microlensing,0.0,0.0,12.0,15.0
Orbital Brightness Modulation,0.0,0.0,0.0,5.0
Pulsar Timing,0.0,9.0,1.0,1.0
Pulsation Timing Variations,0.0,0.0,1.0,0.0
Radial Velocity,1.0,52.0,475.0,424.0
Transit,0.0,0.0,64.0,712.0
Transit Timing Variations,0.0,0.0,0.0,9.0


In [28]:
# using groupby: sum `number` per (method, decade), then spread decades into columns
(
    df.groupby(['method', decades])['number']
    .sum()
    .unstack()
    .fillna(0)
)

decade,1980s,1990s,2000s,2010s
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Astrometry,0.0,0.0,0.0,2.0
Eclipse Timing Variations,0.0,0.0,5.0,10.0
Imaging,0.0,0.0,29.0,21.0
Microlensing,0.0,0.0,12.0,15.0
Orbital Brightness Modulation,0.0,0.0,0.0,5.0
Pulsar Timing,0.0,9.0,1.0,1.0
Pulsation Timing Variations,0.0,0.0,1.0,0.0
Radial Velocity,1.0,52.0,475.0,424.0
Transit,0.0,0.0,64.0,712.0
Transit Timing Variations,0.0,0.0,0.0,9.0


In [25]:
# NOTE(review): the recorded output of this cell is a FileNotFoundError, so
# ./data/births.csv was missing when it last ran. Confirm the file is
# available before relying on this cell.
df = pd.read_csv(filepath_or_buffer='./data/births.csv')
print(df.shape)
df.head(n=5)

FileNotFoundError: [Errno 2] No such file or directory: './data/births.csv'

# Sources
- [Python for Data Analysis](https://www.oreilly.com/library/view/python-for-data/9781491957653/)
- [Python Data Science Handbook](https://www.oreilly.com/library/view/python-data-science/9781491912126/) (PDSH)

<!-- - [Fluent Python](https://www.oreilly.com/library/view/fluent-python/9781491946237/)
- [Python Cookbook](https://www.oreilly.com/library/view/python-cookbook-3rd/9781449357337/)
- [Learning Python](https://www.oreilly.com/library/view/learning-python-5th/9781449355722/)
- [The Hitchhiker's Guide to Python](https://docs.python-guide.org/writing/structure/)
- [Effective Python](https://effectivepython.com)
- [Python for Data Analysis](https://www.oreilly.com/library/view/python-for-data/9781491957653/)
- [Python Data Science Handbook](https://www.oreilly.com/library/view/python-data-science/9781491912126/) -->