# Summary Stats Using Pandas

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
tips = sns.load_dataset('tips')

tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.3 KB


In [3]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


## 1. Counting

In [4]:
print(tips[['sex', 'smoker']].value_counts())
print()
print(tips[['sex', 'smoker']].value_counts(normalize=True, sort=True))b

SyntaxError: invalid syntax (<ipython-input-4-4845d2db438d>, line 3)

In [None]:
# another option is to use cross tab
# total relative frequencies
pd.crosstab(tips['sex'], tips['day'], normalize='all', margins=True)

In [None]:
# row-wise relative frequencies, i.e. each column frequencies sums to 1.0
pd.crosstab(tips['sex'], tips['day'], normalize=1, margins=True)

In [None]:
# column-wise relative frequencies, i.e. each row sums to 1.0
pd.crosstab(tips['sex'], tips['day'], normalize=0, margins=True)

## 2. Summary Statistics on Columns and Rows

In [None]:
tips[['total_bill', 'tip', 'size']].agg(['min', 'max', 'mean'])

In [None]:
tips.groupby('sex')[['total_bill', 'tip', 'size']].agg(['min', 'max', 'mean'])

In [None]:
tips.groupby(['sex', 'smoker'])['tip'].max()

In [None]:
tips.describe()

In [None]:
tips[tips['smoker']=='Yes'].describe()

In [None]:
# who gives higher tips on average (as a perc of bill), men or women?

tips['tip_perc'] = tips['tip']/tips['total_bill']

tips.groupby('sex')['tip_perc'].mean()

In [None]:
# another way is to use pivot table

tips.pivot_table(values='tip_perc', index='sex')

In [None]:
# we can also add an additional grouping by meal time for example

tips.pivot_table(values='tip_perc', index='sex', columns='time', margins=True)

In [None]:
tips_bytime_bysize = tips.pivot_table(values='tip', index=['time', 'size'], columns='day', fill_value=0)

tips_bytime_bysize

In [None]:
# mean of tip per day

tips_bytime_bysize.mean()

In [None]:
# get average value of tips by size

tips_bytime_bysize.mean(axis='columns')

## 3. Visualizing 

In [None]:
# how does the total bill value relate to the tip value overall?

tips.plot(x='total_bill', y='tip', kind='scatter', title='tip versus total bill value');

In [None]:
# what is the distribution of tip and total bill values in the data

tips[['tip', 'total_bill']].hist(bins=20);

In [None]:
# How much tips in total was given at different times ?

tips_bytime = tips.groupby('time')['tip'].sum()

tips_bytime.plot(kind='bar');