# Categoricals and groupby

## Sales data

In [3]:
import pandas as pd

In [4]:
sales = pd.DataFrame({
    'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],
    'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],
    'bread': [139, 237, 327, 456],
    'butter': [20,45,70,98]
})
sales

Unnamed: 0,bread,butter,city,weekday
0,139,20,Austin,Sun
1,237,45,Dallas,Sun
2,327,70,Austin,Mon
3,456,98,Dallas,Mon


### Boolean filtering and count

In [5]:
sales.loc[sales['weekday']=='Sun'].count()

bread      2
butter     2
city       2
weekday    2
dtype: int64

### Groupby and count

In [6]:
sales.groupby('weekday').count()

Unnamed: 0_level_0,bread,butter,city
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mon,2,2,2
Sun,2,2,2


### Split-apply-combine

`sales.groupby('weekday').count()`

- split by 'weekday'
- apply `count()` function on each group
- combine counts per group



### Aggregation/Reduction

Some reducing functions
- `mean()`
- `std()`
- `sum()`
- `first()`, `last()`
- `min()`, `max()`

### Groupby and sum

What was the total amount of bread sold on each day?

In [7]:
sales.groupby('weekday')['bread'].sum()

weekday
Mon    783
Sun    376
Name: bread, dtype: int64

### Groupby and sum: multiple columns

In [8]:
sales.groupby('weekday')[['bread','butter']].sum()

Unnamed: 0_level_0,bread,butter
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,783,168
Sun,376,65


### Groupby and mean : multi-level index

- creates a sorted multilevel index

In [9]:
sales.groupby(['city','weekday']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,bread,butter
city,weekday,Unnamed: 2_level_1,Unnamed: 3_level_1
Austin,Mon,327,70
Austin,Sun,139,20
Dallas,Mon,456,98
Dallas,Sun,237,45


### Customers

- create a customers series that tells us who made the purchases in our sales dataframe
- customers has an identical index to sales, namely a range starting at 0

In [10]:
customers = pd.Series(['Dave', 'Alice', 'Bob', 'Alice'])
customers

0     Dave
1    Alice
2      Bob
3    Alice
dtype: object

### Groupby and sum: by series

In [11]:
sales.groupby(customers)['bread'].sum()

Alice    693
Bob      327
Dave     139
Name: bread, dtype: int64

### Categorical data
- Advantages
    - uses less memory
    - speeds up operations like `groupby()`
  
- `.unique()`: returns an array of the distinct entries
- `.value_counts()`: returns the number of times each value occurs

In [12]:
sales['weekday']

0    Sun
1    Sun
2    Mon
3    Mon
Name: weekday, dtype: object

In [13]:
sales['weekday'].unique()

array(['Sun', 'Mon'], dtype=object)

In [14]:
sales['weekday'].value_counts()

Sun    2
Mon    2
Name: weekday, dtype: int64

In [15]:
sales['weekday'] = sales['weekday'].astype('category')
sales['weekday']

0    Sun
1    Sun
2    Mon
3    Mon
Name: weekday, dtype: category
Categories (2, object): [Mon, Sun]

---
# Let's Practice!

In [17]:
# Grouping by multiple columns

titanic = pd.read_csv('titanic.csv')
titanic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [20]:
# Group titanic by 'pclass'
by_class = titanic.groupby('pclass')

# Aggregate 'survived' column of by_class by count
count_by_class = by_class['survived'].count()

# Print count_by_class
print(count_by_class)

# Group titanic by 'embarked' and 'pclass'
by_mult = titanic.groupby(['embarked','pclass'])

# Aggregate 'survived' column of by_mult by count
count_mult = by_mult['survived'].count()

# Print count_mult
print(count_mult)

pclass
1    323
2    277
3    709
Name: survived, dtype: int64
embarked  pclass
C         1         141
          2          28
          3         101
Q         1           3
          2           7
          3         113
S         1         177
          2         242
          3         495
Name: survived, dtype: int64


In [23]:
# Grouping by another series
# Read 'life_expectancy.csv' into a DataFrame indexed by 'Country': life
life = pd.read_csv('life_expectancy.csv', index_col='Country')

# Read 'regions.csv' into a DataFrame indexed by 'Country': regions
regions = pd.read_csv('regions.csv', index_col='Country')

# Group life by regions['region']: life_by_region
# The grouping key is a Series from a different DataFrame; both frames are
# indexed by 'Country', which is what lets each row of life be matched to a region.
life_by_region = life.groupby(regions['region'])

# Print the mean over the '2010' column of life_by_region
print(life_by_region['2010'].mean())

region
America                       74.037350
East Asia & Pacific           73.405750
Europe & Central Asia         75.656387
Middle East & North Africa    72.805333
South Asia                    68.189750
Sub-Saharan Africa            57.575080
Name: 2010, dtype: float64


---
# Groupby and aggregation

In [24]:
sales

Unnamed: 0,bread,butter,city,weekday
0,139,20,Austin,Sun
1,237,45,Dallas,Sun
2,327,70,Austin,Mon
3,456,98,Dallas,Mon


### Review: groupby

In [25]:
sales.groupby('city')[['bread', 'butter']].max()

Unnamed: 0_level_0,bread,butter
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Austin,327,70
Dallas,456,98


### Multiple aggregations

In [26]:
sales.groupby('city')[['bread', 'butter']].agg(['max', 'sum'])

Unnamed: 0_level_0,bread,bread,butter,butter
Unnamed: 0_level_1,max,sum,max,sum
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Austin,327,466,70,90
Dallas,456,693,98,143


### Aggregation functions

- string names
    - 'sum'
    - 'mean'
    - 'count'    

### Custom aggregation

In [27]:
def data_range(series):
    """Return the spread of ``series``: its maximum minus its minimum."""
    highest = series.max()
    lowest = series.min()
    return highest - lowest

In [28]:
sales.groupby('weekday')[['bread', 'butter']].agg(data_range)

Unnamed: 0_level_0,bread,butter
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,129,28
Sun,98,25


### Custom aggregation: dictionaries

In [33]:
sales.groupby(customers)[['bread','butter']].agg({
    'bread':'sum',
    'butter':data_range})

Unnamed: 0,bread,butter
Alice,693,53
Bob,327,0
Dave,139,0


---
# Let's Practice!

In [34]:
# Computing multiple aggregates of multiple columns
# Group titanic by 'pclass': by_class
by_class = titanic.groupby('pclass')

# Select 'age' and 'fare'
by_class_sub = by_class[['age','fare']]

# Aggregate by_class_sub by 'max' and 'median': aggregated
aggregated = by_class_sub.agg(['max','median'])

# Print the maximum age in each class
print(aggregated.loc[:, ('age','max')])

# Print the median fare in each class
print(aggregated.loc[:, ('fare', 'median')])

pclass
1    80.0
2    70.0
3    74.0
Name: (age, max), dtype: float64
pclass
1    60.0000
2    15.0458
3     8.0500
Name: (fare, median), dtype: float64


In [45]:
# Aggregating on index levels/fields
# Load the tidy gapminder data, using 'Year', 'region' and 'Country' as index levels: gapminder

gapminder = pd.read_csv('gapminder_tidy.csv', index_col=['Year', 'region', 'Country'])


# Alternative load (presumably the original exercise version — TODO confirm):
# reads 'gapminder.csv' and additionally sorts the index.
#gapminder = pd.read_csv('gapminder.csv', index_col=['Year', 'region', 'Country']).sort_index()

# Group gapminder by the 'Year' and 'region' index levels: by_year_region
by_year_region = gapminder.groupby(level=['Year', 'region'])

# Define the function to compute spread: spread
def spread(series):
    """Return the gap between the largest and the smallest value in ``series``."""
    smallest, largest = series.min(), series.max()
    return largest - smallest

# Create the dictionary: aggregator
aggregator = {'population':'sum', 'child_mortality':'mean', 'gdp':spread}

# Aggregate by_year_region using the dictionary: aggregated
aggregated = by_year_region.agg(aggregator)

# Print the last 6 entries of aggregated 
print(aggregated.tail(6))

                                   population  child_mortality       gdp
Year region                                                             
2013 America                     9.629087e+08        17.745833   49634.0
     East Asia & Pacific         2.244209e+09        22.285714  134744.0
     Europe & Central Asia       8.968788e+08         9.831875   86418.0
     Middle East & North Africa  4.030504e+08        20.221500  128676.0
     South Asia                  1.701241e+09        46.287500   11469.0
     Sub-Saharan Africa          9.205996e+08        76.944490   32035.0


In [48]:
# Grouping on a function of the index
# Load the February 2015 sales data with 'Date' parsed as a datetime index.
# NOTE: this rebinds `sales`, replacing the small bread/butter DataFrame used
# earlier in the notebook.
sales = pd.read_csv('sales-feb-2015.csv', index_col='Date', parse_dates=True)

# Alternative load from 'sales.csv' (disabled):
#sales = pd.read_csv('sales.csv', index_col='Date', parse_dates=True)

# Create a groupby object keyed on the abbreviated weekday name ('%a') of each
# index timestamp: by_day
by_day = sales.groupby(sales.index.strftime('%a'))

# Sum the 'Units' column within each weekday group: units_sum
units_sum = by_day['Units'].sum()

# Print units_sum
print(units_sum)

Mon    48
Sat     7
Thu    59
Tue    13
Wed    48
Name: Units, dtype: int64


---
# Groupby and transformation

### The z-score

- distance from the mean of its population, measured in units of standard deviation

- this function is a transformation: it accepts a Series as input and returns a conforming Series

In [49]:
def zscore(series):
    """Standardize ``series``: subtract its mean, then divide by its standard deviation."""
    centered = series - series.mean()
    return centered / series.std()

In [50]:
!ls

01Extracting_and_transforming_data.ipynb  regions.csv
02Advanced_indexing.ipynb		  sales.csv
03Rearranging_and_reshaping_data.ipynb	  sales-feb-2015.csv
04Grouping_data.ipynb			  titanic.csv
all_medalists.csv			  trials_01.csv
gapminder_tidy.csv			  trials_02.csv
life_expectancy.csv			  trials_03.csv
pennsylvania2012_turnout.csv		  users.csv
pittsburgh2013.csv			  visitors.csv


In [61]:
auto = pd.read_csv('auto-mpg.csv')
auto.head()

Unnamed: 0,mpg,cyl,displ,hp,weight,accel,yr,origin,name,color,size,marker
0,18.0,6,250.0,88,3139,14.5,71,US,ford mustang,red,27.370336,o
1,9.0,8,304.0,193,4732,18.5,70,US,hi 1200d,green,62.199511,o
2,36.1,4,91.0,60,1800,16.4,78,Asia,honda civic cvcc,blue,9.0,x
3,18.5,6,250.0,98,3525,19.0,77,US,ford granada,red,34.515625,o
4,34.3,4,97.0,78,2188,15.8,80,Europe,audi 4000,blue,13.298178,s


### MPG z-score

In [52]:
zscore(auto['mpg']).head()

0   -0.697747
1   -1.850853
2    1.621277
3   -0.633685
4    1.390656
Name: mpg, dtype: float64

### MPG z-score by year

In [53]:
auto.groupby('yr')['mpg'].transform(zscore).head()

0   -0.466040
1   -1.627511
2    1.745261
3   -0.730243
4    0.072075
Name: mpg, dtype: float64

### Apply transformation and aggregation

In [54]:

def zscore_with_year_and_name(group):
    """Build a frame with the group's standardized 'mpg' next to its 'yr' and 'name'.

    The 'mpg' column of ``group`` is passed through ``zscore``; the 'yr' column
    is carried over under the label 'year' and 'name' is carried over unchanged.
    """
    columns = {
        'mpg': zscore(group['mpg']),
        'year': group['yr'],
        'name': group['name'],
    }
    return pd.DataFrame(columns)

In [55]:
auto.groupby('yr').apply(zscore_with_year_and_name).head()

Unnamed: 0,mpg,name,year
0,-0.46604,ford mustang,71
1,-1.627511,hi 1200d,70
2,1.745261,honda civic cvcc,78
3,-0.730243,ford granada,77
4,0.072075,audi 4000,80


---
# Let's Practice!

In [108]:
# Detecting outliers with Z-Scores

# # Import zscore
# from scipy.stats import zscore

# # Group gapminder_2010: standardized
# standardized = gapminder_2010.groupby('region')[['life', 'fertility']].transform(zscore)

# # Construct a Boolean Series to identify outliers: outliers
# outliers = (standardized['life'] < -3) | (standardized['fertility'] > 3)

# # Filter gapminder_2010 by the outliers: gm_outliers
# gm_outliers = gapminder_2010.loc[outliers]

# # Print gm_outliers
# print(gm_outliers)

In [112]:
# Filling missing data (imputation) by group

# Create a groupby object: by_sex_class
by_sex_class = titanic.groupby(['sex', 'pclass'])

# Write a function that imputes median
def impute_median(series):
    """Return ``series`` with its missing values replaced by the series median."""
    median_value = series.median()
    return series.fillna(median_value)

# Impute age within each (sex, pclass) group and assign to titanic['age'].
# Bracket assignment is used instead of attribute assignment (titanic.age = ...):
# it always targets the column, whereas attribute assignment only does so when
# the column already exists.
titanic['age'] = by_sex_class['age'].transform(impute_median)

# Print the output of titanic.tail(10)
print(titanic.tail(10))

      pclass  survived                                     name     sex   age  \
1299       3         0                      Yasbeck, Mr. Antoni    male  27.0   
1300       3         1  Yasbeck, Mrs. Antoni (Selini Alexander)  female  15.0   
1301       3         0                     Youseff, Mr. Gerious    male  45.5   
1302       3         0                        Yousif, Mr. Wazli    male  25.0   
1303       3         0                    Yousseff, Mr. Gerious    male  25.0   
1304       3         0                     Zabour, Miss. Hileni  female  14.5   
1305       3         0                    Zabour, Miss. Thamine  female  22.0   
1306       3         0                Zakarian, Mr. Mapriededer    male  26.5   
1307       3         0                      Zakarian, Mr. Ortin    male  27.0   
1308       3         0                       Zimmerman, Mr. Leo    male  29.0   

      sibsp  parch  ticket     fare cabin embarked boat   body home.dest  
1299      1      0    2659  14.45

In [113]:
# Other transformations with .apply


# # Group gapminder_2010 by 'region': regional
# regional = gapminder_2010.groupby('region')

# # Apply the disparity function on regional: reg_disp
# reg_disp = regional.apply(disparity)

# # Print the disparity of 'United States', 'United Kingdom', and 'China'
# print(reg_disp.loc[['United States','United Kingdom','China']])


---
# Groupby and filtering


### Mean MPG by year

In [115]:
auto.groupby('yr')['mpg'].mean()

yr
70    17.689655
71    21.111111
72    18.714286
73    17.100000
74    22.769231
75    20.266667
76    21.573529
77    23.375000
78    24.061111
79    25.093103
80    33.803704
81    30.185714
82    32.000000
Name: mpg, dtype: float64

### Groupby object

- yearly average only for cars built by Chevrolet?
    1. filter
    2. aggregate

In [116]:
splitting = auto.groupby('yr')
type(splitting)

pandas.core.groupby.DataFrameGroupBy

In [120]:
# keys = years, values = corresponding rows of the DataFrame

type(splitting.groups)

dict

In [121]:
splitting.groups.keys()

dict_keys([70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82])

### groupby object : iteration

In [124]:
for group_name, group in splitting:
    avg = group['mpg'].mean()
    print(group_name, avg)

70 17.689655172413794
71 21.11111111111111
72 18.714285714285715
73 17.1
74 22.76923076923077
75 20.266666666666666
76 21.573529411764707
77 23.375
78 24.06111111111111
79 25.09310344827586
80 33.803703703703704
81 30.185714285714283
82 32.0


### groupby object : iteration and filtering

In [125]:
for group_name, group in splitting:
    avg = group.loc[group['name'].str.contains('chevrolet'), 'mpg'].mean()
    print(group_name, avg)

70 15.666666666666666
71 20.25
72 15.333333333333334
73 14.833333333333334
74 18.666666666666668
75 17.666666666666668
76 23.25
77 20.25
78 23.233333333333334
79 21.666666666666668
80 30.05
81 23.5
82 29.0


### groupby object : comprehension

Dictionary comprehension

- keys are the years
- values are filtered averages

In [127]:
chevy_means = {
    year: group.loc[group['name'].str.contains('chevrolet'), 'mpg'].mean()
    for year, group in splitting
}
pd.Series(chevy_means)

70    15.666667
71    20.250000
72    15.333333
73    14.833333
74    18.666667
75    17.666667
76    23.250000
77    20.250000
78    23.233333
79    21.666667
80    30.050000
81    23.500000
82    29.000000
dtype: float64

### Boolean groupby

Boolean column that is True for each automobile manufactured by Chevrolet

- multilevel groupby using the boolean column `chevy` and the 'yr' column
- chaining `mean()` afterwards gives the yearly average MPG ratings for Chevy autos and for all other autos

Results show that Chevy MPG is worse than its competitors' in every year from 70 to 82 except 76

In [128]:
chevy = auto['name'].str.contains('chevrolet')
auto.groupby(['yr', chevy])['mpg'].mean()

yr  name 
70  False    17.923077
    True     15.666667
71  False    21.260870
    True     20.250000
72  False    19.120000
    True     15.333333
73  False    17.500000
    True     14.833333
74  False    23.304348
    True     18.666667
75  False    20.555556
    True     17.666667
76  False    21.350000
    True     23.250000
77  False    23.895833
    True     20.250000
78  False    24.136364
    True     23.233333
79  False    25.488462
    True     21.666667
80  False    34.104000
    True     30.050000
81  False    30.433333
    True     23.500000
82  False    32.461538
    True     29.000000
Name: mpg, dtype: float64

---
# Let's Practice!

In [130]:
# Grouping and filtering with .apply()
def c_deck_survival(gr):
    """Return the mean of 'survived' for rows of ``gr`` whose 'cabin' starts with 'C'.

    Parameters
    ----------
    gr : pandas.DataFrame
        A group of passengers with a string-valued 'cabin' column (missing
        values allowed) and a 'survived' column.

    Returns
    -------
    float
        Mean of 'survived' over the matching rows; NaN when no row matches.
    """
    # na=False treats a missing cabin as "not a C cabin" directly, so the mask
    # is built in one step instead of patching an object Series with
    # .fillna(False) afterwards.
    c_passengers = gr['cabin'].str.startswith('C', na=False)

    return gr.loc[c_passengers, 'survived'].mean()
# Create a groupby object using titanic over the 'sex' column: by_sex
by_sex = titanic.groupby('sex')

# Call by_sex.apply with the function c_deck_survival and print the result
c_surv_by_sex = by_sex.apply(c_deck_survival)

# Print the survival rates
print(c_surv_by_sex)


sex
female    0.913043
male      0.312500
dtype: float64


In [133]:
# Grouping and filtering with .filter()
# Read the CSV file into a DataFrame: sales
#sales = pd.read_csv('sales.csv', index_col='Date', parse_dates=True)

# Group sales by 'Company': by_company
by_company = sales.groupby('Company')

# Compute the sum of the 'Units' of by_company: by_com_sum
by_com_sum = by_company['Units'].sum()
print(by_com_sum)

# Filter 'Units' where the sum is > 35: by_com_filt
by_com_filt = by_company.filter(lambda g:g['Units'].sum() > 35)
print(by_com_filt)

Company
Acme Coporation    34
Hooli              30
Initech            30
Mediacore          45
Streeplex          36
Name: Units, dtype: int64
                       Company   Product  Units
Date                                           
2015-02-02 21:00:00  Mediacore  Hardware      9
2015-02-04 15:30:00  Streeplex  Software     13
2015-02-09 09:00:00  Streeplex   Service     19
2015-02-09 13:00:00  Mediacore  Software      7
2015-02-19 11:00:00  Mediacore  Hardware     16
2015-02-19 16:00:00  Mediacore   Service     10
2015-02-21 05:00:00  Mediacore  Software      3
2015-02-26 09:00:00  Streeplex   Service      4


In [146]:
# Filtering and grouping with .map()
titanic.head(4)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"


In [144]:
# Create a label Series from the Boolean test age < 10: under10
# Gotchas: a missing age compares False and is therefore labelled 'over 10',
# and an age of exactly 10 also falls under 'over 10'.
under10 = (titanic['age']< 10).map({True: 'under 10', False: 'over 10'})

# Group by under10 and compute the survival rate (mean of 'survived')
survived_mean_1 = titanic.groupby(under10)['survived'].mean()
print(survived_mean_1)

# Group by under10 and pclass and compute the survival rate
survived_mean_2 = titanic.groupby([under10, 'pclass'])['survived'].mean()
print(survived_mean_2)


age
over 10     0.366748
under 10    0.609756
Name: survived, dtype: float64
age       pclass
over 10   1         0.617555
          2         0.380392
          3         0.238897
under 10  1         0.750000
          2         1.000000
          3         0.446429
Name: survived, dtype: float64
