# Pandas: grouping

In [228]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [181]:
df = pd.read_csv("vehicles.csv")

# How many car models? 

In [182]:
df['Model'].value_counts().count()

3608

# Group by brand

In [183]:
df.groupby('Make').Model.count()

Make
AM General                        4
ASC Incorporated                  1
Acura                           302
Alfa Romeo                       41
American Motors Corporation      22
                               ... 
Volkswagen                     1047
Volvo                           717
Wallace Environmental            32
Yugo                              8
smart                            20
Name: Model, Length: 127, dtype: int64

*Converting Grams/Mile to Grams/Km 1 Mile = 1.60934 Km*

*Converting Gallons to Liters: 1 Gallon = 3.78541 Liters*

# Brand with the most cars?

In [184]:
most_models = df.groupby('Make')[['Model']].count()
most_models.sort_values('Model')

Unnamed: 0_level_0,Model
Make,Unnamed: 1_level_1
Mahindra,1
General Motors,1
Qvale,1
Fisker,1
S and S Coach Company E.p. Dutton,1
...,...
Toyota,1836
GMC,2347
Dodge,2360
Ford,2946


# Average CO2_Emission_Grams/Km by brand

In [185]:
df['CO2 Emission Grams/Mile'] = df['CO2 Emission Grams/Mile']/1.60934
df.groupby(['Make'])[['CO2 Emission Grams/Mile']].mean().sort_values('CO2 Emission Grams/Mile').tail()

Unnamed: 0_level_0,CO2 Emission Grams/Mile
Make,Unnamed: 1_level_1
Laforza Automobile Inc,502.012683
Bugatti,542.497235
Superior Coaches Div E.p. Dutton,552.213951
S and S Coach Company E.p. Dutton,552.213951
Vector,651.919248


# (Optional) 

Use `pd.cut` or `pd.qcut` to create 4 groups (bins) of cars, by Year. We want to explore how cars have evolved decade by decade.

In [212]:
df.Year.describe()

count    35952.00000
mean      2000.71640
std         10.08529
min       1984.00000
25%       1991.00000
50%       2001.00000
75%       2010.00000
max       2017.00000
Name: Year, dtype: float64

In [213]:
df.bins = pd.cut(x = df.Year, bins = [1980, 1990, 2000, 2010])
years = df.groupby('bins')['Year'].count()
years

bins
(1980, 1990]     7926
(1990, 2000]     9169
(2000, 2010]    10866
Name: Year, dtype: int64

### Did cars consume more gas in the eighties?

show the average City_Km/Liter by year_range

In [218]:
df.groupby('bins')[['City MPG']].mean() # yes

Unnamed: 0_level_0,City MPG
bins,Unnamed: 1_level_1
"(1980, 1990]",17.287913
"(1990, 2000]",16.945359
"(2000, 2010]",16.955273


### Which brands are more environment friendly?

In [220]:
df.groupby('Make')[['CO2 Emission Grams/Mile']].min()

Unnamed: 0_level_0,CO2 Emission Grams/Mile
Make,Unnamed: 1_level_1
AM General,324.831736
ASC Incorporated,345.133719
Acura,141.672984
Alfa Romeo,196.975158
American Motors Corporation,262.959024
...,...
Volkswagen,124.274547
Volvo,147.886711
Wallace Environmental,290.638922
Yugo,212.389981


### Does the drivetrain affect fuel consumption?

In [236]:
drivetrain_consumption = df.groupby('Drivetrain')[['Fuel Barrels/Year']].mean()
drivetrain_consumption.sort_values('Fuel Barrels/Year') # yes, there is a difference if we do not clean up data regarding 2 wheel drive as it appears twice with different naming

Unnamed: 0_level_0,Fuel Barrels/Year
Drivetrain,Unnamed: 1_level_1
"2-Wheel Drive, Front",11.771786
Front-Wheel Drive,14.266654
All-Wheel Drive,16.349672
4-Wheel Drive,17.942952
Rear-Wheel Drive,19.587486
4-Wheel or All-Wheel Drive,20.48472
Part-time 4-Wheel Drive,20.628218
2-Wheel Drive,21.069467


In [235]:
# off topic
# filtering syntax
df[df.Drivetrain == '2-Wheel Drive']['Fuel Barrels/Year'].mean()

21.069466815225585

### Do cars with automatic transmission consume more fuel than cars with manual transmission?

In [239]:
df.Transmission.unique()

array(['Automatic 3-spd', 'Automatic 4-spd', 'Manual 5-spd',
       'Automatic (S5)', 'Manual 6-spd', 'Automatic 5-spd', 'Auto(AM8)',
       'Auto(AM-S8)', 'Auto(AV-S7)', 'Automatic (S6)', 'Automatic (S9)',
       'Automatic (S4)', 'Auto(AM-S9)', 'Automatic (S7)', 'Auto(AM7)',
       'Auto(AM-S7)', 'Auto(AM6)', 'Automatic 6-spd', 'Manual 4-spd',
       'Automatic (S8)', 'Manual(M7)', 'Auto(AM-S6)',
       'Automatic (variable gear ratios)', 'Automatic (AV)',
       'Auto(AV-S8)', 'Automatic (AM6)', 'Automatic 8-spd', 'Auto(A1)',
       'Automatic (A1)', 'Automatic (A6)', 'Auto(AV-S6)', 'Manual 3-spd',
       'Manual 7-spd', 'Automatic 9-spd', 'Auto (AV)', 'Automatic 6spd',
       'Auto(L4)', 'Auto(L3)', 'Auto (AV-S6)', 'Auto (AV-S8)',
       'Automatic (AV-S6)', 'Automatic 7-spd', 'Manual 5 spd',
       'Auto(AM5)', 'Automatic (AM5)'], dtype=object)

Use `groupby` and `agg` with different aggregation measures for different columns:

In [None]:
# 

aggregate with average City_Km/Liter and the count of the Trans

In [143]:
## your code is here

Unnamed: 0_level_0,City_Km/Liter,Trans
Trans,Unnamed: 1_level_1,Unnamed: 2_level_1
Automatic,7.278292,24290
Manual,7.968348,11662


aggregate with average City_Km/Liter and the minimum of the Trans

In [144]:
### your code is here

Unnamed: 0_level_0,City_Km/Liter
Trans,Unnamed: 1_level_1
Automatic,2.976
Manual,2.550857


## Off topic: learn groupby method

In [92]:
# learn groupby
l = [[1, 2, 1], [5, 2, 1], [2, 1, 1], [1, 2, 1]]
df = pd.DataFrame(l, columns=["a", "b", "c"])
df

Unnamed: 0,a,b,c
0,1,2,1
1,5,2,1
2,2,1,1
3,1,2,1


In [93]:
# learn groupby
df.groupby(["a"]).count()
# it basically just counts how many 1's, 2's and 5's there are in the 'a' column
# it is not counting the other columns at all
# just replicates the result

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2,2
2,1,1
5,1,1
