# Using Pandas

In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 200)
## to make it possible to display multiple output inside one cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

<b>load the data from the vehicles.csv file into pandas data frame

In [3]:
## Your Code here
vehicles_df = pd.read_csv("data/vehicles.csv")
vehicles_df

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.437500,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.437500,2550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,smart,fortwo coupe,2013,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100
35948,smart,fortwo coupe,2014,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,243.000000,1100
35949,smart,fortwo coupe,2015,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100
35950,smart,fortwo coupe,2016,0.9,3.0,Auto(AM6),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,39,36,246.000000,1100


First exploration of the dataset:

- How many observations does it have?
- Look at all the columns: do you understand what they mean?
- Look at the raw data: do you see anything weird?
- Look at the data types: are they the expected ones for the information the column contains?

In [4]:
#To check for how many observations there are
#Use the .info() function

vehicles_df.info()
# There are 3592 rows in the observation
print("\n")
#Observe the columns only
print(vehicles_df.columns, "\n")

#observing the raw data to look for outliers
vehicles_df.describe()

#Viewing the data types
type(vehicles_df.dtypes)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35952 entries, 0 to 35951
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Make                     35952 non-null  object 
 1   Model                    35952 non-null  object 
 2   Year                     35952 non-null  int64  
 3   Engine Displacement      35952 non-null  float64
 4   Cylinders                35952 non-null  float64
 5   Transmission             35952 non-null  object 
 6   Drivetrain               35952 non-null  object 
 7   Vehicle Class            35952 non-null  object 
 8   Fuel Type                35952 non-null  object 
 9   Fuel Barrels/Year        35952 non-null  float64
 10  City MPG                 35952 non-null  int64  
 11  Highway MPG              35952 non-null  int64  
 12  Combined MPG             35952 non-null  int64  
 13  CO2 Emission Grams/Mile  35952 non-null  float64
 14  Fuel Cost/Year        

Unnamed: 0,Year,Engine Displacement,Cylinders,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
count,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0,35952.0
mean,2000.7164,3.338493,5.765076,17.609056,17.646139,23.880646,19.929322,475.316339,1892.598465
std,10.08529,1.359395,1.755268,4.467283,4.769349,5.890876,5.112409,119.060773,506.958627
min,1984.0,0.6,2.0,0.06,6.0,9.0,7.0,37.0,600.0
25%,1991.0,2.2,4.0,14.699423,15.0,20.0,16.0,395.0,1500.0
50%,2001.0,3.0,6.0,17.347895,17.0,24.0,19.0,467.736842,1850.0
75%,2010.0,4.3,6.0,20.600625,20.0,27.0,23.0,555.4375,2200.0
max,2017.0,8.4,16.0,47.087143,58.0,61.0,56.0,1269.571429,5800.0


pandas.core.series.Series

### Cleaning and wrangling data

- Some car brand names refer to the same brand. Replace all brand names that contain the word "Dutton" for simply "Dutton". If you find similar examples, clean their names too. Use `loc` with boolean indexing.

- Convert CO2 Emissions from Grams/Mile to Grams/Km

- Create a binary column that solely indicates if the transmission of a car is automatic or manual. Use `pandas.Series.str.startswith` and .

- convert MPG columns to km_per_liter

Note:
<br>Converting Grams/Mile to Grams/Km

1 Mile = 1.60934 Km

Converting Gallons to Liters

1 Gallon = 3.78541 Liters



In [19]:
# I Search and replace Brand names that contain Dutton
vehicles_df[vehicles_df.Make.str.contains('Dutton')]
vehicles_df.loc[vehicles_df['Make'].str.contains('Dutton', case = False), 'Make'] = 'Dutton'
print(vehicles_df)
print('\n')


# Converting CO2 Emissions from Grams/Mile to Grams/Km
vehicles_df['CO2 Emission Grams/Km'] = vehicles_df['CO2 Emission Grams/Mile'] / 1.60934
print('\n')


#Binary column that solely indicates if transmission is manual or automatic
def trans_replace(name):
    if name.startswith("A"):
        return 1
    else:
        return 0

vehicles_df["Transmission 1/0"] =vehicles_df.Transmission.apply(trans_replace)
vehicles_df                 
print('\n')

#convert MPG columns to km_per_liter
vehicles_df ["City KPL"] = vehicles_df["City MPG"] / 3.78541
vehicles_df['Highway KPL'] = vehicles_df['Highway MPG'] / 3.78541
vehicles_df['Combined KPL'] = vehicles_df['Combined MPG'] / 3.78541

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year,CO2 Emission Grams/Km,Transmission 1/0,City KPL,Highway KPL,Combined KPL
11012,Dutton,Funeral Coach,1985,4.1,8.0,Automatic 4-spd,Front-Wheel Drive,Special Purpose Vehicles,Regular,19.388824,15,21,17,522.764706,1950,324.831736,1,3.962583,5.547616,4.490927
30164,Dutton,Funeral Coach 2WD,1984,6.0,8.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,32.961,9,11,10,888.7,3350,552.213951,1,2.37755,2.905894,2.641722
31754,Dutton,Funeral Coach 2WD,1984,6.0,8.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,32.961,10,11,10,888.7,3350,552.213951,1,2.641722,2.905894,2.641722


                   Make                Model  Year  Engine Displacement  \
0            AM General    DJ Po Vehicle 2WD  1984                  2.5   
1            AM General     FJ8c Post Office  1984                  4.2   
2            AM General  Post Office DJ5 2WD  1985                  2.5   
3            AM General  Post Office DJ8 2WD  1985                  4.2   
4      ASC Incorporated                  GNX  1987                  3.8   
...                 ...                  ...   ...                  ...   
35947             smart         fortwo coupe  2013                  1.0   
35948             smart         fortwo coupe  2014                  1.0   
35949             smart         fortwo coupe  2015                  1.0   
35950             smart         fortwo coupe  2016                  0.9   
35951             smart         fortwo coupe  2016                  0.9   

       Cylinders     Transmission        Drivetrain  \
0            4.0  Automatic 3-spd     2-Whee

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year,CO2 Emission Grams/Km,Transmission 1/0,City KPL,Highway KPL,Combined KPL
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950,324.831736,1,4.755099,4.490927,4.490927
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550,424.779962,1,3.434238,3.434238,3.434238
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.437500,2100,345.133719,1,4.226755,4.490927,4.226755
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550,424.779962,1,3.434238,3.434238,3.434238
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.437500,2550,345.133719,1,3.698410,5.547616,4.226755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,smart,fortwo coupe,2013,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100,151.614948,1,8.981854,10.038543,9.510198
35948,smart,fortwo coupe,2014,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,243.000000,1100,150.993575,1,8.981854,10.038543,9.510198
35949,smart,fortwo coupe,2015,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100,151.614948,1,8.981854,10.038543,9.510198
35950,smart,fortwo coupe,2016,0.9,3.0,Auto(AM6),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,39,36,246.000000,1100,152.857693,1,8.981854,10.302715,9.510198






### Gathering insights:

- How many car makers are there? How many models? Which car maker has the most cars in the dataset?

- When were these cars made? How big is the engine of these cars?

- What's the frequency of different transmissions, drivetrains and fuel types?

- What's the car that consumes the least/most fuel?

In [6]:
# How many car makers are there?
len(vehicles_df['Make'].unique())

#How many models?
len(vehicles_df['Model'].unique())

#Which car maker has the most cars in the dataset?
vehicles_df['Make'].mode()

125

3608

0    Chevrolet
Name: Make, dtype: object

In [7]:
#years cars were made
vehicles_df.iloc[ : , [0,2]]

#how big is the engine
vehicles_df.iloc[ : , [0,4]]

#frequency of different transmissions, drivetrains and fuel types
vehicles_df2 = vehicles_df.groupby(['Transmission','Drivetrain','Fuel Type']).count()
vehicles_df2

Unnamed: 0,Make,Year
0,AM General,1984
1,AM General,1984
2,AM General,1985
3,AM General,1985
4,ASC Incorporated,1987
...,...,...
35947,smart,2013
35948,smart,2014
35949,smart,2015
35950,smart,2016


Unnamed: 0,Make,Cylinders
0,AM General,4.0
1,AM General,6.0
2,AM General,4.0
3,AM General,6.0
4,ASC Incorporated,6.0
...,...,...
35947,smart,3.0
35948,smart,3.0
35949,smart,3.0
35950,smart,3.0


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Make,Model,Year,Engine Displacement,Cylinders,Vehicle Class,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year,CO2 Emission Grams/Km,Transmission 1/0,City KPL,Highway KPL,Combined KPL
Transmission,Drivetrain,Fuel Type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Auto (AV),Front-Wheel Drive,Premium Gas or Electricity,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
Auto (AV-S6),Rear-Wheel Drive,Premium,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Auto (AV-S8),All-Wheel Drive,Premium,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Auto(A1),Rear-Wheel Drive,Premium Gas or Electricity,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Auto(AM-S6),All-Wheel Drive,Premium,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Manual 6-spd,Rear-Wheel Drive,Regular,161,161,161,161,161,161,161,161,161,161,161,161,161,161,161,161,161
Manual 7-spd,4-Wheel Drive,Premium,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33
Manual 7-spd,All-Wheel Drive,Premium,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
Manual 7-spd,Rear-Wheel Drive,Premium,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33


In [8]:
#The car that consumes the least fuel
vehicles_df['Fuel Barrels/Year'].min()

#The car that consumes the most fuel
vehicles_df['Fuel Barrels/Year'].max()

0.06

47.08714285714285

In [9]:
#When were these cars made? 
vehicles_df.iloc[: , 2]

0        1984
1        1984
2        1985
3        1985
4        1987
         ... 
35947    2013
35948    2014
35949    2015
35950    2016
35951    2016
Name: Year, Length: 35952, dtype: int64

<b> (Optional)

What brand has the worse CO2 Emissions on average?

Hint: use the function `sort_values()`

In [10]:
## your Code here

vehicles_df3 = vehicles_df.sort_values(by=['Make','CO2 Emission Grams/Km'], ascending = False)
print(vehicles_df3)

                   Make                Model  Year  Engine Displacement  \
35939             smart     fortwo cabriolet  2017                  0.9   
35951             smart         fortwo coupe  2016                  0.9   
35938             smart     fortwo cabriolet  2017                  0.9   
35932             smart     fortwo cabriolet  2010                  1.0   
35933             smart     fortwo cabriolet  2011                  1.0   
...                 ...                  ...   ...                  ...   
4      ASC Incorporated                  GNX  1987                  3.8   
1            AM General     FJ8c Post Office  1984                  4.2   
3            AM General  Post Office DJ8 2WD  1985                  4.2   
2            AM General  Post Office DJ5 2WD  1985                  2.5   
0            AM General    DJ Po Vehicle 2WD  1984                  2.5   

       Cylinders     Transmission        Drivetrain  \
35939        3.0     Manual 5-spd  Rear-Whee

Do cars with automatic transmission consume more fuel than cars with manual transmission on average?

In [32]:
## Your Code is here 
def trans(x):
    if x.startswith ('A'):
        return 'Automatic'
    elif x.startswith('M'):
        return 'Manual'
    else: pass
vehicles_df['Trans'] = vehicles_df['Transmission'].apply(trans)


In [33]:
vehicles_df.groupby(['Trans'])['CO2 Emission Grams/Km'].mean().sort_values(ascending = False)


Trans
Automatic    302.853002
Manual       279.718227
Name: CO2 Emission Grams/Km, dtype: float64