In [80]:
from pydataset import data

In [81]:
import pandas as pd
import numpy as np

In [82]:
data('mpg', show_doc=True) # view the documentation for the dataset

mpg

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Fuel economy data from 1999 and 2008 for 38 popular models of car

### Description

This dataset contains a subset of the fuel economy data that the EPA makes
available on http://fueleconomy.gov. It contains only models which had a new
release every year between 1999 and 2008 - this was used as a proxy for the
popularity of the car.

### Usage

    data(mpg)

### Format

A data frame with 234 rows and 11 variables

### Details

  * manufacturer. 

  * model. 

  * displ. engine displacement, in litres 

  * year. 

  * cyl. number of cylinders 

  * trans. type of transmission 

  * drv. f = front-wheel drive, r = rear wheel drive, 4 = 4wd 

  * cty. city miles per gallon 

  * hwy. highway miles per gallon 

  * fl. 

  * class. 




In [83]:
mpg = data('mpg') # load the dataset and store it in a variable

In [84]:
mpg

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact
...,...,...,...,...,...,...,...,...,...,...,...
230,volkswagen,passat,2.0,2008,4,auto(s6),f,19,28,p,midsize
231,volkswagen,passat,2.0,2008,4,manual(m6),f,21,29,p,midsize
232,volkswagen,passat,2.8,1999,6,auto(l5),f,16,26,p,midsize
233,volkswagen,passat,2.8,1999,6,manual(m5),f,18,26,p,midsize


In [85]:
type(mpg)

pandas.core.frame.DataFrame

###     --------------------------------------------------------------------------------- ###

In [86]:
# Seed NumPy's global RNG so the generated grades are the same on every run.
np.random.seed(123)

students = [
    'Sally', 'Jane', 'Suzie', 'Billy', 'Ada', 'John',
    'Thomas', 'Marie', 'Albert', 'Richard', 'Isaac', 'Alan',
]
n_students = len(students)

# Draw one integer score in [60, 100) per student for each subject.
# Every column must have the same length as `students`, and the draw order
# (math, english, reading) determines which values each subject receives.
math_grades = np.random.randint(60, 100, size=n_students)
english_grades = np.random.randint(60, 100, size=n_students)
reading_grades = np.random.randint(60, 100, size=n_students)

# Assemble the gradebook: one row per student, one column per subject.
df = pd.DataFrame(
    {
        'name': students,
        'math': math_grades,
        'english': english_grades,
        'reading': reading_grades,
    }
)

type(df)

pandas.core.frame.DataFrame

In [87]:
df

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [88]:
# Flag students whose English score meets the passing threshold (70 or higher)
df['passing_english'] = df['english'].ge(70)

In [89]:
df

Unnamed: 0,name,math,english,reading,passing_english
0,Sally,62,85,80,True
1,Jane,88,79,67,True
2,Suzie,94,74,95,True
3,Billy,98,96,88,True
4,Ada,77,92,98,True
5,John,79,76,93,True
6,Thomas,82,64,81,False
7,Marie,93,63,90,False
8,Albert,92,62,87,False
9,Richard,69,80,94,True


In [90]:
# Sort the rows by the passing_english flag: failing (False) first, passing (True) last.
# kind='stable' keeps tied rows in their previous relative order (the default
# quicksort gives no such guarantee); reassigning avoids inplace mutation.
df = df.sort_values(by='passing_english', kind='stable')

In [91]:
df

Unnamed: 0,name,math,english,reading,passing_english
6,Thomas,82,64,81,False
7,Marie,93,63,90,False
8,Albert,92,62,87,False
11,Alan,92,62,72,False
0,Sally,62,85,80,True
1,Jane,88,79,67,True
2,Suzie,94,74,95,True
3,Billy,98,96,88,True
4,Ada,77,92,98,True
5,John,79,76,93,True


In [92]:
# Sort by passing_english, then alphabetically by student name within each group.
# The sorted frame is reassigned instead of using inplace=True.
df = df.sort_values(by=['passing_english', 'name'])

In [93]:
df

Unnamed: 0,name,math,english,reading,passing_english
11,Alan,92,62,72,False
8,Albert,92,62,87,False
7,Marie,93,63,90,False
6,Thomas,82,64,81,False
4,Ada,77,92,98,True
3,Billy,98,96,88,True
10,Isaac,92,99,93,True
1,Jane,88,79,67,True
5,John,79,76,93,True
9,Richard,69,80,94,True


In [94]:
# Sort by passing_english and then by english score.
# Pandas orders rows by the first column and breaks ties using the second column.
# NOTE(review): rows that tie on both keys appear to keep their previous relative
# order -- confirm against the pandas docs before relying on it.
# The sorted frame is reassigned instead of using inplace=True.
df = df.sort_values(by=['passing_english', 'english'])

In [95]:
df

Unnamed: 0,name,math,english,reading,passing_english
11,Alan,92,62,72,False
8,Albert,92,62,87,False
7,Marie,93,63,90,False
6,Thomas,82,64,81,False
2,Suzie,94,74,95,True
5,John,79,76,93,True
1,Jane,88,79,67,True
9,Richard,69,80,94,True
0,Sally,62,85,80,True
4,Ada,77,92,98,True


In [96]:
# Overall grade = average of the math, english, and reading scores for each student
subject_columns = ['math', 'english', 'reading']
df['overall_grade'] = df[subject_columns].sum(axis=1) / 3

In [97]:
df

Unnamed: 0,name,math,english,reading,passing_english,overall_grade
11,Alan,92,62,72,False,75.333333
8,Albert,92,62,87,False,80.333333
7,Marie,93,63,90,False,82.0
6,Thomas,82,64,81,False,75.666667
2,Suzie,94,74,95,True,87.666667
5,John,79,76,93,True,82.666667
1,Jane,88,79,67,True,78.0
9,Richard,69,80,94,True,81.0
0,Sally,62,85,80,True,75.666667
4,Ada,77,92,98,True,89.0


In [98]:
# Store the overall grade as a whole number.
# Note: casting float -> int drops the fractional part (87.67 becomes 87); it does not round.
df = df.astype({'overall_grade': int})

In [99]:
df

Unnamed: 0,name,math,english,reading,passing_english,overall_grade
11,Alan,92,62,72,False,75
8,Albert,92,62,87,False,80
7,Marie,93,63,90,False,82
6,Thomas,82,64,81,False,75
2,Suzie,94,74,95,True,87
5,John,79,76,93,True,82
1,Jane,88,79,67,True,78
9,Richard,69,80,94,True,81
0,Sally,62,85,80,True,75
4,Ada,77,92,98,True,89


###     --------------------------------------------------------------------------------- ###

In [100]:
mpg

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact
...,...,...,...,...,...,...,...,...,...,...,...
230,volkswagen,passat,2.0,2008,4,auto(s6),f,19,28,p,midsize
231,volkswagen,passat,2.0,2008,4,manual(m6),f,21,29,p,midsize
232,volkswagen,passat,2.8,1999,6,auto(l5),f,16,26,p,midsize
233,volkswagen,passat,2.8,1999,6,manual(m5),f,18,26,p,midsize


In [101]:
# Report the dataframe's dimensions (rows, columns)
n_rows, n_cols = mpg.shape
print("Number of rows:", n_rows)
print("Number of columns:", n_cols)

Number of rows: 234
Number of columns: 11


In [102]:
# Print the data types of each column
print(mpg.dtypes)

manufacturer     object
model            object
displ           float64
year              int64
cyl               int64
trans            object
drv              object
cty               int64
hwy               int64
fl               object
class            object
dtype: object


In [103]:
# Summarize the dataframe with info.
# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
mpg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234 entries, 1 to 234
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   manufacturer  234 non-null    object 
 1   model         234 non-null    object 
 2   displ         234 non-null    float64
 3   year          234 non-null    int64  
 4   cyl           234 non-null    int64  
 5   trans         234 non-null    object 
 6   drv           234 non-null    object 
 7   cty           234 non-null    int64  
 8   hwy           234 non-null    int64  
 9   fl            234 non-null    object 
 10  class         234 non-null    object 
dtypes: float64(1), int64(4), object(6)
memory usage: 21.9+ KB
None


In [104]:
# Summarize the dataframe with describe
print(mpg.describe())

            displ         year         cyl         cty         hwy
count  234.000000   234.000000  234.000000  234.000000  234.000000
mean     3.471795  2003.500000    5.888889   16.858974   23.440171
std      1.291959     4.509646    1.611534    4.255946    5.954643
min      1.600000  1999.000000    4.000000    9.000000   12.000000
25%      2.400000  1999.000000    4.000000   14.000000   18.000000
50%      3.300000  2003.500000    6.000000   17.000000   24.000000
75%      4.600000  2008.000000    8.000000   19.000000   27.000000
max      7.000000  2008.000000    8.000000   35.000000   44.000000


In [105]:
# Rename the 'cty' and 'hwy' columns to the more descriptive 'city' and 'highway'.
# The renamed frame is reassigned instead of mutating mpg with inplace=True.
mpg = mpg.rename(columns={'cty': 'city', 'hwy': 'highway'})

In [106]:
mpg

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact
...,...,...,...,...,...,...,...,...,...,...,...
230,volkswagen,passat,2.0,2008,4,auto(s6),f,19,28,p,midsize
231,volkswagen,passat,2.0,2008,4,manual(m6),f,21,29,p,midsize
232,volkswagen,passat,2.8,1999,6,auto(l5),f,16,26,p,midsize
233,volkswagen,passat,2.8,1999,6,manual(m5),f,18,26,p,midsize


In [107]:
# Check if any cars have better city mileage than highway mileage.
# The row-wise comparison yields a boolean Series; .any() collapses it into a
# single True/False answer. (Comparing a list that wraps the Series to True,
# as `[series] == True` does, is always False regardless of the data.)
print((mpg['city'] > mpg['highway']).any())

False


In [108]:
# Add a column holding highway mileage minus city mileage for each car
mpg['mileage_difference'] = mpg['highway'].sub(mpg['city'])

In [109]:
mpg

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,11
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,8
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,11
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,9
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,10
...,...,...,...,...,...,...,...,...,...,...,...,...
230,volkswagen,passat,2.0,2008,4,auto(s6),f,19,28,p,midsize,9
231,volkswagen,passat,2.0,2008,4,manual(m6),f,21,29,p,midsize,8
232,volkswagen,passat,2.8,1999,6,auto(l5),f,16,26,p,midsize,10
233,volkswagen,passat,2.8,1999,6,manual(m5),f,18,26,p,midsize,8


In [110]:
# Look up the row with the largest highway-minus-city gap
# (idxmax gives the index label of the first row holding the maximum)
widest_gap_label = mpg['mileage_difference'].idxmax()
print(mpg.loc[widest_gap_label])

manufacturer               honda
model                      civic
displ                        1.8
year                        2008
cyl                            4
trans                   auto(l5)
drv                            f
city                          24
highway                       36
fl                             c
class                 subcompact
mileage_difference            12
Name: 107, dtype: object


In [111]:
# Find the compact car with the lowest highway mileage
compact_mpg = mpg.loc[mpg['class'].eq('compact')]
lowest_highway_label = compact_mpg['highway'].idxmin()
print(compact_mpg.loc[lowest_highway_label])

manufacturer          volkswagen
model                      jetta
displ                        2.8
year                        1999
cyl                            6
trans                   auto(l4)
drv                            f
city                          16
highway                       23
fl                             r
class                    compact
mileage_difference             7
Name: 220, dtype: object


In [112]:
# Find the compact car with the highest (best) highway mileage
is_compact = mpg['class'].eq('compact')
compact_mpg = mpg.loc[is_compact]
best_highway_label = compact_mpg['highway'].idxmax()
print(compact_mpg.loc[best_highway_label])

manufacturer          volkswagen
model                      jetta
displ                        1.9
year                        1999
cyl                            4
trans                 manual(m5)
drv                            f
city                          33
highway                       44
fl                             d
class                    compact
mileage_difference            11
Name: 213, dtype: object


In [113]:
# Average mileage = midpoint of the city and highway figures for each car
mpg['average_mileage'] = mpg[['city', 'highway']].sum(axis=1) / 2

In [114]:
# Find the Dodge car with the best (highest) average mileage
dodge_mpg = mpg.loc[mpg['manufacturer'].eq('dodge')]
best_dodge_label = dodge_mpg['average_mileage'].idxmax()
print(dodge_mpg.loc[best_dodge_label])

manufacturer                dodge
model                 caravan 2wd
displ                         2.4
year                         1999
cyl                             4
trans                    auto(l3)
drv                             f
city                           18
highway                        24
fl                              r
class                     minivan
mileage_difference              6
average_mileage              21.0
Name: 38, dtype: object


In [115]:
mpg

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference,average_mileage
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,11,23.5
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,8,25.0
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,11,25.5
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,9,25.5
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,10,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,volkswagen,passat,2.0,2008,4,auto(s6),f,19,28,p,midsize,9,23.5
231,volkswagen,passat,2.0,2008,4,manual(m6),f,21,29,p,midsize,8,25.0
232,volkswagen,passat,2.8,1999,6,auto(l5),f,16,26,p,midsize,10,21.0
233,volkswagen,passat,2.8,1999,6,manual(m5),f,18,26,p,midsize,8,22.0


In [116]:
# Find the Dodge car with the worst (lowest) average mileage
is_dodge = mpg['manufacturer'].eq('dodge')
dodge_mpg = mpg.loc[is_dodge]
worst_dodge_label = dodge_mpg['average_mileage'].idxmin()
print(dodge_mpg.loc[worst_dodge_label])

manufacturer                      dodge
model                 dakota pickup 4wd
displ                               4.7
year                               2008
cyl                                   8
trans                          auto(l5)
drv                                   4
city                                  9
highway                              12
fl                                    e
class                            pickup
mileage_difference                    3
average_mileage                    10.5
Name: 55, dtype: object


###     --------------------------------------------------------------------------------- ###

In [117]:
data('Mammals', show_doc=True) # view the documentation for the dataset
mamm = data('Mammals') # load the dataset and store it in a variable

Mammals

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Garland(1983) Data on Running Speed of Mammals

### Description

Observations on the maximal running speed of mammal species and their body
mass.

### Usage

    data(Mammals)

### Format

A data frame with 107 observations on the following 4 variables.

weight

Body mass in Kg for "typical adult sizes"

speed

Maximal running speed (fastest sprint velocity on record)

hoppers

logical variable indicating animals that ambulate by hopping, e.g. kangaroos

specials

logical variable indicating special animals with "lifestyles in which speed
does not figure as an important factor": Hippopotamus, raccoon (Procyon),
badger (Meles), coati (Nasua), skunk (Mephitis), man (Homo), porcupine
(Erithizon), oppossum (didelphis), and sloth (Bradypus)

### Details

Used by Chappell (1989) and Koenker, Ng and Portnoy (1994) to illustrate the
fitting of piecewise linear curves.

### Source

Garland, T. (

In [118]:
#View the dataframe
mamm

Unnamed: 0,weight,speed,hoppers,specials
1,6000.0,35.0,False,False
2,4000.0,26.0,False,False
3,3000.0,25.0,False,False
4,1400.0,45.0,False,False
5,400.0,70.0,False,False
6,350.0,70.0,False,False
7,300.0,64.0,False,False
8,260.0,70.0,False,False
9,250.0,40.0,False,False
10,3800.0,25.0,False,True


In [119]:
# Report the dataframe's dimensions (rows, columns)
n_rows, n_cols = mamm.shape
print("Number of rows:", n_rows)
print("Number of columns:", n_cols)

Number of rows: 107
Number of columns: 4


In [120]:
# Print the data types of each column
print(mamm.dtypes)

weight      float64
speed       float64
hoppers        bool
specials       bool
dtype: object


In [121]:
# Summarize the dataframe with info.
# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
mamm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107 entries, 1 to 107
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   weight    107 non-null    float64
 1   speed     107 non-null    float64
 2   hoppers   107 non-null    bool   
 3   specials  107 non-null    bool   
dtypes: bool(2), float64(2)
memory usage: 2.7 KB
None


In [122]:
# Summarize the dataframe with describe
print(mamm.describe())

            weight       speed
count   107.000000  107.000000
mean    278.688178   46.208411
std     839.608269   26.716778
min       0.016000    1.600000
25%       1.700000   22.500000
50%      34.000000   48.000000
75%     142.500000   65.000000
max    6000.000000  110.000000


In [123]:
# Locate the row with the highest recorded speed, then report that animal's weight
fastest_label = mamm['speed'].idxmax()
fastest = mamm.loc[fastest_label]
print(fastest['weight'])

55.0


In [124]:
# Share of animals flagged as "specials", as a percentage of all rows
n_specials = int(mamm['specials'].sum())
percent_specials = (n_specials / len(mamm)) * 100
f"{percent_specials:.2f}% of all animals has specials."

'9.35% of all animals has specials.'

In [125]:
# Calculate the percentage of all mammals that are hoppers above the median speed.
# The median and the hopper count are computed here so this cell runs on a fresh
# kernel instead of relying on variables that are only defined in a later cell.
median_speed = mamm['speed'].median()
hoppers_above_median = len(mamm[mamm['hoppers'] & (mamm['speed'] > median_speed)])
percentage_hoppers_above_median = (hoppers_above_median / len(mamm)) * 100

In [126]:
# Count the number of hoppers above the median speed of all mammals.
# median_speed is computed here so the cell does not depend on a later cell
# having been run first.
median_speed = mamm['speed'].median()
hoppers_above_median = len(mamm[mamm['hoppers'] & (mamm['speed'] > median_speed)])

In [127]:
# Find the number of hoppers above the overall median speed and the share of
# ALL mammals they represent (the denominator is len(mamm), not the hopper count,
# so the message is worded accordingly).
median_speed = mamm['speed'].median()
hoppers_above_median = len(mamm[mamm['hoppers'] & (mamm['speed'] > median_speed)])
print("Number of hoppers above median mammal speed:", hoppers_above_median)
percentage_hoppers_above_median = (hoppers_above_median / len(mamm)) * 100
f"{percentage_hoppers_above_median:.2f}% of all mammals are hoppers above the overall median speed."

Number of hoppers above median mammal speed: 7


'6.54% of hoppers are above all mammals median speed.'

In [128]:
# Count hoppers whose speed exceeds the median speed of all mammals
fast_hopper_mask = mamm['hoppers'].eq(True) & mamm['speed'].gt(median_speed)
hoppers_above_median = int(fast_hopper_mask.sum())
f"Number of hoppers above the median is {hoppers_above_median}."

'Number of hoppers above the median is 7.'

In [129]:
median_speed

48.0

In [130]:
# Keep only the rows where the hoppers flag is True
hoppers_df = mamm.loc[mamm['hoppers'].eq(True)]

In [131]:
# Show length/count of hoppers_df
len(hoppers_df)

11

In [132]:
# Compute the median speed among the hoppers only (stored for later cells, not displayed)
hoppers_median_speed = hoppers_df.loc[:, 'speed'].median()

In [133]:
# Count the hoppers that are faster than the median hopper speed.
# hoppers_df already holds only the hopper rows, so it is filtered directly;
# the message typo "hooper" is corrected to "hopper".
hoppers_above_median_hop_speed = len(hoppers_df[hoppers_df['speed'] > hoppers_median_speed])
f"Number of hoppers above the median hopper speed is {hoppers_above_median_hop_speed}."

'Number of hoppers above the median hooper speed is 4.'

In [134]:
# Share of hoppers that beat the median hopper speed, as a percentage of all hoppers
n_hoppers = len(hoppers_df)
percentage_hoppers_above_median_hop_speed = (hoppers_above_median_hop_speed / n_hoppers) * 100

In [135]:
# Report how many hoppers beat the overall median speed and their share of all animals
overall_summary = (
    f"There are {hoppers_above_median} hoppers above the median speed. "
    f"This is {percentage_hoppers_above_median:.2f}% of all animals."
)
print(overall_summary)

There are 7 hoppers above the median speed. This is 6.54% of all animals.


In [136]:
# Report how many hoppers beat the hopper-only median speed and their share of all hoppers
hopper_summary = (
    f"There are {hoppers_above_median_hop_speed} hoppers above the median hop speed ({hoppers_median_speed:.2f}). "
    f"This is {percentage_hoppers_above_median_hop_speed:.2f}% of all hopper animals."
)
print(hopper_summary)

There are 4 hoppers above the median hop speed (56.00). This is 36.36% of all hopper animals.
