# Dataframes Exercises

## All the datasets loaded from the pydataset library will be pandas dataframes.

In [1]:
# 1. Copy the code from the lesson to create a dataframe full of
# student grades.

import pandas as pd
import numpy as np

# Fix the seed so the generated grades are identical on every run.
np.random.seed(123)

students = [
    'Sally', 'Jane', 'Suzie', 'Billy', 'Ada', 'John',
    'Thomas', 'Marie', 'Albert', 'Richard', 'Isaac', 'Alan',
]

# One integer score in [60, 100) per student and subject. The three draws
# must stay in this order (math, english, reading) to reproduce the same
# values, and every array must be as long as the student list.
n_students = len(students)
math_grades = np.random.randint(60, 100, n_students)
english_grades = np.random.randint(60, 100, n_students)
reading_grades = np.random.randint(60, 100, n_students)

grades_df = pd.DataFrame(dict(
    name=students,
    math=math_grades,
    english=english_grades,
    reading=reading_grades,
))

type(grades_df)

pandas.core.frame.DataFrame

In [2]:
# Display the grades dataframe (a bare last expression is rendered by the notebook).
grades_df

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [3]:
# a. Create a column named passing_english that indicates whether
# each student has a passing grade in english.

# A grade of 70 or higher counts as passing; the comparison yields one
# boolean per student.
grades_df['passing_english'] = grades_df['english'].ge(70)
grades_df.head()

Unnamed: 0,name,math,english,reading,passing_english
0,Sally,62,85,80,True
1,Jane,88,79,67,True
2,Suzie,94,74,95,True
3,Billy,98,96,88,True
4,Ada,77,92,98,True


In [4]:
# b. Sort the english grades by the passing_english column. How are
# duplicates handled?

# passing_english has only two distinct values, so almost every row is a tie.
# pandas' default sort algorithm (quicksort) does not promise any particular
# order among tied rows. Requesting a stable algorithm guarantees that ties
# keep their original relative order, which here means ascending index.
grades_df.sort_values(by='passing_english', kind='mergesort')

Unnamed: 0,name,math,english,reading,passing_english
6,Thomas,82,64,81,False
7,Marie,93,63,90,False
8,Albert,92,62,87,False
11,Alan,92,62,72,False
0,Sally,62,85,80,True
1,Jane,88,79,67,True
2,Suzie,94,74,95,True
3,Billy,98,96,88,True
4,Ada,77,92,98,True
5,John,79,76,93,True


In [5]:
# c. Sort the english grades first by passing_english and then by
# student name. All the students that are failing english should be
# first, and within the students that are failing english they should
# be ordered alphabetically. The same should be true for the students
# passing english. (Hint: you can pass a list to the .sort_values
# method)

# Sort keys are applied left to right: False sorts before True, so failing
# students come first, and the name breaks ties within each group.
grades_df.sort_values(['passing_english', 'name'])

Unnamed: 0,name,math,english,reading,passing_english
11,Alan,92,62,72,False
8,Albert,92,62,87,False
7,Marie,93,63,90,False
6,Thomas,82,64,81,False
4,Ada,77,92,98,True
3,Billy,98,96,88,True
10,Isaac,92,99,93,True
1,Jane,88,79,67,True
5,John,79,76,93,True
9,Richard,69,80,94,True


In [6]:
# d. Sort the english grades first by passing_english, and then by
# the actual english grade, similar to how we did in the last step.

# One direction per key: failing students first (ascending booleans), then
# the highest english grade first within each group (descending grades).
grades_df.sort_values(
    ['passing_english', 'english'],
    ascending=[True, False],
)

Unnamed: 0,name,math,english,reading,passing_english
6,Thomas,82,64,81,False
7,Marie,93,63,90,False
8,Albert,92,62,87,False
11,Alan,92,62,72,False
10,Isaac,92,99,93,True
3,Billy,98,96,88,True
4,Ada,77,92,98,True
0,Sally,62,85,80,True
9,Richard,69,80,94,True
1,Jane,88,79,67,True


In [7]:
# e. Calculate each student's overall grade and add it as a column on
# the dataframe. The overall grade is the average of the math,
# english, and reading grades.

# Add the three subject columns row by row, then divide by the number of
# subjects to get the mean.
grades_df['overall_grade'] = (
    grades_df['math']
    .add(grades_df['english'])
    .add(grades_df['reading'])
    .div(3)
)
grades_df

Unnamed: 0,name,math,english,reading,passing_english,overall_grade
0,Sally,62,85,80,True,75.666667
1,Jane,88,79,67,True,78.0
2,Suzie,94,74,95,True,87.666667
3,Billy,98,96,88,True,94.0
4,Ada,77,92,98,True,89.0
5,John,79,76,93,True,82.666667
6,Thomas,82,64,81,False,75.666667
7,Marie,93,63,90,False,82.0
8,Albert,92,62,87,False,80.333333
9,Richard,69,80,94,True,81.0


In [8]:
# 2. Load the mpg dataset. Read the documentation for the dataset
# and use it for the following questions:

from pydataset import data
# data('<name>') loads the named dataset as a pandas dataframe.
mpg_df = data('mpg')
# Preview the first five rows.
mpg_df.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [9]:
# - How many rows and columns are there?

# .shape is a (rows, columns) tuple.
mpg_df.shape

(234, 11)

In [10]:
# - What are the data types of each column?

# .dtypes lists one dtype per column; text columns show up as object.
mpg_df.dtypes

manufacturer     object
model            object
displ           float64
year              int64
cyl               int64
trans            object
drv              object
cty               int64
hwy               int64
fl               object
class            object
dtype: object

In [11]:
# - Summarize the dataframe with .info and .describe

# .info() prints the index range, each column's non-null count and dtype,
# and the memory usage.
mpg_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234 entries, 1 to 234
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   manufacturer  234 non-null    object 
 1   model         234 non-null    object 
 2   displ         234 non-null    float64
 3   year          234 non-null    int64  
 4   cyl           234 non-null    int64  
 5   trans         234 non-null    object 
 6   drv           234 non-null    object 
 7   cty           234 non-null    int64  
 8   hwy           234 non-null    int64  
 9   fl            234 non-null    object 
 10  class         234 non-null    object 
dtypes: float64(1), int64(4), object(6)
memory usage: 21.9+ KB


In [12]:
# .describe() reports count, mean, std, min, quartiles and max; by default
# only the numeric columns are summarized.
mpg_df.describe()

Unnamed: 0,displ,year,cyl,cty,hwy
count,234.0,234.0,234.0,234.0,234.0
mean,3.471795,2003.5,5.888889,16.858974,23.440171
std,1.291959,4.509646,1.611534,4.255946,5.954643
min,1.6,1999.0,4.0,9.0,12.0
25%,2.4,1999.0,4.0,14.0,18.0
50%,3.3,2003.5,6.0,17.0,24.0
75%,4.6,2008.0,8.0,19.0,27.0
max,7.0,2008.0,8.0,35.0,44.0


In [13]:
# - Rename the cty column to city.

# rename returns a new dataframe, so rebind mpg_df to keep the change.
mpg_df = mpg_df.rename({'cty': 'city'}, axis='columns')
mpg_df.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [14]:
# - Rename the hwy column to highway.

# rename returns a new dataframe, so rebind mpg_df to keep the change.
mpg_df = mpg_df.rename({'hwy': 'highway'}, axis='columns')
mpg_df.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [15]:
# - Do any cars have better city mileage than highway mileage?

# Keep only the rows where city beats highway; an empty result means that
# no car does.
mpg_df.loc[mpg_df['city'].gt(mpg_df['highway'])]

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class


In [16]:
# - Create a column named mileage_difference. This column should contain
# the difference between highway and city mileage for each car.

# Highway minus city, computed row by row.
mpg_df['mileage_difference'] = mpg_df['highway'].sub(mpg_df['city'])
mpg_df.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,11
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,8
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,11
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,9
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,10


In [17]:
# - Which car (or cars) has the highest mileage difference?

# Compare every row against the column maximum so that all cars tied for the
# largest difference are returned, not just the first one.
mpg_df[mpg_df.mileage_difference == mpg_df.mileage_difference.max()]

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
107,honda,civic,1.8,2008,4,auto(l5),f,24,36,c,subcompact,12
223,volkswagen,new beetle,1.9,1999,4,auto(l4),f,29,41,d,subcompact,12


In [18]:
# - Which compact class car has the lowest highway mileage?

# `class` is a Python keyword, so the column needs bracket access.
# keep='all' returns every car tied for the lowest value.
(
    mpg_df
    .loc[mpg_df['class'].eq('compact')]
    .nsmallest(n=1, columns='highway', keep='all')
)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
220,volkswagen,jetta,2.8,1999,6,auto(l4),f,16,23,r,compact,7


In [19]:
# - The best?

# Same compact-only filter, this time taking the highest highway mileage.
# keep='all' returns every car tied for the highest value.
(
    mpg_df
    .loc[mpg_df['class'].eq('compact')]
    .nlargest(n=1, columns='highway', keep='all')
)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
213,volkswagen,jetta,1.9,1999,4,manual(m5),f,33,44,d,compact,11


In [20]:
# - Create a column named average_mileage that is the mean of the city
# and highway mileage.

# Mean of two values: add them and halve the sum.
mpg_df['average_mileage'] = mpg_df['city'].add(mpg_df['highway']).div(2)
mpg_df.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference,average_mileage
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,11,23.5
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,8,25.0
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,11,25.5
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,9,25.5
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,10,21.0


In [21]:
# - Which dodge car has the best average mileage?

# Restrict to dodge, then take the largest average_mileage; keep='all'
# returns every car tied for that value.
(
    mpg_df
    .loc[mpg_df['manufacturer'].eq('dodge')]
    .nlargest(n=1, columns='average_mileage', keep='all')
)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference,average_mileage
38,dodge,caravan 2wd,2.4,1999,4,auto(l3),f,18,24,r,minivan,6,21.0


In [22]:
# - The worst?

# Restrict to dodge, then take the smallest average_mileage; keep='all'
# returns every car tied for that value.
(
    mpg_df
    .loc[mpg_df['manufacturer'].eq('dodge')]
    .nsmallest(n=1, columns='average_mileage', keep='all')
)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference,average_mileage
55,dodge,dakota pickup 4wd,4.7,2008,8,auto(l5),4,9,12,e,pickup,3,10.5
60,dodge,durango 4wd,4.7,2008,8,auto(l5),4,9,12,e,suv,3,10.5
66,dodge,ram 1500 pickup 4wd,4.7,2008,8,auto(l5),4,9,12,e,pickup,3,10.5
70,dodge,ram 1500 pickup 4wd,4.7,2008,8,manual(m6),4,9,12,e,pickup,3,10.5


In [23]:
# 3. Load the Mammals dataset. Read the documentation for it, and use
# the data to answer these questions:

# NOTE(review): `data` is also imported in the cell that loads mpg; repeating
# the import is harmless and keeps this cell runnable on its own.
from pydataset import data
mammals_df = data('Mammals')
# Preview the first five rows.
mammals_df.head()

Unnamed: 0,weight,speed,hoppers,specials
1,6000.0,35.0,False,False
2,4000.0,26.0,False,False
3,3000.0,25.0,False,False
4,1400.0,45.0,False,False
5,400.0,70.0,False,False


In [24]:
# - How many rows and columns are there?

# .shape is a (rows, columns) tuple.
mammals_df.shape

(107, 4)

In [25]:
# - What are the data types?

# .dtypes lists one dtype per column.
mammals_df.dtypes

weight      float64
speed       float64
hoppers        bool
specials       bool
dtype: object

In [26]:
# - Summarize the dataframe with .info and .describe

# .info() prints the index range, each column's non-null count and dtype,
# and the memory usage.
mammals_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107 entries, 1 to 107
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   weight    107 non-null    float64
 1   speed     107 non-null    float64
 2   hoppers   107 non-null    bool   
 3   specials  107 non-null    bool   
dtypes: bool(2), float64(2)
memory usage: 2.7 KB


In [27]:
# .describe() summarizes the numeric columns only by default, so the boolean
# hoppers and specials columns are left out.
mammals_df.describe()

Unnamed: 0,weight,speed
count,107.0,107.0
mean,278.688178,46.208411
std,839.608269,26.716778
min,0.016,1.6
25%,1.7,22.5
50%,34.0,48.0
75%,142.5,65.0
max,6000.0,110.0


In [28]:
# - What is the weight of the fastest animal?

# Order by speed, fastest first, and keep only the top row; its weight
# column answers the question.
mammals_df.sort_values('speed', ascending=False).iloc[:1]

Unnamed: 0,weight,speed,hoppers,specials
53,55.0,110.0,False,False


In [29]:
# - What is the overall percentage of specials?

# specials is a boolean column, so its mean is the fraction of True values.
# This stays vectorized instead of looping over the Series with builtin sum().
round(mammals_df['specials'].mean() * 100, 2)

9.35

In [30]:
# - How many animals are hoppers that are above the median speed?

# Derive the median from the data instead of hard-coding the value read off
# .describe(), so the answer stays correct if the dataset changes.
median_speed = mammals_df.speed.median()

# hoppers is already boolean, so it combines directly with the speed test.
# len() of the filtered frame is the number of matching animals.
len(mammals_df[mammals_df.hoppers & (mammals_df.speed > median_speed)])

7

In [31]:
# - What percentage is this?

# Share of all mammals that are hoppers faster than median_speed, expressed
# as a percentage rounded to two decimals.
fast_hoppers = mammals_df.loc[
    mammals_df['hoppers'].eq(True) & mammals_df['speed'].gt(median_speed)
]
round(len(fast_hoppers) / len(mammals_df) * 100, 2)

6.54