In [1]:
import pandas as pd
import numpy as np

from pydataset import data

import warnings
warnings.filterwarnings('ignore')

## 1. Create student grades DataFrame object

In [73]:
np.random.seed(123)

students = ['Sally', 'Jane', 'Suzie', 'Billy', 'Ada', 'John', 'Thomas',
            'Marie', 'Albert', 'Richard', 'Isaac', 'Alan']

# randomly generate scores for each student for each subject
# note that all the values need to have the same length here
math_grades = np.random.randint(low=60, high=100, size=len(students))
english_grades = np.random.randint(low=60, high=100, size=len(students))
reading_grades = np.random.randint(low=60, high=100, size=len(students))

df = pd.DataFrame({'name': students,
                   'math': math_grades,
                   'english': english_grades,
                   'reading': reading_grades})

### Peek at DataFrame

In [74]:
df.head()

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98


In [75]:
df.shape

(12, 4)

In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
name       12 non-null object
math       12 non-null int64
english    12 non-null int64
reading    12 non-null int64
dtypes: int64(3), object(1)
memory usage: 512.0+ bytes


In [77]:
df.describe()

Unnamed: 0,math,english,reading
count,12.0,12.0,12.0
mean,84.833333,77.666667,86.5
std,11.134168,13.371158,9.643651
min,62.0,62.0,67.0
25%,78.5,63.75,80.75
50%,90.0,77.5,89.0
75%,92.25,86.75,93.25
max,98.0,99.0,98.0


### a. Create a column named `passing_english` that indicates whether each student has a passing grade in reading.

In [78]:
# create boolean values

df.english >= 70

0      True
1      True
2      True
3      True
4      True
5      True
6     False
7     False
8     False
9      True
10     True
11    False
Name: english, dtype: bool

In [79]:
# assign the series to a new column we create in our DataFrame

df['passing_english'] = df.english >= 70

In [80]:
df.head()

Unnamed: 0,name,math,english,reading,passing_english
0,Sally,62,85,80,True
1,Jane,88,79,67,True
2,Suzie,94,74,95,True
3,Billy,98,96,88,True
4,Ada,77,92,98,True


In [81]:
# How many people are passing English?

sum(df['passing_english'] == True)

8

In [82]:
# How many students are failing English?

sum(df['passing_english'] == False)

4

### b. Sort the english grades by the `passing_english` column. How are duplicates handled?

- `.sort_values` returns a sorted copy of a given DataFrame unless inplace=True

- It looks like duplicate values are handled according to the index value, small to large or ascending.

In [83]:
# DataFrame.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
# sorts all of the rows in the DF using the column passed

df.sort_values(by='passing_english')

Unnamed: 0,name,math,english,reading,passing_english
6,Thomas,82,64,81,False
7,Marie,93,63,90,False
8,Albert,92,62,87,False
11,Alan,92,62,72,False
0,Sally,62,85,80,True
1,Jane,88,79,67,True
2,Suzie,94,74,95,True
3,Billy,98,96,88,True
4,Ada,77,92,98,True
5,John,79,76,93,True


### c. Sort the english grades first by `passing_english` and then by student `name`. 

All the students that are failing english should be first, and within the students that are failing english they should be ordered alphabetically. The same should be true for the students passing english. 

(Hint: you can pass a list to the .sort_values method)

In [84]:
# Now we see that Alan comes before Albert because there is a secondary sort going on alphabetically by name

df.sort_values(by=['passing_english', 'name'])

Unnamed: 0,name,math,english,reading,passing_english
11,Alan,92,62,72,False
8,Albert,92,62,87,False
7,Marie,93,63,90,False
6,Thomas,82,64,81,False
4,Ada,77,92,98,True
3,Billy,98,96,88,True
10,Isaac,92,99,93,True
1,Jane,88,79,67,True
5,John,79,76,93,True
9,Richard,69,80,94,True


### d. Sort the english grades first by `passing_english`, and then by the actual `english` grade, similar to how we did in the last step.

In [85]:
df.sort_values(by=['passing_english', 'english'])

Unnamed: 0,name,math,english,reading,passing_english
8,Albert,92,62,87,False
11,Alan,92,62,72,False
7,Marie,93,63,90,False
6,Thomas,82,64,81,False
2,Suzie,94,74,95,True
5,John,79,76,93,True
1,Jane,88,79,67,True
9,Richard,69,80,94,True
0,Sally,62,85,80,True
4,Ada,77,92,98,True


### e. Calculate each students overall grade and add it as a column on the dataframe. The overall grade is the average of the math, english, and reading grades.

In [86]:
# axis = 1 sums the columns in each row and returns a Pandas Series

df.sum(axis=1)

0     228.0
1     235.0
2     264.0
3     283.0
4     268.0
5     249.0
6     227.0
7     246.0
8     241.0
9     244.0
10    285.0
11    226.0
dtype: float64

In [87]:
# assign Series back to DataFrame as overall_grade

df['overall_average'] = round(df.sum(axis=1) / 3, 2)

In [88]:
df.head()

Unnamed: 0,name,math,english,reading,passing_english,overall_average
0,Sally,62,85,80,True,76.0
1,Jane,88,79,67,True,78.33
2,Suzie,94,74,95,True,88.0
3,Billy,98,96,88,True,94.33
4,Ada,77,92,98,True,89.33


## 2. Load the mpg dataset. Read the documentation for the dataset and use it for the following questions:

- What is fl:

In [92]:
data('mpg', show_doc=True)

mpg

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Fuel economy data from 1999 and 2008 for 38 popular models of car

### Description

This dataset contains a subset of the fuel economy data that the EPA makes
available on http://fueleconomy.gov. It contains only models which had a new
release every year between 1999 and 2008 - this was used as a proxy for the
popularity of the car.

### Usage

    data(mpg)

### Format

A data frame with 234 rows and 11 variables

### Details

  * manufacturer. 

  * model. 

  * displ. engine displacement, in litres 

  * year. 

  * cyl. number of cylinders 

  * trans. type of transmission 

  * drv. f = front-wheel drive, r = rear wheel drive, 4 = 4wd 

  * cty. city miles per gallon 

  * hwy. highway miles per gallon 

  * fl. 

  * class. 




In [112]:
mpg = data('mpg')

In [113]:
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


### How many rows and columns are there?

In [114]:
mpg.shape

(234, 11)

### What are the data types of each column?

In [115]:
mpg.dtypes

manufacturer     object
model            object
displ           float64
year              int64
cyl               int64
trans            object
drv              object
cty               int64
hwy               int64
fl               object
class            object
dtype: object

### Summarize the dataframe with .info and .describe


In [116]:
mpg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234 entries, 1 to 234
Data columns (total 11 columns):
manufacturer    234 non-null object
model           234 non-null object
displ           234 non-null float64
year            234 non-null int64
cyl             234 non-null int64
trans           234 non-null object
drv             234 non-null object
cty             234 non-null int64
hwy             234 non-null int64
fl              234 non-null object
class           234 non-null object
dtypes: float64(1), int64(4), object(6)
memory usage: 21.9+ KB


In [117]:
mpg.describe()

Unnamed: 0,displ,year,cyl,cty,hwy
count,234.0,234.0,234.0,234.0,234.0
mean,3.471795,2003.5,5.888889,16.858974,23.440171
std,1.291959,4.509646,1.611534,4.255946,5.954643
min,1.6,1999.0,4.0,9.0,12.0
25%,2.4,1999.0,4.0,14.0,18.0
50%,3.3,2003.5,6.0,17.0,24.0
75%,4.6,2008.0,8.0,19.0,27.0
max,7.0,2008.0,8.0,35.0,44.0


### Rename the cty column to city and hwy to highway using .rename()

In [120]:
mpg.rename(columns={'cty': 'city', 'hwy': 'highway'}, inplace=True)

In [121]:
mpg.head(1)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact


### Another way to rename columns...

In [137]:
mpg = data('mpg')

In [138]:
mpg.columns

Index(['manufacturer', 'model', 'displ', 'year', 'cyl', 'trans', 'drv', 'cty',
       'hwy', 'fl', 'class'],
      dtype='object')

In [139]:
mpg.columns = ['manufacturer', 'model', 'displ', 'year', 'cyl', 'trans', 'drv', 'city',
       'highway', 'fl', 'class']

In [140]:
mpg.head(1)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact


### Do any cars have better city mileage than highway mileage?

In [141]:
mpg[mpg.city > mpg.highway]

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class


### Create a column named mileage_difference this column should contain the difference between highway and city mileage for each car.

In [142]:
# Create the Series with the difference in mileage

mpg.highway - mpg.city

1      11
2       8
3      11
4       9
5      10
       ..
230     9
231     8
232    10
233     8
234     9
Length: 234, dtype: int64

In [143]:
# Assign the Series to the mpg DataFrame as mileage_difference

mpg['mileage_difference'] = mpg.highway - mpg.city

In [144]:
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,11
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,8
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,11
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,9
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,10


### Which car (or cars) has the highest mileage difference?

In [145]:
max(mpg.mileage_difference)

12

In [149]:
# Create the boolean values

mpg.mileage_difference == max(mpg.mileage_difference)

1      False
2      False
3      False
4      False
5      False
       ...  
230    False
231    False
232    False
233    False
234    False
Name: mileage_difference, Length: 234, dtype: bool

In [148]:
# wrap the boolean values in the mpg DataFrame to find the True observations

mpg[mpg.mileage_difference == max(mpg.mileage_difference)]

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
107,honda,civic,1.8,2008,4,auto(l5),f,24,36,c,subcompact,12
223,volkswagen,new beetle,1.9,1999,4,auto(l4),f,29,41,d,subcompact,12


### Which compact class car has the lowest highway mileage? The best?

In [159]:
# Why doesn't this work???

mpg.class

SyntaxError: invalid syntax (<ipython-input-159-d98911138d03>, line 1)

In [156]:
mpg.columns

Index(['manufacturer', 'model', 'displ', 'year', 'cyl', 'trans', 'drv', 'city',
       'highway', 'fl', 'class', 'mileage_difference'],
      dtype='object')

In [154]:
mpg['class'].value_counts()

suv           62
compact       47
midsize       41
subcompact    35
pickup        33
minivan       11
2seater        5
Name: class, dtype: int64

In [173]:
# Create a Series of boolean values for compact class of cars

mpg['class'] == 'compact'

1       True
2       True
3       True
4       True
5       True
       ...  
230    False
231    False
232    False
233    False
234    False
Name: class, Length: 234, dtype: bool

In [178]:
# Wrap the boolean Series in the original DataFrame

mpg[mpg['class'] == 'compact']

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,11
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,8
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,11
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,9
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,10
6,audi,a4,2.8,1999,6,manual(m5),f,18,26,p,compact,8
7,audi,a4,3.1,2008,6,auto(av),f,18,27,p,compact,9
8,audi,a4 quattro,1.8,1999,4,manual(m5),4,18,26,p,compact,8
9,audi,a4 quattro,1.8,1999,4,auto(l5),4,16,25,p,compact,9
10,audi,a4 quattro,2.0,2008,4,manual(m6),4,20,28,p,compact,8


In [179]:
# sort the DataFrame by highway mileage

mpg[mpg['class'] == 'compact'].sort_values(by='highway')

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
220,volkswagen,jetta,2.8,1999,6,auto(l4),f,16,23,r,compact,7
221,volkswagen,jetta,2.8,1999,6,manual(m5),f,17,24,r,compact,7
212,volkswagen,gti,2.8,1999,6,manual(m5),f,17,24,r,compact,7
172,subaru,impreza awd,2.5,2008,4,manual(m5),4,19,25,p,compact,6
170,subaru,impreza awd,2.5,2008,4,auto(s4),4,20,25,p,compact,5
15,audi,a4 quattro,3.1,2008,6,manual(m6),4,15,25,p,compact,10
14,audi,a4 quattro,3.1,2008,6,auto(s6),4,17,25,p,compact,8
13,audi,a4 quattro,2.8,1999,6,manual(m5),4,17,25,p,compact,8
9,audi,a4 quattro,1.8,1999,4,auto(l5),4,16,25,p,compact,9
12,audi,a4 quattro,2.8,1999,6,auto(l5),4,15,25,p,compact,10
