In [203]:
%matplotlib inline
import numpy as np
import pandas as pd
from math import factorial

import viz # curriculum example visualizations

np.random.seed(100)

# Questions to ask during lecture review:

# Exercises

## 1. How likely is it that you roll doubles when rolling two dice?

In [204]:
#Establishing first the number of trials (the higher the number of trials, the more accurate our probability estimate)
n_trials = nrows = 10_000

n_dice = ncols = 2

rolls = np.random.choice([1, 2, 3, 4, 5, 6], n_trials * n_dice).reshape(nrows, ncols)
rolls

rolls = pd.DataFrame(rolls)

#I need to remember to include the 'axis = 1' argument for .apply() to ensure that the function hits the *rows* rather than the coluns (the default)
rolls.apply((lambda row: row[0] == row[1]), axis = 1).mean()

#Another angle to tackle this would be to go through the columns (i.e. *not* using axis = 1)
(rolls[0] == rolls[1]).mean()
#Again, note that the 'axis = 1' argument is not here, because I'm operating on the column

0.1652

## 2. If you flip 8 coins, what is the probability of getting exactly 3 heads? What is the probability of getting more than 3 heads?

In [208]:


n_flips = 10_000
n_coins = 8

coin_flip = np.random.choice([0, 1], n_flips * n_coins).reshape(n_flips, n_coins)

coin_flip = pd.DataFrame(coin_flip)
coin_flip

Unnamed: 0,0,1,2,3,4,5,6,7
0,1,1,0,0,1,0,0,0
1,0,0,0,0,0,1,1,0
2,1,0,0,1,1,0,0,1
3,1,1,0,0,1,1,0,1
4,1,0,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...
9995,1,0,1,1,0,1,0,1
9996,0,1,1,1,0,1,0,0
9997,0,0,1,0,0,1,1,0
9998,0,1,0,0,0,0,1,0


In [212]:
#The probability of getting exactly 3 heads
coin_flip.apply((lambda row: row.sum() == 3), axis=1).mean()

#Or, Ryan showed me that I can just apply the .sum() to the DF itself (axis = 1)
(coin_flip.sum(axis = 1) == 3).mean()

0.2233

In [215]:
#Probability of getting more than 3 heads
coin_flip.apply((lambda row: row.sum() > 3), axis=1).mean()

0.6259

## 3. There are approximately 3 web development cohorts for every 1 data science cohort at Codeup. Assuming that Codeup randomly selects an alum to put on a billboard, what are the odds that the two billboards I drive past both have data science students on them?

In [216]:
n_trips = 10_000
n_boards = 2

board_alum = np.random.choice([0, 0, 0, 1], n_trips * n_boards).reshape(n_trips, n_boards)

board_alum = pd.DataFrame(board_alum)

# The probability that both are DS students
board_alum.apply((lambda row: row.sum() == 2), axis=1).mean()

#Shorthand of the above:
(board_alum.sum(axis = 1) == 2).mean()

0.0635

In [219]:
#I can also rewrite the initial formula using the probability argument for np.random.choice:
board_alum = np.random.choice([0, 1], n_trips * n_boards, p = [.75, .25]).reshape(n_trips, n_boards)
(board_alum.sum(axis = 1) == 2).mean()

0.0679

# 4. Codeup students buy, on average, 3 poptart packages with a standard deviation of 1.5 a day from the snack vending machine. If on monday the machine is restocked with 17 poptart packages, how likely is it that I will be able to buy some poptarts on Friday afternoon? (Remember, if you have mean and standard deviation, use the np.random.normal)

In [225]:
n_checks = 10000
n_days = 5

pop_tarts = (np.random.normal(3, 1.5, n_checks * n_days)).reshape(n_checks, n_days).round()

pop_tarts = pd.DataFrame(pop_tarts)

(pop_tarts.sum(axis = 1) < 17).mean()

0.6771

# 5. Compare Heights
- Men have an average height of 178 cm and standard deviation of 8cm.
- Women have a mean of 170, sd = 6cm.
- Since you have means and standard deviations, you can use np.random.normal to generate observations.
- If a man and woman are chosen at random, what is the likelihood the woman is taller than the man?

In [226]:
n_checks = 10000
n_people = 1

man = (np.random.normal(178, 8, n_checks * n_people)).reshape(n_checks, n_people)
man = pd.DataFrame(man)

woman = (np.random.normal(170, 6, n_checks * n_people)).reshape(n_checks, n_people)
woman = pd.DataFrame(woman)

sample = pd.concat([man, woman], axis = 1, keys = ['Man', 'Woman'])
#sample

(sample['Man'] < sample['Woman']).mean()

0    0.2099
dtype: float64

# 6. When installing anaconda on a student's computer, there's a 1 in 250 chance that the download is corrupted and the installation fails. 

## a. What are the odds that after having 50 students download anaconda, no one has an installation issue?

In [12]:
n_installs = 10000
n_students = 50


#We can use 'p = [x, y]' to set the probabilities for the respective choices. 
install = (np.random.choice([0, 1], n_installs * n_students, p = [.996, .004])).reshape(n_installs, n_students)

#This will calculate the likelihood that we *will* have a failure:
(install.sum(axis = 1) > 0).mean()

#So we need to subtract it from 1 in order to get the probability that we *won't* have a failure:
(1- (install.sum(axis = 1) > 0).mean())

0.8243

## b. 100 students?

In [15]:
#Doing the same as above, only for 100 students
n_installs = 10000
n_students = 100

install = (np.random.choice([0, 1], n_installs * n_students, p = [.996, .004])).reshape(n_installs, n_students)
(1 - (install.sum(axis = 1) > 0).mean())

0.6669

## c. 150 students?

In [227]:
#Doing the same as above, only for 150 students
n_installs = 10000
n_students = 150

install = (np.random.choice([0, 1], n_installs * n_students, p = [.996, .004])).reshape(n_installs, n_students)
(1 - (install.sum(axis = 1) > 0).mean())

0.5507

## d. What is the probability that we observe an installation issue within the first 150 students that download anaconda?

In [22]:
#For this one, because I'm looking for the probability that there *is* a failure, I'm not going to subtract from 1:
n_installs = 10000
n_students = 150

install = (np.random.choice([0, 1], n_installs * n_students, p = [.996, .004])).reshape(n_installs, n_students)
(install.sum(axis = 1) > 0).mean()

0.4521

## e. How likely is it that 450 students all download anaconda without an issue?

In [228]:
#Again, because I'm looking for the probability there *won't* be a failure, I'll need to subtract from 1:
n_installs = 10000
n_students = 450

install = (np.random.choice([0, 1], n_installs * n_students, p = [.996, .004])).reshape(n_installs, n_students)
install.shape
(1 - (install.sum(axis = 1) > 0).mean())

0.1623

# 7. There's a 70% chance on any given day that there will be at least one food truck at Travis Park.

## a. However, you haven't seen a food truck there in 3 days. How unlikely is this?

In [195]:
n_checks = 10000
n_days = 3

food_truck = np.random.choice([0, 1], n_days * n_checks, p = [.3, .7]).reshape(n_checks, n_days)
food_truck = pd.DataFrame(food_truck)

#Likelihood that the food truck will not show up for 3 days:
(food_truck.sum(axis = 1) == 0).mean()

#When it says 'how *unlikely* is this,' does that mean we need to subtract from 1 and say it's roughly 75% unlikely?

0.0257

## b. How likely is it that a food truck will show up sometime this week?

In [231]:
n_checks = 10000
n_days = 7

food_truck = np.random.choice([0, 1], n_days * n_checks, p = [.3, .7]).reshape(n_checks, n_days)
food_truck = pd.DataFrame(food_truck)

(food_truck.sum(axis = 1) > 0).mean()

#It is clearly VERY likely that the food truck will appear.

0.9994

# 8. If 23 people are in the same room, what are the odds that two of them share a birthday? 

In [234]:
n_checks = 10000
n_birthdays = 23

birthdays = np.random.choice((list(range(365))), n_checks * n_birthdays).reshape(n_checks, n_birthdays)

birthdays = pd.DataFrame(birthdays)
(birthdays.nunique(axis = 1) < 23).mean()

0.4993

# b. What if it's 20 people?

In [235]:
n_checks = 10000
n_birthdays = 20

birthdays = np.random.choice((list(range(365))), n_checks * n_birthdays).reshape(n_checks, n_birthdays)

birthdays = pd.DataFrame(birthdays)
(birthdays.nunique(axis = 1) < 20).mean()

0.4051

# c. What if it's 40 people?

In [236]:
n_checks = 10000
n_birthdays = 40

birthdays = np.random.choice((list(range(365))), n_checks * n_birthdays).reshape(n_checks, n_birthdays)

birthdays = pd.DataFrame(birthdays)
(birthdays.nunique(axis = 1) < 40).mean()

0.8872