In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

### 1. A bank found that the average number of cars waiting during the noon hour at a drive-up window follows a Poisson distribution with a mean of 2 cars. Make a chart of this distribution and answer these questions concerning the probability of cars waiting at the drive-up window.

In [54]:
rate = 2 # cars waiting at noon 

#### * What is the probability that no cars drive up in the noon hour?

In [3]:
stats.poisson(rate).pmf(0)

0.1353352832366127

#### * What is the probability that 3 or more cars come through the drive through?

In [4]:
stats.poisson(rate).sf(2)

0.32332358381693654

#### * How likely is it that the drive through gets at least 1 car?

In [41]:
1 - stats.poisson(rate).pmf(0)

0.8646647167633873

### 2. Grades of State University graduates are normally distributed with a mean of 3.0 and a standard deviation of .3. Calculate the following:

In [6]:
mean = 3
std = .3
grades = stats.norm(mean, std)

#### * What grade point average is required to be in the top 5% of the graduating class?

In [7]:
grades.isf(.05)

3.4934560880854417

#### * What GPA constitutes the bottom 15% of the class?

In [8]:
grades.ppf(.15)

2.689069983151863

#### * An eccentric alumnus left scholarship money for students in the third decile from the bottom of their class. Determine the range of the third decile. Would a student with a 2.8 grade point average qualify for this scholarship?

In [9]:
lower_bound_third_decile = grades.ppf(.2)
upper_bound_third_decile = grades.ppf(.3)

print(f"{lower_bound_third_decile} - {upper_bound_third_decile}")
# A student with a 2.8 gpa would qualify for the scholarship

2.7475136299281258 - 2.842679846187588


In [42]:
# Alternative solution
grades.ppf([.2, .3])

array([2.74751363, 2.84267985])

#### * If I have a GPA of 3.5, what percentile am I in?

In [10]:
1 - grades.sf(3.5)
# ~95th percentile

0.9522096477271853

In [43]:
# Alternative solution
grades.cdf(3.5)

0.9522096477271853

### 3. A marketing website has an average click-through rate of 2%. One day they observe 4326 visitors and 97 click-throughs. How likely is it that this many people or more click through?

In [56]:
probability = click_through_rate = .02
n_trials = visitors = 4326

stats.binom(n_trials, probability).sf(96)

0.1397582363130086

In [57]:
# Alternative solution
rate = n_trials * probability

stats.poisson(rate).sf(96)

0.14211867659283192

### 4. You are working on some statistics homework consisting of 100 questions where all of the answers are a probability rounded to the hundreths place. Looking to save time, you put down random probabilities as the answer to each question.

In [58]:
probability = chance_of_guessing = 1/100
n_trials = questions_answered = 60

#### * What is the probability that at least one of your first 60 answers is correct?

In [59]:
stats.binom(n_trials, probability).sf(0)

0.4528433576092388

### 5. The codeup staff tends to get upset when the student break area is not cleaned up. Suppose that there's a 3% chance that any one student cleans the break area when they visit it, and, on any given day, about 90% of the 3 active cohorts of 22 students visit the break area. How likely is it that the break area gets cleaned up each day?

In [67]:
probability = chance_of_cleaning = .03
n_trials = visitors = .9 * (22 * 3)

daily_chance_of_cleaning = stats.binom(n_trials, probability).sf(0)
daily_chance_of_cleaning

0.8342199288437355

#### * How likely is it that it goes two days without getting cleaned up?

In [65]:
outcomes = [0, 1] # 1 is cleaned
n_trials = days_straight = 2
n_simulations = 100_000

sims = np.random.choice(outcomes, size = (n_simulations, n_trials), p = [1 - daily_chance_of_cleaning, daily_chance_of_cleaning])
no_cleaning_2 = sims.sum(axis = 1) == 0
no_cleaning_2.mean()

0.02769

In [68]:
# Alternative solution
# Increase trials by number of days
stats.binom(n_trials * 2, probability).pmf(0)

0.026821436301629097

#### * All week?

In [71]:
sims = np.random.choice(outcomes, size = (n_simulations, 5), p = [1 - daily_chance_of_cleaning, daily_chance_of_cleaning])

no_cleaning_5 = sims.sum(axis = 1) == 0
no_cleaning_5.mean()

0.00015

In [70]:
# Alternative solution
stats.binom(n_trials * 5, probability).pmf(0)

0.00011781621791055166

### 6. You want to get lunch at La Panaderia, but notice that the line is usually very long at lunchtime. After several weeks of careful observation, you notice that the average number of people in line when your lunch break starts is normally distributed with a mean of 15 and standard deviation of 3. If it takes 2 minutes for each person to order, and 10 minutes from ordering to getting your food, what is the likelihood that you have at least 15 minutes left to eat your food before you have to go back to class? Assume you have one hour for lunch, and ignore travel time to and from La Panaderia.

In [73]:
# Find the probability of the maximum number of people being in line that leaves 15 mins to eat where:  max_people = (45 - 10) mins / (2 mins / person) => 17.5 people = 17 round down =>  17 people - 1 (yourself) = 16
average_number_in_line = 15 # people
std = 3

prob_16_or_less = stats.norm(average_number_in_line, std).cdf(16)
prob_16_or_less

0.6305586598182363

In [75]:
# Alternative solution
# Convert to time
mean = 30 # mins per person
std = 6 # mins

stats.norm(mean, std).cdf(33)  # 60 - 15 - 10 - 2(your order time) = 33

0.6914624612740131

### 7. Connect to the employees database and find the average salary of current employees, along with the standard deviation. For the following questions, calculate the answer based on modeling the employees salaries with a normal distribution defined by the calculated mean and standard deviation then compare this answer to the actual values present in the salaries dataset.

In [18]:
from env import host, user, password, get_db_url

employees_select = """SELECT *
                        FROM employees
                            JOIN salaries USING(emp_no)
                        WHERE to_date > now()"""

employees_df = pd.read_sql(employees_select, get_db_url(user, password, host, "employees"))
mean = employees_df.salary.agg("mean")
std = employees_df.salary.agg("std")
print(f"mean:  {mean}, std:  {std}")

salary_distribution = stats.norm(mean, std)

mean:  72012.23585730705, std:  17309.99538025198


#### * What percent of employees earn less than 60,000?

In [19]:
calc_percent_under_60 = salary_distribution.cdf(59_999)

In [20]:
employees_df['under_60'] = employees_df.salary < 60000

In [21]:
actual_percent_under_60 = employees_df.under_60.sum() / len(employees_df)

print(f"Calculated:  {calc_percent_under_60}, Actual:  {actual_percent_under_60}")

Calculated:  0.2438393099761686, Actual:  0.2705727041028802


#### * What percent of employees earn more than 95,000?

In [22]:
calc_percent_over_95 = salary_distribution.sf(95_000)

In [23]:
employees_df['over_95'] = employees_df.salary > 95000

In [24]:
actual_percent_over_95 = employees_df.over_95.sum() / len(employees_df)

print(f"Calculated:  {calc_percent_over_95}, Actual:  {actual_percent_over_95}")

Calculated:  0.09208819199804053, Actual:  0.1086272092752078


#### * What percent of employees earn between 65,000 and 80,000?

In [25]:
calc_between_65_80 = salary_distribution.cdf(79_999) - salary_distribution.cdf(65_001)

In [77]:
# Alternative solution
np.diff(salary_distribution.cdf([65_001, 79_999]))

array([0.33501862])

In [26]:
employees_df['between_65_80'] = (employees_df.salary > 65000) & (employees_df.salary < 80000)

In [27]:
actual_between_65_80 = employees_df.between_65_80.sum() / len(employees_df)

print(f"Calculated:  {calc_between_65_80}, Actual:  {actual_between_65_80}")

Calculated:  0.33501861739056393, Actual:  0.325107028035515


#### * What do the top 5% of employees make?

In [28]:
calc_top_5_percent = salary_distribution.isf(.05)

In [29]:
print(round(len(employees_df) * .05, 0))

12006.0


In [30]:
actual_top_5_percent = employees_df.salary.sort_values(ascending = False).head(12006).sort_values(ascending = True).head(1)

print(f"Calculated:  {calc_top_5_percent}, Actual:  {actual_top_5_percent.values}")

Calculated:  100484.64454102777, Actual:  [104228]


In [78]:
# Alternative soultion
employees_df.salary.quantile(.95)

104225.84999999998