### IMPORT STATEMENTS

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
#import viz # curriculum viz example code
from env import host, user, password
import pandas as pd

np.random.seed(123)

### NORMAL DISTRIBUTION

-it models a continuous random variable where the further away from the mean you are, the less likely the outcome.

-commonly referred to as the bell curve

-defined by a mean and standard deviation

-the Standard Normal Distribution has a mean of 0 and stdev of 1

In [None]:
mean = 
stdev = 

# theoretical
my_dist = stats.norm(mean, stdev)
my_dist.func()      # func = .pdf() , .cdf() , .ppf() , .sf() , .isf()   (no .pmf(): the normal distribution is continuous)

# example: GPA cutoff for the top 5 % from the theoretical distribution
mean, stdev = 3, .3
grades_dist = stats.norm(loc=mean, scale=stdev)         # frozen normal distribution object
grades_dist.isf(.05)   # inverse survival function: the gpa value that only 5 % of outcomes exceed

# experimental python
np.random.normal(mean, stdev, size)                     # numpy array 

# example: simulate GPAs and estimate the cutoff for the top 5 %
mean = 3
stdev = .3
size = 100_000                                   # number of simulated values
gpa_dist = np.random.normal(mean, stdev, size)   # numpy array of simulated gpas
top_5_gpa = np.quantile(gpa_dist, .95)   # 95th percentile = gpa cutoff for top 5%
top_5_gpa

### BINOMIAL DISTRIBUTION

-lets us model the number of successes after a number of trials given a certain probability of success.

-each trial is independent of the others

-example: number of heads you would expect to see after a certain number of trials


In [None]:
n_trials =
p_success =

# theoretical
my_dist = stats.binom(n_trials, p_success)               # distribution object
my_dist.func()

# example
n_trials = 60                          # 60 questions
p_success = 1/100                      # prob of success

# Build the distribution from the variables declared above so the parameters
# stated in the example are the ones actually used.
corr_answers = stats.binom(n_trials, p_success)   # distribution object
corr_answers.sf(0)                     # P(X > 0): prob of getting at least 1 answer correct

# experimental python
np.random.binomial(n_trials, p_success, size)           # numpy array

# example
n_trials = 30
p_success = .25
size = 100_000
# NOTE(review): with only 30 trials a count of 97 can never be reached, so this
# estimate is always 0 -- confirm the intended n_trials / p_success values.
clicks = np.random.binomial(n_trials, p_success, size)   # simulated number of successes per experiment
p_97_or_more = (clicks >= 97).mean()   # prob of 97 or more clicks
p_97_or_more

### POISSON DISTRIBUTION

-lets us model a situation where a certain number of events happens over a specified time interval.

-the number of events that happen is a discrete measure, and the distribution tells us the likelihood of a certain number of events occurring over the time period.

-assumes the events are independent of each other and of the time since the last event

-ex. number of emails sent by a mail server in a day, number of phone calls received by a call center per hour, etc.

In [None]:
mean = 

# theoretical
my_dist = stats.poisson(mean)                            # distribution object
my_dist.func()

# example
mean = 2
cars_dist = stats.poisson(mean)  # mean cars per hour 2
cars_dist.pmf(0)                 # prob of exactly 0 cars (no cars) in an hour
                                 # (prob of 1 or more cars would be cars_dist.sf(0))

# experimental python
np.random.poisson(mean, n_trials)                        # numpy array

# example: simulate hourly car counts and estimate P(no cars)
mean_cars = 2
columns = mean_cars
n_trials = 10_000
nrows = n_trials

cars = np.random.poisson(lam=mean_cars, size=n_trials)   # one simulated count per trial
cars

np.mean(cars == 0)          # prob of no cars coming in an hour

### UNIFORM DISTRIBUTION

-Can be used to model events where the outcome is discrete.

-Each outcome has an equally likely chance of happening.

In [None]:
# Discrete uniform distribution for a die; scipy's randint excludes the upper bound.
die_distribution = stats.randint(low=1, high=7)

die_distribution.rvs()                # Return a single random value from above dist
die_distribution.rvs(size=5)          # Return 5 random values
die_distribution.rvs(size=(5, 5))     # Return a matrix of random values

### DISTRIBUTION METHODS

In [None]:
# Generate random values
.rvs()

# Probability Mass Function / Probability Density Function
=
.pmf() - accepts a single value and returns probability of observing exactly that value (discrete distributions)
.pdf() - accepts a single value and returns the probability density at that value (continuous distributions)

# Cumulative Distribution Function / Percent Point Function (Quantile Function)
<=
.cdf() - accepts a single value and returns probability of observing <= value
.ppf() - accepts a probability and returns a single value (inverse of .cdf())

# Survival Function / Inverse Survival Function
## 1 - cdf(5) = sf(5)
>
.sf() - accepts a single value and returns probability of observing > value
.isf() - accepts a probability and returns single value

### VISUALIZE

In [None]:
n = 10_000

# Draw n random rolls from the die distribution.
x = die_distribution.rvs(size=n)

# Histogram settings gathered in one place, then applied in a single call.
hist_options = dict(bins=range(0, 9), align='left', width=1, edgecolor='black')
plt.hist(x, **hist_options)
plt.title(f'Outcome of {n:,} Dice Rolls')

### One Sample T-Test

-lets us compare the mean for a specific subgroup against the population mean.

-lets us compare a categorical and continuous variable by using subgroup and pop means.

-Null Hypothesis: there is no difference in the means of our subgroup and the population.

-We assume that the continuous variable is normally distributed.

-Ex. Are the salaries of the marketing department higher than the company average? Are sales for product A higher when we run a promo for it? etc.

In [None]:
# pass sequence of values and a float
sample_observation_means = (array)
popmeans = (float)      # array can be passed but it performs a ttest on every popmean
stats.ttest_1samp(sample_observation_means, popmeans)

# example
left_handed = df.handedness == 'left'         # boolean mask selecting left-handed rows
lefties = df[left_handed]                     # Pandas DF of left-handed students only

μ = df.exam_score.mean()                      # mean of total population exam scores
xbar = lefties.exam_score.mean()              # mean of subgroup (lefties) exam scores
t, p = stats.ttest_1samp(lefties.exam_score, μ)   # one-sample t-test of the subgroup against μ

########################################## OR ###########################################
# example without arrays
office1_mean = 90
stdev1 = 15
n1 = 40
office2_mean = 100
stdev2 = 20
n2 = 50

# One-sample t statistic: (sample mean - reference mean) / (sample stdev / sqrt(n)),
# with office 2 as the sample and office 1's mean as the reference value.
# NOTE(review): stdev1 and n1 are not used by this one-sample formula; a
# two-sample comparison from summary statistics would use
# stats.ttest_ind_from_stats -- confirm which test is intended.
t = (office2_mean - office1_mean) / (stdev2 / np.sqrt(n2))
print(f"t-score = {t}")

# Two-tailed p-value from a t distribution with n2 - 1 degrees of freedom;
# abs() keeps the result valid when t is negative.
p = stats.t(n2-1).sf(abs(t)) * 2
print(f"p-value = {p}")

### Two Sample T-Test

-lets us compare between two different subgroups

-Ex. Are the exam scores of those who studied with flashcards the same as those who didn't study with flashcards?

In [None]:
# pass sequence of values and sequence of values
x1 = subgroup1_means = (array)
x2 = subgroup2_means = (array)
stats.ttest_ind(x1, x2)
