## Questions

* How can we effectively put a prediction interval (PI) around the median Lifetime Rental Count?
* Should the PI/CI be continually recalculated using the new "population data", or could doing so conceal a trend over time?

[CI Resource](https://onlinecourses.science.psu.edu/stat414/node/261)
[PI Resource](http://www.propharmagroup.com/blog/understanding-statistical-intervals-part-2-prediction-intervals/)

In [31]:
import pandas as pd
import datetime
import math

In [45]:
# Load the MDR export and derive month/week labels from each record's start time.
mdrDat_init = pd.read_csv('sql_runner_MDRS_2018-04-25.csv')

# Parse the timestamps once and reuse them for both derived columns.
startTimes = pd.to_datetime(mdrDat_init['start_time'])

# 'MoYr' is "<month>-<year>" built from the calendar month and calendar year.
mdrDat_init['MoYr'] = startTimes.apply(lambda ts: str(ts.month) + '-' + str(ts.year))

# 'WkYr' is "<ISO year>-<ISO week>". The ISO week number must be paired with the
# ISO year, not the calendar year: e.g. 2017-01-01 belongs to ISO week 52 of 2016,
# so pairing it with the calendar year would label it '2017-52' and merge it with
# the last week of December 2017.
mdrDat_init['WkYr'] = startTimes.apply(
    lambda ts: str(ts.isocalendar()[0]) + '-' + str(ts.isocalendar()[1])
)
mdrDat_init.columns

Index(['rentalid', 'accountid', 'accounttype', 'subscriptionid',
       'subscriptiontypegroup', 'discpartner', 'rentalaccessmethod',
       'countrentals', 'bikedisplay', 'bikeid', 'startstation_id',
       'rental_status', 'start_time', 'end_time', 'calcend_time',
       'nextdockerror_ts', 'minbiid', 'minbibid', 'minbirid', 'minbikeintime',
       'MoYr', 'WkYr'],
      dtype='object')

In [49]:
# Count the rows carrying each 'WkYr' label, then take the median of those
# per-week counts.
rowsPerWeek = mdrDat_init['WkYr'].value_counts()
rowsPerWeek.median()

89.0

In [78]:
# Repeatedly draw a random subsample of records (without replacement, the
# pandas default) and record the median and standard deviation of
# 'countrentals' within each draw.
N_DRAWS = 1000     # number of subsamples to draw
SAMPLE_SIZE = 89   # rows per draw; matches the median weekly row count (89.0) computed above
SEED = 42          # base seed so a fresh Restart & Run All reproduces the same table

results = []

for loop in range(N_DRAWS):
    # A distinct but deterministic seed per draw: draws differ from each other,
    # yet the whole table is identical from run to run.
    sample = mdrDat_init.sample(n=SAMPLE_SIZE, axis=0, random_state=SEED + loop)
    medianRidesb4MDR = sample['countrentals'].median()
    stdRidesb4MDR = sample['countrentals'].std()
    results.append([loop, medianRidesb4MDR, stdRidesb4MDR])

# Build the frame once from the collected rows, naming the columns up front.
medianRideSampels = pd.DataFrame(results, columns=['Loop', 'Median', 'Stdev'])

# Show a preview only; the full frame has one row per draw.
medianRideSampels.head(10)

Unnamed: 0,Loop,Median,Stdev
0,0,3.0,266.569817
1,1,4.0,217.963518
2,2,7.0,335.378065
3,3,22.0,9273.226534
4,4,19.0,292.310965
5,5,12.0,244.635526
6,6,5.0,255.995164
7,7,7.0,265.011010
8,8,9.0,379.701761
9,9,23.0,528.258552


In [79]:
def calcPI(medianSampleSeries, critical_value=2.04227):
    """Return a symmetric prediction interval centred on the series' median.

    The interval is ``median ± critical_value * s * sqrt(1 + 1/n)`` where ``s``
    is ``medianSampleSeries.std()`` and ``n`` is ``len(medianSampleSeries)``.

    Parameters
    ----------
    medianSampleSeries : pandas.Series
        Numeric observations (here: the medians of the resampled draws).
    critical_value : float, default 2.04227
        Multiplier applied to the standard-error term. The default is the
        value this function has always used; it matches the two-sided 95%
        Student-t critical value for 30 degrees of freedom and is NOT
        recomputed from ``n``. Pass e.g. 1.96 for a normal-approximation
        interval.

    Returns
    -------
    tuple of (lower, upper)
        The lower and upper bounds of the interval.

    Raises
    ------
    ValueError
        If the series is empty (the ``1/n`` term would be undefined).
    """
    n = len(medianSampleSeries)
    if n == 0:
        raise ValueError("calcPI requires at least one observation")

    stdv = medianSampleSeries.std()
    POPmedian = medianSampleSeries.median()

    # Half-width of the interval; the sqrt(1 + 1/n) factor is what makes this a
    # prediction interval for a new observation rather than a CI for the centre.
    halfWidth = critical_value * (stdv * math.sqrt(1 + 1 / n))

    return (POPmedian - halfWidth, POPmedian + halfWidth)

In [80]:
# this looks roughly like it's working 
CIs = calcPI(medianRideSampels['Median'])
CIs

(-8.61341776643161, 30.61341776643161)

In [81]:
# Empirical coverage: the share of resampled medians that fall inside the
# interval, with both bounds counted as inside.
lowerBound, upperBound = CIs[0], CIs[1]
insideInterval = medianRideSampels['Median'].between(lowerBound, upperBound)
withinPI = medianRideSampels[insideInterval]
len(withinPI) / len(medianRideSampels)

0.9344

old

In [None]:
def ltTen(val):
    """Flag whether a value is at most 10.

    Returns 1 when ``val <= 10`` and 0 otherwise. A value that is neither
    ``<= 10`` nor ``> 10`` (such as NaN) yields 0 instead of raising.
    """
    # A single two-way expression guarantees a result for every input,
    # including NaN, for which both comparisons are False.
    return 1 if val <= 10 else 0

# Tally how many rental counts are at most 10 (flag 1) versus above 10 (flag 0).
# NOTE(review): `fullMDR_dat` is not defined in the cells visible here — confirm
# where it is supposed to come from before running this cell.
atMostTenFlags = fullMDR_dat['CountRentals'].apply(ltTen)
atMostTenFlags.value_counts()

In [None]:
def calcRentalCountCI(medianSampleSeries, critical_value=1.96):
    """Return a symmetric confidence interval centred on the series' median.

    The interval is ``median ± critical_value * s / sqrt(n)`` where ``s`` is
    ``medianSampleSeries.std()`` and ``n`` is ``len(medianSampleSeries)``.

    Parameters
    ----------
    medianSampleSeries : pandas.Series
        Numeric observations.
    critical_value : float, default 1.96
        Multiplier applied to the standard error. The default 1.96 is the
        two-sided 95% critical value of the standard normal distribution
        (a z value, not a t value); pass a different value to change the
        confidence level.

    Returns
    -------
    tuple of (lower, upper)
        The lower and upper bounds of the interval.
    """
    stdv = medianSampleSeries.std()
    n = len(medianSampleSeries)
    POPmedian = medianSampleSeries.median()

    # Half-width of the interval: critical value times the standard error.
    halfWidth = critical_value * (stdv / math.sqrt(n))

    return (POPmedian - halfWidth, POPmedian + halfWidth)