## Questions

* How can we effectively put a prediction interval (PI) around the median Lifetime Rental Count?
* Should the PI / confidence interval (CI) be continually recalculated using the new "population data", or could that conceal a trend over time?

* [CI Resource](https://onlinecourses.science.psu.edu/stat414/node/261)
* [PI Resource](http://www.propharmagroup.com/blog/understanding-statistical-intervals-part-2-prediction-intervals/)

In [86]:
import pandas as pd
import datetime
import math

import matplotlib.pyplot as plt
import matplotlib.lines as mlines
%matplotlib inline

In [89]:
# Load the CSV export and derive calendar month / year columns from each row's start_time.
mdrDat_init = pd.read_csv('sql_runner_MDRS_2018-04-25.csv')

# Parse start_time once and reuse it; the vectorised .dt accessor replaces the
# per-row apply(lambda ...) calls and avoids converting the column twice.
start_times = pd.to_datetime(mdrDat_init['start_time'])
mdrDat_init['MoNum'] = start_times.dt.month
mdrDat_init['YrNum'] = start_times.dt.year
mdrDat_init.columns

Index(['rentalid', 'accountid', 'accounttype', 'subscriptionid',
       'subscriptiontypegroup', 'discpartner', 'rentalaccessmethod',
       'countrentals', 'bikedisplay', 'bikeid', 'startstation_id',
       'rental_status', 'start_time', 'end_time', 'calcend_time',
       'nextdockerror_ts', 'minbiid', 'minbibid', 'minbirid', 'minbikeintime',
       'MoNum', 'YrNum'],
      dtype='object')

In [90]:
# Preview only the first rows instead of rendering the whole frame.
mdrDat_init.head(10)

Unnamed: 0,rentalid,accountid,accounttype,subscriptionid,subscriptiontypegroup,discpartner,rentalaccessmethod,countrentals,bikedisplay,bikeid,...,start_time,end_time,calcend_time,nextdockerror_ts,minbiid,minbibid,minbirid,minbikeintime,MoNum,YrNum
0,21800194,627413,Member,814960.0,Annual,NYCHA New Members,Bike key,13,05334,20176,...,2015-06-13 02:11:24,2015-06-18 13:47:13,2015-06-18 13:47:13,2015-06-18 13:46:52,2707582.0,20176.0,21800194.0,2015-06-18 13:47:13,6,2015
1,30972922,811658,Member,1146001.0,Annual,Unknown,Bike key,180,11249,22989,...,2016-01-28 22:49:40,,2016-01-30 00:12:40,,11459019.0,22989.0,,2016-01-30 00:12:40,1,2016
2,29526727,878539,Casual,1263618.0,1-Day,Unknown,Release code - Credit card,2,02140,15463,...,2015-12-25 16:37:20,2015-12-27 17:01:40,2015-12-27 17:01:40,2015-12-25 19:16:29,10788284.0,15463.0,29526727.0,2015-12-27 17:01:40,12,2015
3,21951117,656017,Casual,863887.0,7-Day,Unknown,Release code - Credit card,1,03819,20020,...,2015-06-17 18:01:16,2015-06-20 21:05:20,2015-06-20 21:05:20,,2805895.0,20020.0,21951117.0,2015-06-20 21:05:20,6,2015
4,26314774,8943,Member,845834.0,Annual,Unknown,Bike key,381,03819,20020,...,2015-09-29 23:41:21,2015-10-02 17:55:55,2015-10-02 17:55:55,2015-09-29 23:45:04,7470149.0,20020.0,26314774.0,2015-10-02 17:55:55,9,2015
5,32382213,936050,Casual,1360028.0,1-Day,Unknown,Release code - Credit card,5,04024,20587,...,2016-04-18 22:17:17,2016-04-20 13:05:57,2016-04-20 13:05:57,2016-04-18 22:25:02,13912523.0,20587.0,32382213.0,2016-04-20 13:05:57,4,2016
6,38189398,828505,Member,1174695.0,Annual,CDCU New Members,Bike key,622,03003,15116,...,2016-08-16 15:11:47,,2016-08-17 17:49:43,,20114111.0,15116.0,,2016-08-17 17:49:43,8,2016
7,39736976,1159690,Casual,1772722.0,1-Day,Unknown,Release code - Credit card,9,00916,21117,...,2016-09-13 00:26:54,2016-09-30 02:16:10,2016-09-30 02:16:10,2016-09-13 05:13:21,22899652.0,21117.0,39736976.0,2016-09-30 02:16:10,9,2016
8,53467197,1467469,Member,2319319.0,1-Day,Unknown,Release code - Mobile,0,25430,29016,...,2017-07-18 20:53:43,,,,,,,,7,2017
9,25820855,58888,Member,942852.0,Annual,Unknown,Bike key,511,01690,20603,...,2015-09-20 18:21:51,2015-09-25 03:49:12,2015-09-25 03:49:12,2015-09-20 18:24:30,7108525.0,20603.0,25820855.0,2015-09-25 03:49:12,9,2015


In [78]:
# Repeatedly draw random subsets of the rentals and record the median and the
# standard deviation of `countrentals` for each draw.
N_RESAMPLES = 1000  # number of random draws
SAMPLE_SIZE = 89    # rows per draw
RANDOM_SEED = 42    # fixed so that Restart & Run All reproduces the same table

results = []

for loop in range(N_RESAMPLES):
    # A distinct but deterministic seed per draw keeps the draws different from
    # one another while making the whole cell repeatable.
    sample = mdrDat_init.sample(n=SAMPLE_SIZE, axis=0, random_state=RANDOM_SEED + loop)
    medianRidesb4MDR = sample['countrentals'].median()
    stdRidesb4MDR = sample['countrentals'].std()
    results.append([loop, medianRidesb4MDR, stdRidesb4MDR])

# Name the columns at construction time rather than patching them afterwards.
medianRideSampels = pd.DataFrame(results, columns=['Loop', 'Median', 'Stdev'])
medianRideSampels.head(10)

Unnamed: 0,Loop,Median,Stdev
0,0,3.0,266.569817
1,1,4.0,217.963518
2,2,7.0,335.378065
3,3,22.0,9273.226534
4,4,19.0,292.310965
5,5,12.0,244.635526
6,6,5.0,255.995164
7,7,7.0,265.011010
8,8,9.0,379.701761
9,9,23.0,528.258552


In [79]:
def calcPI(medianSampleSeries, critical_value=2.04227):
    """Return a symmetric prediction interval around the median of a series.

    The interval is ``median +/- critical_value * s * sqrt(1 + 1/n)``, where
    ``s`` is the sample standard deviation and ``n`` the number of
    observations in ``medianSampleSeries``.

    Parameters
    ----------
    medianSampleSeries : pandas.Series
        Observations to build the interval from.
    critical_value : float, optional
        Multiplier applied to ``s * sqrt(1 + 1/n)``. The default, 2.04227, is
        the two-sided 95% Student-t critical value for 30 degrees of freedom
        (it is not 1.96, the large-sample normal value). Pass the value that
        matches ``n - 1`` degrees of freedom for a differently sized series.

    Returns
    -------
    tuple of float
        ``(lower, upper)`` bounds of the interval.
    """
    stdv = medianSampleSeries.std()
    n = len(medianSampleSeries)
    centre = medianSampleSeries.median()
    # The "1 +" under the root is what makes this a prediction interval (for a
    # new observation) rather than a confidence interval (s / sqrt(n)).
    margin = critical_value * (stdv * math.sqrt(1 + 1/n))

    return (centre - margin, centre + margin)

In [80]:
# Quick sanity check of calcPI on the resampled medians: (lower, upper) bounds.
# The interval is symmetric around the median, so the lower bound can extend
# below zero even though rental counts cannot.
median_samples = medianRideSampels['Median']
CIs = calcPI(median_samples)
CIs

(-8.61341776643161, 30.61341776643161)

In [81]:
# Share of resampled medians that fall inside the interval (bounds inclusive).
pi_lower, pi_upper = CIs
inside_mask = medianRideSampels['Median'].between(pi_lower, pi_upper)
withinPI = medianRideSampels[inside_mask]
len(withinPI) / len(medianRideSampels)

0.9344

old

In [None]:
def ltTen(val):
    """Flag whether ``val`` is at most ten.

    Returns 1 when ``val <= 10`` and 0 otherwise. Despite the name, the
    comparison is inclusive (10 itself yields 1). Values for which the
    comparison is false in both directions, such as NaN, yield 0 instead of
    raising ``UnboundLocalError`` as the two-branch version did.
    """
    return 1 if val <= 10 else 0

# Tally how many CountRentals values are flagged 1 versus 0 by ltTen.
# NOTE(review): `fullMDR_dat` is not defined in the visible part of this
# notebook; confirm where it is loaded before running this cell.
at_most_ten_flags = fullMDR_dat['CountRentals'].apply(ltTen)
at_most_ten_flags.value_counts()

In [None]:
def calcRentalCountCI(medianSampleSeries, critical_value=1.96):
    """Return a symmetric confidence interval around the median of a series.

    The interval is ``median +/- critical_value * s / sqrt(n)``, where ``s``
    is the sample standard deviation and ``n`` the number of observations in
    ``medianSampleSeries``.

    Parameters
    ----------
    medianSampleSeries : pandas.Series
        Observations to build the interval from.
    critical_value : float, optional
        Multiplier applied to ``s / sqrt(n)``. The default, 1.96, is the
        two-sided 95% critical value of the standard normal distribution
        (a z value, not a t value).

    Returns
    -------
    tuple of float
        ``(lower, upper)`` bounds of the interval.
    """
    stdv = medianSampleSeries.std()
    n = len(medianSampleSeries)
    centre = medianSampleSeries.median()
    # NOTE(review): s / sqrt(n) is the standard error of the mean, yet the
    # interval is centred on the median; confirm this approximation is intended.
    margin = critical_value * (stdv / math.sqrt(n))

    return (centre - margin, centre + margin)