In [3]:
import sys
!{sys.executable} -m pip install pymongo
!{sys.executable} -m pip install pyyaml
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install scipy
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install moment
!{sys.executable} -m pip install statsmodels
import os
print(os.getcwd())
# Get Mongo database
from yaml import load
from pymongo import MongoClient
from getsecret import getsecret

client = MongoClient(getsecret("MONGODB_URI"))
db = client[getsecret("DB_NAME")]
# Get all synced accounts and their respective users.
import urllib.request as req
import json
accounts = json.loads(req.urlopen("http://localhost:5000/synced_emails").read().decode("utf-8"))

/Users/drewgregory/JupyterNotebooks/habitlab-data-analysis


In [4]:
# Get all sessions:
sessions = {}
for account in accounts:
    for user in account["android"]:
        for session in db[user + "_sessions"].find({"enabled": {"$exists": True}}):
            if user not in sessions:
                sessions[user] = {}
            domain = session["domain"]
            if domain not in sessions[user]:
                sessions[user][domain] = []
            sessions[user][domain].append(session)

In [6]:
# Now, check out if the sessions follow any distribution
session_durations = []
for user in sessions:
    for goal in sessions[user]:
        session_durations.extend([session["duration"] for session in sessions[user][goal]])

In [10]:
%matplotlib inline

import warnings
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels as sm
import matplotlib
import matplotlib.pyplot as plt

def best_fit_distribution(data, bins=200, ax=None):
    """Model data by finding best fit distribution to data"""
    # Get histogram of original data
    y, x = np.histogram(data, bins=bins, density=True)
    x = (x + np.roll(x, -1))[:-1] / 2.0

    # Distributions to check
    DISTRIBUTIONS = [        
        st.alpha,st.anglit,st.arcsine,st.beta,st.betaprime,st.bradford,st.burr,st.cauchy,st.chi,st.chi2,st.cosine,
        st.dgamma,st.dweibull,st.erlang,st.expon,st.exponnorm,st.exponweib,st.exponpow,st.f,st.fatiguelife,st.fisk,
        st.foldcauchy,st.foldnorm,st.frechet_r,st.frechet_l,st.genlogistic,st.genpareto,st.gennorm,st.genexpon,
        st.genextreme,st.gausshyper,st.gamma,st.gengamma,st.genhalflogistic,st.gilbrat,st.gompertz,st.gumbel_r,
        st.gumbel_l,st.halfcauchy,st.halflogistic,st.halfnorm,st.halfgennorm,st.hypsecant,st.invgamma,st.invgauss,
        st.invweibull,st.johnsonsb,st.johnsonsu,st.ksone,st.kstwobign,st.laplace,st.levy,st.levy_l,st.levy_stable,
        st.logistic,st.loggamma,st.loglaplace,st.lognorm,st.lomax,st.maxwell,st.mielke,st.nakagami,st.ncx2,st.ncf,
        st.nct,st.norm,st.pareto,st.pearson3,st.powerlaw,st.powerlognorm,st.powernorm,st.rdist,st.reciprocal,
        st.rayleigh,st.rice,st.recipinvgauss,st.semicircular,st.t,st.triang,st.truncexpon,st.truncnorm,st.tukeylambda,
        st.uniform,st.vonmises,st.vonmises_line,st.wald,st.weibull_min,st.weibull_max,st.wrapcauchy
    ]

    # Best holders
    best_distribution = st.norm
    best_params = (0.0, 1.0)
    best_sse = np.inf

    # Estimate distribution parameters from data
    for distribution in DISTRIBUTIONS:
        print(distribution)

        # Try to fit the distribution
        try:
            # Ignore warnings from data that can't be fit
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore')

                # fit dist to data
                params = distribution.fit(data)

                # Separate parts of parameters
                arg = params[:-2]
                loc = params[-2]
                scale = params[-1]

                # Calculate fitted PDF and error with fit in distribution
                pdf = distribution.pdf(x, loc=loc, scale=scale, *arg)
                sse = np.sum(np.power(y - pdf, 2.0))

                # if axis pass in add to plot
                try:
                    if ax:
                        pd.Series(pdf, x).plot(ax=ax)
                    end
                except Exception:
                    pass

                # identify if this distribution is better
                if best_sse > sse > 0:
                    best_distribution = distribution
                    best_params = params
                    best_sse = sse

        except Exception:
            pass

    return (best_distribution.name, best_params)

In [11]:
print(str(best_fit_distribution(session_durations)))

<scipy.stats._continuous_distns.alpha_gen object at 0x12d629390>
<scipy.stats._continuous_distns.anglit_gen object at 0x12d6295f8>
<scipy.stats._continuous_distns.arcsine_gen object at 0x12d629c18>
<scipy.stats._continuous_distns.beta_gen object at 0x12d629be0>
<scipy.stats._continuous_distns.betaprime_gen object at 0x12d629eb8>
<scipy.stats._continuous_distns.bradford_gen object at 0x12d6351d0>
<scipy.stats._continuous_distns.burr_gen object at 0x12d635438>
<scipy.stats._continuous_distns.cauchy_gen object at 0x12d635c18>
<scipy.stats._continuous_distns.chi_gen object at 0x12d635eb8>
<scipy.stats._continuous_distns.chi2_gen object at 0x12d642128>
<scipy.stats._continuous_distns.cosine_gen object at 0x12d2fdc50>
<scipy.stats._continuous_distns.dgamma_gen object at 0x12d642908>
<scipy.stats._continuous_distns.dweibull_gen object at 0x12d6427f0>
<scipy.stats._continuous_distns.erlang_gen object at 0x12d6e4630>
<scipy.stats._continuous_distns.expon_gen object at 0x12d642a58>
<scipy.stats.

In [17]:
# WALD PARAMS: (-1435368.1650060418, 4743374.936502483)
from scipy.stats import wald, kstest
params = wald.fit(session_durations)
kstest(session_durations, 'wald', params)

KstestResult(statistic=0.8233338587441226, pvalue=0.0)

In [31]:
from statistics import mean
import moment
# what about total time per app per user per day??
sessions_per_day = []
for user in sessions:
    for goal in sessions[user]:
        prev_day = ""
        app_time_day = 0
        for session in sessions[user][goal]:
            day = moment.unix(session["timestamp"]).format("YYYY-MM-DD")
            if day != prev_day:
                if prev_day != "":
                    app_time_day += session["duration"]
                    sessions_per_day.append(app_time_day)
                    app_time_day = 0
                prev_day = day

In [32]:
print(sessions_per_day)
print(str(best_fit_distribution(sessions_per_day)))

[8, 0, 13, 7, 0, 1, 40, 21, 919, 1, 1, 1, 1, 3, 23, 86, 7, 27, 54, 5, 4, 35, 10, 272, 19, 4, 173, 82, 341, 65, 48, 146, 33, 63, 1, 0, 2, 183, 4, 3, 40, 12, 1659, 19, 122, 1, 9, 212, 16, 129, 6, 5, 29, 4, 13, 12, 81, 36, 13, 35, 19, 5, 124, 27, 12, 4, 92, 105, 44, 50, 21, 76, 1, 18, 12, 1, 27, 30, 37, 60, 466, 9, 3, 3, 45, 2, 8, 8, 8, 865, 28, 74, 139, 82, 24, 59, 90, 30, 3, 19, 109, 8, 605, 129, 1259, 28647, 32678, 1, 31417, 3, 2, 3, 2, 10, 2, 4, 3, 11, 1, 1, 4, 6, 1, 10, 1, 2541, 23, 0, 21, 15, 6, 16, 25, 28, 119, 22, 50, 15, 32, 69, 1, 315, 270, 29, 8, 19, 1, 9, 8, 7, 6, 7, 7, 2, 0, 1, 21, 12, 22, 36, 14, 9, 7, 20, 2, 18, 0, 22, 9, 15, 11, 12, 105, 7, 6, 5, 6, 29, 17, 1, 60, 17, 10, 23, 3, 24, 170, 12, 5, 19, 4, 57, 12, 68, 19, 134, 61, 5, 6, 10, 23, 12, 7, 1, 19, 40, 44, 49, 3, 1, 70, 81, 35, 5, 20, 6, 10, 11, 15, 8, 14, 39, 224, 3, 134, 34, 89, 7, 0, 691, 28, 248, 81, 21, 22, 11, 2, 43, 43, 9, 7, 6, 136, 18, 11, 73, 5, 204, 924, 81, 49, 11, 4, 14, 42, 106, 90, 15, 60, 48, 26, 9, 8,

<scipy.stats._continuous_distns.beta_gen object at 0x12d629be0>
<scipy.stats._continuous_distns.betaprime_gen object at 0x12d629eb8>
<scipy.stats._continuous_distns.bradford_gen object at 0x12d6351d0>
<scipy.stats._continuous_distns.burr_gen object at 0x12d635438>
<scipy.stats._continuous_distns.cauchy_gen object at 0x12d635c18>
<scipy.stats._continuous_distns.chi_gen object at 0x12d635eb8>
<scipy.stats._continuous_distns.chi2_gen object at 0x12d642128>
<scipy.stats._continuous_distns.cosine_gen object at 0x12d2fdc50>
<scipy.stats._continuous_distns.dgamma_gen object at 0x12d642908>
<scipy.stats._continuous_distns.dweibull_gen object at 0x12d6427f0>
<scipy.stats._continuous_distns.erlang_gen object at 0x12d6e4630>
<scipy.stats._continuous_distns.expon_gen object at 0x12d642a58>
<scipy.stats._continuous_distns.exponnorm_gen object at 0x12d642c88>
<scipy.stats._continuous_distns.exponweib_gen object at 0x12d642f60>
<scipy.stats._continuous_distns.exponpow_gen object at 0x12d6475c0>
<scip

In [33]:
kstest(session_durations, 'gompertz', (28810381.7784205, -6.364605262441564e-05, 10550639404.663002))

KstestResult(statistic=0.7141411023892895, pvalue=0.0)