In [None]:
# Install the third-party packages this notebook imports. `sys.executable` is the
# interpreter running this kernel, so `-m pip` installs into the kernel's own
# environment rather than whichever `pip` happens to be first on PATH.
# The leading `!` is an IPython shell escape; these lines are not plain Python.
import sys
!{sys.executable} -m pip install pymongo
!{sys.executable} -m pip install pyyaml
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install scipy
!{sys.executable} -m pip install pandas
import os
# Print the kernel's current working directory.
print(os.getcwd())



In [33]:
# Get Mongo database
# The connection URI and the database name both come from `getsecret`, so neither
# is hard-coded in this notebook.
from yaml import load  # NOTE(review): `load` is not referenced in the visible cells — confirm before removing
from pymongo import MongoClient
from getsecret import getsecret  # project-local helper; how it stores secrets is not visible here

client = MongoClient(getsecret("MONGODB_URI"))
# `db` is the handle later cells index as db[user + "_sessions"] to read per-user sessions.
db = client[getsecret("DB_NAME")]

In [34]:
# Get all synced accounts and their respective users.
# Later cells read account["android"] and account["browser"] as lists of user ids.
import urllib.request as req
import json

# Use the response as a context manager so the HTTP connection is closed as soon
# as the body has been read, instead of lingering until garbage collection.
with req.urlopen("http://localhost:5000/synced_emails") as response:
    accounts = json.loads(response.read().decode("utf-8"))

In [None]:
# Organize sessions by goal and frequent vs. infrequent for Android
# Resulting shape: goals[domain] == {"freq": [session, ...], "infreq": [session, ...]}
goals = {}
print("Beginning")
for account in accounts:
    for user in account["android"]:
        print(user)
        enabled_sessions = db[user + "_sessions"].find({"enabled": True})
        for session in enabled_sessions:
            domain = session["domain"]
            # Create the two buckets the first time a domain is seen.
            buckets = goals.setdefault(domain, {"freq":[], "infreq":[]})
            bucket_key = "freq" if session["frequent"] else "infreq"
            buckets[bucket_key].append(session)

In [None]:
import matplotlib.pyplot as pyplot
from statistics import mean, median
from scipy.stats import sem, norm 
# Now show graphs for each domain to compare frequent sessions vs infrequent sessions.

# Two-sided 95% critical value of the standard normal (~1.96). The previously
# hard-coded 1.64 is the one-sided 95% value, which only yields a 90% two-sided
# interval and so contradicted the "95% conf intervals" criterion used below.
Z_95 = norm.ppf(0.975)

# One summary record per domain that has at least one session under each setting.
domain_stats = []
for domain in goals:
    freq_sessions = goals[domain]["freq"]
    infreq_sessions = goals[domain]["infreq"]
    if len(freq_sessions) > 0 and len(infreq_sessions) > 0:
        freq_durations = [session["duration"] for session in freq_sessions]
        infreq_durations = [session["duration"] for session in infreq_sessions]
        domain_stats.append({"domain": domain,
                             "freq": mean(freq_durations),
                             "infreq": mean(infreq_durations),
                             "freq_pts": len(freq_sessions),
                             "infreq_pts": len(infreq_sessions),
                             "freq_err": sem(freq_durations),
                             "infreq_err": sem(infreq_durations)})

# Sort once, after collection (ascending by total number of sessions), instead of
# re-sorting the whole list every time a domain is appended.
domain_stats = sorted(domain_stats, key=lambda stat: int(stat["freq_pts"] + stat["infreq_pts"]))

# `counter` numbers the matplotlib figures; a later cell keeps incrementing it.
counter = 0
for stat in domain_stats:
    print(int(stat["freq_pts"] + stat["infreq_pts"]))
    print(stat['domain'] + " with " + str(stat["freq_pts"]) + " freq points " + " and " + str(stat["infreq_pts"]) + " infreq pts")
    print("freq avg: " + str(stat["freq"]))
    print("infreq avg: " + str(stat["infreq"]))
    print("")
    # If there is a statistically significant difference (95% conf intervals don't overlap), we will show the graph.
    freq_margin = Z_95 * stat["freq_err"]
    infreq_margin = Z_95 * stat["infreq_err"]
    freq_clearly_higher = stat["freq"] - freq_margin > stat["infreq"] + infreq_margin
    infreq_clearly_higher = stat["infreq"] - infreq_margin > stat["freq"] + freq_margin
    if freq_clearly_higher or infreq_clearly_higher:
        pyplot.figure(counter)
        counter += 1
        pyplot.title(stat["domain"])
        pyplot.ylabel('Average session duration')
        pyplot.bar(["sessions with frequent setting", "sessions with infrequent setting"], [stat["freq"], stat["infreq"]] , .8, yerr=[freq_margin, infreq_margin])
    

In [None]:
# Per (user, goal) average session duration under each setting, collected only for
# goals with more than 20 sessions under BOTH settings. Index i of "freq" and index i
# of "infreq" refer to the same user/goal pair.
improvement_stats = {"freq": [], "infreq": []}
for account in accounts:
    for user in account["android"]:
        # We now want to see if they have goals that have alternated from freq to infreq.
        # Kept under its own name: rebinding the global `goals` here replaced the
        # {"freq": [...], "infreq": [...]} structure the plotting cell reads, so
        # re-running that cell after this one failed.
        user_goal_totals = {}
        for session in db[user + "_sessions"].find({"enabled": True}):
            domain = session["domain"]
            if domain not in user_goal_totals:
                user_goal_totals[domain] = {"freq_duration":0, "freq_count": 0, "infreq_duration":0, "infreq_count":0}
            totals = user_goal_totals[domain]
            if session["frequent"]:
                totals["freq_duration"] += session["duration"]
                totals["freq_count"] += 1
            else:
                totals["infreq_duration"] += session["duration"]
                totals["infreq_count"] += 1
        # Compute averages
        for domain in user_goal_totals:
            totals = user_goal_totals[domain]
            print(str(totals["freq_count"]) + " " + str(totals["infreq_count"]) )
            if totals["freq_count"] > 20 and totals["infreq_count"] > 20:
                freq_avg = totals["freq_duration"]/totals["freq_count"]
                infreq_avg = totals["infreq_duration"]/totals["infreq_count"]
                improvement_stats["freq"].append(freq_avg)
                improvement_stats["infreq"].append(infreq_avg)


In [None]:
from scipy.stats import ttest_ind
# Compare the per-goal average durations under the two settings with an
# independent two-sample t-test.
# NOTE(review): the two lists look paired (same user/goal at each index), so a
# paired test may be more appropriate — confirm with whoever owns the analysis.
freq_averages = improvement_stats["freq"]
infreq_averages = improvement_stats["infreq"]
print(len(freq_averages))
print(improvement_stats)
results = ttest_ind(freq_averages, infreq_averages)
print(results)

In [None]:

# Now let's analyze which interventions are most effective.
import matplotlib.pyplot as pyplot
# interventions[name] == {"total": summed session duration, "count": number of sessions}
interventions = {}
for account in accounts:
    for user in account["android"]:
        for session in db[user + "_sessions"].find({"enabled": True}):
            # Sessions without an "interventions" field contribute nothing.
            for intervention in session.get("interventions", []):
                intervention_name = intervention["intervention"]
                tally = interventions.setdefault(intervention_name, {"total": 0, "count": 0})
                tally["total"] += session["duration"]
                tally["count"] += 1

# One bar per intervention: mean duration of the sessions it was assigned to.
intervention_names = list(interventions)
average_durations = [interventions[name]["total"]/interventions[name]["count"] for name in intervention_names]
pyplot.figure(counter)
counter += 1
pyplot.title('Average Session Duration When Assigned Interventions')
pyplot.ylabel('Average Session Duration (s)')
pyplot.xticks(rotation=90)
pyplot.bar(intervention_names, average_durations)


In [None]:
# Let's get the distribution of session lengths (hopefully log-norm for LMM)
import math
# One row per enabled session longer than 5 that carries both an "interventions"
# and a "frequent" field. Key order matters: the CSV export cell writes the values
# in insertion order.
session_durations = []
for account in accounts:
    for user in account["android"]:
        for session in db[user + "_sessions"].find({"enabled": True}):
            if not session["duration"] > 5:
                continue
            if "interventions" not in session or "frequent" not in session:
                continue
            row = {
                "frequent": session["frequent"],
                "duration": math.log(session["duration"]),  # natural log of the raw duration
                "interventions": session["interventions"],
                "user": user,
                "goal": session["domain"],
            }
            session_durations.append(row)
"""
pyplot.figure(counter)
counter += 1
pyplot.hist([session["duration"] for session in session_durations], bins=50)
from scipy.stats import shapiro
print(shapiro([session["duration"] for session in session_durations]))
"""

In [None]:
# Ehh.... our Shapiro test didn't fare too well, but we'll export to a CSV and let Geza run an LMM anyway.
import csv
# Header labels, in the same order as the keys of each row in `session_durations`.
csv_header = ['frequent', 'duration (log(s))', 'interventions', 'user', 'goal']
with open('sessions_for_intervention.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(csv_header)
    # Each row's values are written in the dict's insertion order.
    writer.writerows(list(session.values()) for session in session_durations)

In [None]:
# Now, let's see which version is responsible for the super low sessions (it was a bug)
# version_counts[version] == {"small": sessions with duration < 1, "all": every session}
version_counts = {}
versioned_sessions_query = {"enabled": True, "version": {"$exists": True}}
for account in accounts:
    for user in account["android"]:
        for session in db[user + "_sessions"].find(versioned_sessions_query):
            version = session["version"]
            tally = version_counts.setdefault(version, {"small": 0, "all": 0})
            tally["all"] += 1
            if session["duration"] < 1:
                tally["small"] += 1
# Report the fraction of sub-1 sessions for each version.
small_fractions = []
for version, tally in version_counts.items():
    small_fractions.append(str(version) + ":" + str(tally["small"]/tally["all"]) + " ")
print(small_fractions)


In [None]:
# Now, let's do some clean up.
# ABORT: VERY FEW HABITLAB SESSIONS. First, we should remove 0-length sessions. They really shouldn't happen.
# Next, we should also remove sessions that are longer than a day. That was because of a bug in my code.



In [None]:
# Find number of users who didn't make it past onboarding.
# A user counts as having quit when no "<user>_sessions" collection exists.
num_users = 0
num_quit_users = 0
# A set gives constant-time membership checks; the list was rescanned for every user.
collections = set(db.list_collection_names())
for account in accounts:
    for user in account["android"]:
        num_users += 1
        if user+"_sessions" not in collections:
            num_quit_users += 1
print("Total Android users: " + str(num_users))
print("Android users who didn't make it past onboarding: " + str(num_quit_users))
# Guard the ratio: with no Android users the division raised ZeroDivisionError.
if num_users > 0:
    print("% of quitters: " + str(num_quit_users/num_users))
else:
    print("% of quitters: n/a (no Android users)")

In [None]:
import pandas
from statistics import mean
# Analyze conservation within Android
# For each user, average the number of sessions per active day, then average
# those per-user means across all users that have at least one session.
total_sessions_per_day = 0
count_sessions_per_day = 0
for account in accounts:
    for user in account["android"]:
        sessions_per_day = {}
        for session in db[user + "_sessions"].find():
            ts = pandas.Timestamp(ts_input=session["timestamp"], unit="ms")
            # Key by the calendar date itself. Concatenating str(year)+str(month)+str(day)
            # was ambiguous: 2018-1-11 and 2018-11-1 both produced "2018111" and were
            # counted as the same day.
            day = ts.date()
            sessions_per_day[day] = sessions_per_day.get(day, 0) + 1
        if sessions_per_day:
            total_sessions_per_day += mean(list(sessions_per_day.values()))
            count_sessions_per_day += 1
# Guard the final ratio: with no sessions at all the division raised ZeroDivisionError.
if count_sessions_per_day > 0:
    print(str(total_sessions_per_day/count_sessions_per_day))
else:
    print("No Android users with recorded sessions")
        
        

In [None]:
ANDROID = "android"
BROWSER = "browser"
SHARED = "shared"
# Prefix of custom browser goals, e.g. "custom/spend_less_time_gizmodo.com".
CUSTOM_GOAL_PREFIX = "custom/spend_less_time_"
SPEND_LESS_TIME_LENGTH = len(CUSTOM_GOAL_PREFIX)
# Associate users with domain name which will function as our key.
def get_name(name, device):
    """
    Reduce a goal identifier to a short key that can be compared across devices.

    @param name: goal name (package name for Android)
    @param device: "android" or "browser"
    @return name of goal with subdomains removed and goal annotation removed (i.e. spend_less_time)
    """
    name = name.lower()
    if device == ANDROID:
        # Drop the generic "android"/"google" segments, then take the second
        # remaining segment (the first is normally the TLD-style prefix, e.g. "com").
        subs = [part for part in name.split('.') if part not in ("android", "google")]
        if not subs:
            # Every segment was filtered out (e.g. "android"); fall back to the
            # lowercased name instead of raising IndexError.
            return name
        if len(subs) < 2:
            return subs[0]
        return subs[1]
    # must be custom browser goal
    # Match the prefix exactly: a plain substring test for "custom" also caught
    # normal goals such as "customink/spend_less_time" and sliced them to garbage.
    if name.startswith(CUSTOM_GOAL_PREFIX):
        subs = name[SPEND_LESS_TIME_LENGTH:].split('.')
        if len(subs) < 2:
            return subs[0]
        # Second-to-last segment, i.e. the domain without subdomains or TLD.
        return subs[-2]
    # Must be normal browser goal
    return name.split('/spend')[0]


def organize_stats(shared_goals, stats, device):
    """
    Organizes that stats object into shared_goals for device.

    For every goal in stats, records shared_goals[key][device][goal] = freq, where
    key is get_name(goal, device). A key is added to shared_goals[SHARED] once, the
    first time it has goals recorded for both ANDROID and BROWSER.

    @param shared_goals: dictionary, mutated in place
    @param stats:  stats object returned from freq_stats; iterated as stats[iso][freq] -> goals
    @param device: ANDROID or BROWSER
    """
    shared_names = shared_goals.setdefault(SHARED, [])
    for iso in stats:
        for freq in stats[iso]:
            for goal in stats[iso][freq]:
                name = get_name(goal, device)
                if name not in shared_goals:
                    shared_goals[name] = {ANDROID: {}, BROWSER: {}}
                per_device = shared_goals[name]
                per_device[device][goal] = freq
                # Record each shared name only once; previously it was appended again
                # for every further goal seen after both devices were populated.
                if per_device[ANDROID] and per_device[BROWSER] and name not in shared_names:
                    shared_names.append(name)

"""
Unit Tests:
print(get_name("custom/spend_less_time_gizmodo.com", BROWSER))
print(get_name("facebook/spend_less_time", BROWSER))
print(get_name("com.duolingo", ANDROID))
print(get_name("com.android.chrome", ANDROID))
"""


In [None]:
# Analyze conservation across devices
def _fetch_json(url):
    """Fetch url and decode its UTF-8 JSON body, closing the connection afterwards."""
    with req.urlopen(url) as response:
        return json.loads(response.read().decode("utf-8"))


# shared_data[name] collects, per account, the {ANDROID: {...}, BROWSER: {...}} goal
# mapping for every goal name that account tracks on both devices.
shared_data = {}
for account in accounts:
    shared_goals = {SHARED: []}
    if len(account[ANDROID]) > 0 and len(account[BROWSER]) > 0: # It's possible they have overlapping goals.
        for user in account[ANDROID]:
            stats = _fetch_json("http://localhost:5000/freq_stats_for_user?id="+user)
            organize_stats(shared_goals, stats, ANDROID)
        for user in account[BROWSER]:
            stats = _fetch_json('http://localhost:5000/freq_stats_for_user_browser?id='+user)
            organize_stats(shared_goals, stats, BROWSER)
        # dict.fromkeys drops repeated names while keeping first-seen order.
        for name in dict.fromkeys(shared_goals[SHARED]):
            # Cool. Let's put them in the goals
            # The loop originally had no body, which made the whole cell a SyntaxError.
            # TODO(review): this only collects the raw per-device mappings; the actual
            # cross-device comparison still needs to be written.
            shared_data.setdefault(name, []).append(shared_goals[name])

In [22]:
# Let's find the ten most active users!!
# Invariant: `ten_most_active_users` is always sorted ascending by session count,
# so index 0 is the current minimum and the right entry to evict.
ten_most_active_users = []
for account in accounts:
    for user in account["android"]:
        # Count on the server instead of loading every session document just to
        # take the length of the list.
        num_sessions = db[user + "_sessions"].count_documents({})
        if len(ten_most_active_users) < 10 or ten_most_active_users[0]["sessions"] < num_sessions:
            user_obj = {"user": user, "sessions": num_sessions}
            print(user_obj)
            if (len(ten_most_active_users) < 10):
                ten_most_active_users.append(user_obj)
            else:
                ten_most_active_users[0] = user_obj
            # Re-sort after EVERY insertion. Sorting only in the replace branch left
            # the first ten entries unsorted, so slot 0 was not the minimum and a
            # large entry could be evicted while smaller ones stayed in the list.
            ten_most_active_users = sorted(ten_most_active_users, key=lambda entry: entry["sessions"])

{'user': '9cb8c59eabbb60ad6447345c', 'sessions': 538}
{'user': 'b219926d16e6e46fa426cafd', 'sessions': 591}
{'user': 'U1532437453047928', 'sessions': 1844}
{'user': '010cf6179f52b3d62184ee17', 'sessions': 4301}
{'user': '0b0d1da5c036cb9637aff5d5', 'sessions': 1836}
{'user': 'f6c5e87bbe828ffc4278d4b7', 'sessions': 0}
{'user': '38a4b3e06ead18a13442f5a7', 'sessions': 1143}
{'user': 'f463f9d5191a4bc1d1374298', 'sessions': 52}
{'user': 'c4f80e629327886902828290', 'sessions': 0}
{'user': '7f2cc2756b7c8c658a86ce6c', 'sessions': 0}
{'user': 'ec0dbd76f4093d26176ba153', 'sessions': 1099}
{'user': '3d9e01f13145754f3ee22cdb', 'sessions': 3249}
{'user': 'c9991373fd8cd5ce69292511', 'sessions': 1303}
{'user': '741de6a9c48bdd2ec81e68cd', 'sessions': 348}
{'user': '87cd8b98991c55a2a3c0e93f', 'sessions': 5988}
{'user': 'e91d2715bdf89bad899ace3b', 'sessions': 4426}
{'user': 'bd64f8d80507c8b24a33cd88', 'sessions': 1444}
{'user': '034217b8528a36106114f877', 'sessions': 2658}
{'user': 'c1dac05a9d1ce5b860630

In [23]:
# Show the list of most active users built by the previous cell.
print(ten_most_active_users)

[{'user': '4aeda8a5b8cdebecaac7dd65', 'sessions': 4547}, {'user': '23cab28916f19d8853ad97a2', 'sessions': 4798}, {'user': 'a06f3a950a4d20e8dafb1aee', 'sessions': 4878}, {'user': '9aeba07f2c133e01cdc3de97', 'sessions': 5155}, {'user': '82614dc534d555ee4a365901', 'sessions': 5623}, {'user': '8a24231eb598b3e8f082c816', 'sessions': 5892}, {'user': '87cd8b98991c55a2a3c0e93f', 'sessions': 5988}, {'user': 'abfd029cb3f787f0a8f09dae', 'sessions': 8202}, {'user': 'bab02945c9756596a03659d4', 'sessions': 9026}, {'user': '421619f90f941a563deb6eb9', 'sessions': 10346}]


In [38]:
# For each of the most active users, record per goal the set of ISO weeks spent
# under each setting and the total session duration:
# user_goal_data[user][domain] == {"freq": {weeks}, "infreq": {weeks}, "total_time": n}
user_goal_data = {}
goal_sessions_query = {"enabled": True, "frequent": {"$exists": True}, "isoWeek": {"$exists": True}}
for user_obj in ten_most_active_users:
    user = user_obj["user"]
    # Kept under its own name: rebinding the global `goals` here replaced the
    # structure built by the earlier grouping cell with one of a different shape.
    weeks_by_goal = {}
    for session in db[user + "_sessions"].find(goal_sessions_query):
        domain = session["domain"]
        if domain not in weeks_by_goal:
            weeks_by_goal[domain] = {"freq": set([]), "infreq": set([]), "total_time": 0}
        weeks_by_goal[domain]["total_time"] += session["duration"]
        weeks_by_goal[domain]["freq" if session["frequent"] else "infreq"].add(session["isoWeek"])
    user_goal_data[user] = weeks_by_goal
print(user_goal_data)

{'4aeda8a5b8cdebecaac7dd65': {'com.GameCoaster.DungeonMaker': {'freq': {32}, 'infreq': set(), 'total_time': 15037}, 'com.android.chrome': {'freq': set(), 'infreq': {32, 33}, 'total_time': 40080}, 'com.google.android.apps.inbox': {'freq': {32, 33}, 'infreq': set(), 'total_time': 2203}, 'com.facebook.katana': {'freq': {32}, 'infreq': {33}, 'total_time': 12069}, 'com.rayark.sdorica': {'freq': set(), 'infreq': {32}, 'total_time': 3911}, 'com.amazon.kindle': {'freq': {32}, 'infreq': {33}, 'total_time': 111}, 'com.instagram.android': {'freq': set(), 'infreq': {32, 33}, 'total_time': 3330}, 'com.google.android.youtube': {'freq': {33}, 'infreq': {32}, 'total_time': 11390}, 'com.ideashower.readitlater.pro': {'freq': {33}, 'infreq': {32}, 'total_time': 2916}, 'com.google.android.apps.books': {'freq': set(), 'infreq': {32}, 'total_time': 31}}, '23cab28916f19d8853ad97a2': {'com.android.chrome': {'freq': {32}, 'infreq': set(), 'total_time': 79}, 'com.sparkslab.dcardreader': {'freq': {32}, 'infreq':

In [45]:
# most_active_alternating[user] == {"goal": name, "total_time": n} for the goal with
# the largest total time among those seen under BOTH settings; the goal is ""
# (and total_time 0) when the user has no such goal.
most_active_alternating = {}
for user in user_goal_data:
    # find most active app that switched from freq to infreq
    goal_data = user_goal_data[user]
    # Track the running maximum per user. These were reset inside the goal loop, so
    # nothing was ever compared across goals, and the `most_acive_site` typo meant
    # the winner was never stored.
    most_active_site = ""
    most_active_time = 0
    for goal in goal_data:
        alternates = len(goal_data[goal]["freq"]) > 0 and len(goal_data[goal]["infreq"]) > 0
        if alternates and goal_data[goal]["total_time"] > most_active_time:
            most_active_time = goal_data[goal]["total_time"]
            most_active_site = goal
        print(str(goal) + " " + str(goal_data[goal]["total_time"]))
    most_active_alternating[user] = {"goal": most_active_site, "total_time": most_active_time}
print(most_active_alternating)
    

com.GameCoaster.DungeonMaker 15037
com.android.chrome 40080
com.google.android.apps.inbox 2203
com.facebook.katana 12069
com.rayark.sdorica 3911
com.amazon.kindle 111
com.instagram.android 3330
com.google.android.youtube 11390
com.ideashower.readitlater.pro 2916
com.google.android.apps.books 31
com.android.chrome 79
com.sparkslab.dcardreader 1170
jp.naver.line.android 3511
com.google.android.gm 1744
com.facebook.orca 1838
com.facebook.katana 8304
com.evernote 12449
com.devhd.feedly 1249
com.google.android.youtube 765
flipboard.app 267
com.machipopo.swag 1321
com.instagram.android 1346
com.facebook.orca 1864
com.facebook.katana 2446
com.android.chrome 790
com.instagram.android 57
com.google.android.youtube 240
com.facebook.katana 12773
com.snapchat.android 14062
com.facebook.orca 11335
com.netflix.mediaclient 1493
com.google.android.youtube 72469
com.twitter.android 63091
com.tencent.mm 65640
jp.naver.line.android 9147
com.tencent.mobileqq 641
com.android.chrome 4941
com.google.android.