In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd
from datetime import timedelta
import tqdm
import pickle
import math

# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Columns that are forced to str instead of being left to dtype inference.
event_text_columns = ['account_id', 'content_section.1', 'download_link', 'role',
                      'get-legal-help', 'keywords', 'Outgoing_link', 'Language',
                      'legal_position']

# Lazily read every TSV shard matching the glob into one dask DataFrame.
event_unindexed = dd.read_csv('Z:\\event_selected\\*.tsv', sep='\t',
                              dtype=dict.fromkeys(event_text_columns, str))

# Use event_id as the index of the dask frame.
event = event_unindexed.set_index('event_id')

In [None]:
# Columns read as str; 'touch_duration.1' is pinned to float64.
touch_text_columns = ['utm_medium', 'utm_terms', 'utm_content', 'utm_name', 'url_domain',
                      'dma_code', 'postal_code', 'search_terms']
touch_dtypes = dict.fromkeys(touch_text_columns, str)
touch_dtypes['touch_duration.1'] = 'float64'

# Load the touch table into memory, indexed by its 'id' column.
touch = pd.read_csv('Z:\\touch_multiple_deduped.tsv', sep='\t',
                    dtype=touch_dtypes, index_col='id')

In [None]:
# Preview the first five rows of the touch table.
touch.head(n=5)

In [None]:
# Parse touch_date into timestamps, then keep only the hour-of-day component.
touch_timestamps = pd.DatetimeIndex(touch['touch_date'])
touch['touch_hour'] = touch_timestamps.hour

In [None]:
# Flag touches whose hour falls in 14..21 inclusive (i.e. strictly between 13 and 22).
# NOTE(review): the window presumably maps to business hours in the timezone
# touch_date is recorded in — confirm which timezone that is.
touch['business_hours'] = touch['touch_hour'].between(14, 21)

In [None]:
# Work on an independent copy holding only the columns needed for the per-person profile.
profile_columns = ['person_id', 'state', 'platform', 'person_identifier_id', 'business_hours']
touch_selected = touch.loc[:, profile_columns].copy()

In [None]:
# True for touches whose platform is exactly 'DESKTOP'.
touch_selected['is_desktop'] = touch['platform'].eq('DESKTOP')

In [None]:
# Display the selected touch columns, including the derived is_desktop flag.
touch_selected

In [None]:
# Group the selected touch rows per person.
touch_grouped = touch_selected.groupby(by='person_id')

In [None]:
# Per-person mean of the numeric/boolean columns; for business_hours and
# is_desktop this is the share of that person's touches where the flag is True.
# numeric_only=True is required on pandas >= 2.0: the grouped frame still carries
# non-numeric columns (e.g. platform), and averaging those now raises TypeError
# instead of silently dropping them as older pandas versions did.
touch_busydesk = touch_grouped.mean(numeric_only=True)

In [None]:
# A person is tagged "professional" when more than half of their touches were in
# business hours AND more than half were on desktop.
mostly_business_hours = touch_busydesk['business_hours'].gt(.5)
mostly_on_desktop = touch_busydesk['is_desktop'].gt(.5)
is_professional = mostly_business_hours & mostly_on_desktop

In [None]:
# Number of distinct people (one row per person_id in the grouped means).
n_ppl = touch_busydesk.shape[0]

In [None]:
# Share of people tagged professional (mean of a boolean mask = fraction of True).
is_professional.mean()

In [None]:
# Share of people with more than half of their touches in business hours.
touch_busydesk['business_hours'].gt(.5).mean()

In [None]:
# Share of people with more than half of their touches on desktop.
touch_busydesk['is_desktop'].gt(.5).mean()

In [None]:
# person_id values (the index of the grouped frame) of everyone tagged professional.
professional_rows = touch_busydesk.loc[is_professional]
professionals = professional_rows.index.values

In [None]:
# Touch rows belonging to people tagged professional.
touch_by_professional = touch['person_id'].isin(professionals)
professional_touches = touch.loc[touch_by_professional]

In [None]:
# Fraction of all touch rows that come from people tagged professional.
professional_touches.shape[0] / touch.shape[0]

In [None]:
# Keep only events whose person_id is in the professional list, then
# materialize the lazy dask result via .compute().
event_by_professional = event['person_id'].isin(professionals)
professional_events = event[event_by_professional].compute()

In [None]:
# Persist the professional events as a tab-separated file.
professional_events_path = "Z:\\event_professionals.tsv"
professional_events.to_csv(professional_events_path, sep='\t')

In [None]:
# Number of events from people tagged professional.
professional_events.shape[0]

In [None]:
# Complement of the professional filter: every event whose person_id is NOT in
# the professional list (this also covers person_ids absent from the touch table).
event_not_by_professional = ~event['person_id'].isin(professionals)
nonprofessional_events = event[event_not_by_professional].compute()

In [None]:
# Persist the non-professional events as a tab-separated file.
nonprofessional_events_path = "Z:\\event_nonprofessionals.tsv"
nonprofessional_events.to_csv(nonprofessional_events_path, sep='\t')

In [None]:
# Number of events from everyone not tagged professional.
nonprofessional_events.shape[0]

In [None]:
# True for people with more than half of their touches on desktop.
mostly_desktop = touch_busydesk['is_desktop'].gt(.5)

In [None]:
# person_id values (the index of the grouped frame) of the mostly-desktop people.
desktop_rows = touch_busydesk.loc[mostly_desktop]
desktop_ppl = desktop_rows.index.values

In [None]:
# Keep only events whose person_id is in the mostly-desktop list, then
# materialize the lazy dask result via .compute().
event_by_desktop_person = event['person_id'].isin(desktop_ppl)
desktop_events = event[event_by_desktop_person].compute()

In [None]:
# Persist the desktop-people events as a tab-separated file.
desktop_events_path = "Z:\\event_desktop_ppl.tsv"
desktop_events.to_csv(desktop_events_path, sep='\t')

In [None]:
# Number of events from mostly-desktop people.
desktop_events.shape[0]

In [None]:
# Complement of the desktop filter: every event whose person_id is NOT in the
# mostly-desktop list (this also covers person_ids absent from the touch table).
event_not_by_desktop_person = ~event['person_id'].isin(desktop_ppl)
mobile_events = event[event_not_by_desktop_person].compute()

In [None]:
# Persist the remaining (non-desktop) events as a tab-separated file.
mobile_events_path = "Z:\\event_mobile_ppl.tsv"
mobile_events.to_csv(mobile_events_path, sep='\t')

In [None]:
# Number of events from everyone not in the mostly-desktop list.
mobile_events.shape[0]

In [None]:
# Persist the per-person averages (indexed by person_id) as a tab-separated file.
people_profile_path = "Z:\\people_business_desktop.tsv"
touch_busydesk.to_csv(people_profile_path, sep='\t')