In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd
from datetime import timedelta
import tqdm
import pickle

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
touch_headers = pd.read_csv('../data/raw/acquia_samples/scripts-headers/touch_headers.txt', delimiter='\t')
touch_columns = touch_headers.columns

In [None]:
touch_columns

In [None]:
# Explicit dtypes are needed because type inference goes wrong for several columns:
#   * some string columns (utm_...) look like floats at first because they are all NA
#   * dma_code looks like an int at first but contains NA
#   * postal_code is sometimes an int (US) and sometimes a str (UK), so str is the generic choice
touch_dtypes = {
    'utm_medium': str,
    'utm_terms': str,
    'utm_content': str,
    'utm_name': str,
    'url_domain': str,
    'dma_code': str,
    'postal_code': str,
    'search_terms': str,
    'touch_duration.1': 'float64',
}
touch_date_columns = ['touch_date', 'db_last_modified_date', 'db_last_modified_date.1']

# Lazily read the tab-separated touch table with dask and index it by the touch id.
touch = dd.read_csv(
    'Z:\\touch_fixed',
    delimiter='\t',
    encoding='utf-8',
    dtype=touch_dtypes,
    parse_dates=touch_date_columns,
    names=touch_columns,
).set_index('id')

In [None]:
# Number of rows in the touch table.
len(touch)

In [None]:
# Name of the column at position 20.
touch.columns[20]

In [None]:
# Distinct person ids that appear in the touch table, materialised out of dask.
touch_person_ids = touch.person_id.unique().compute()

In [None]:
len(touch_person_ids)

In [None]:
# Sanity check: de-duplicating the already-unique ids again should give the same length as above.
len(touch_person_ids.unique())

In [None]:
touch_index = touch.index.compute()
to_delete = pd.Series(index=touch_index, dtype=bool)
dupes = pd.Series(index=touch_index, dtype=object)

In [None]:
len(dupes)

In [None]:
# De-duplicate touches per person: a touch whose timestamp is within `fuzz` of a touch
# already kept for the same person is flagged in `to_delete` and recorded in the kept
# touch's `dupes` list; otherwise it is kept and starts its own list.
fuzz = pd.Timedelta('00:00:01')

# Pull the two columns the loop needs out of dask once. Filtering the dask frame and calling
# .compute() inside the loop re-evaluated the whole table for every single person.
touch_times = touch[['person_id', 'touch_date']].compute()
# sort=False keeps the groups cheap to build; rows inside each group keep their original order,
# which matters because the first touch seen in a cluster is the one that is kept.
touches_by_person = touch_times.groupby('person_id', sort=False)['touch_date']

for id_p, person_times in tqdm.tqdm(touches_by_person, total=touches_by_person.ngroups):
    times = []  # (touch id, timestamp) of the touches kept so far for this person
    for idx, this_time in person_times.items():
        for prev_idx, t in times:
            if abs(t - this_time) < fuzz:
                to_delete.at[idx] = True
                dupes.at[prev_idx].append(idx)
                break
        else:
            # No kept touch is close enough: keep this one. The flag is written explicitly so the
            # result does not depend on how `to_delete` was initialised.
            to_delete.at[idx] = False
            times.append((idx, this_time))
            dupes.at[idx] = [idx]

In [None]:
touch_reduced = touch[['touch_date', 'person_id', 'state']].compute()

In [None]:
touch_reduced

In [None]:
touch_person_ids = touch_reduced.person_id.unique()

In [None]:
len(touch_person_ids)

In [None]:
counts_by_user = touch_reduced.person_id.value_counts()

In [None]:
counts_by_user

In [None]:
counts_of_counts = counts_by_user.value_counts()

In [None]:
plt.plot(counts_of_counts, '.')
plt.yscale('log')

In [None]:
counts_of_counts

In [None]:
appears_more_than_once = (counts_by_user[touch_reduced['person_id']] > 1)

In [None]:
appears_more_than_once.index = touch_reduced.index

In [None]:
touch_reduced_multiple = touch_reduced[appears_more_than_once]

In [None]:
touch_reduced_multiple

In [None]:
touch_person_ids = touch_reduced_multiple.person_id.unique()

In [None]:
len(touch_person_ids)

In [None]:
# Per-touch bookkeeping for the de-duplication pass over touch_reduced_multiple.
# Start every flag at an explicit False. A Series built from an index alone gets an implicit
# fill value, and the loop reads the flag before ever writing it for kept touches, so the
# initial value must not be left to whatever default the installed pandas picks.
to_delete = pd.Series(False, index=touch_reduced_multiple.index, dtype=bool)
# Object dtype so each kept touch can hold a Python list of touch ids; entries start as NaN.
dupes = pd.Series(index=touch_reduced_multiple.index, dtype=object)

In [None]:
# De-duplicate touches per person: a touch whose timestamp is within `fuzz` of a touch
# already kept for the same person is flagged in `to_delete` and recorded in the kept
# touch's `dupes` list; otherwise it is kept and starts its own list.
fuzz = pd.Timedelta('00:00:01')

# Group once instead of building a boolean mask over the whole frame for every person
# (which costs persons x rows comparisons). sort=False keeps grouping cheap; rows inside
# each group keep their original order, which matters because the first touch seen in a
# cluster is the one that is kept.
touches_by_person = touch_reduced_multiple.groupby('person_id', sort=False)['touch_date']

for id_p, person_times in tqdm.tqdm(touches_by_person, total=touches_by_person.ngroups):
    times = []  # (touch id, timestamp) of the touches kept so far for this person
    # Iterate the timestamps directly; iterrows() would build a full Series for every row.
    for idx, this_time in person_times.items():
        for prev_idx, t in times:
            if abs(t - this_time) < fuzz:
                to_delete.at[idx] = True
                dupes.at[prev_idx].append(idx)
                break
        else:
            # No kept touch is close enough: keep this one. The flag is written explicitly so the
            # result does not depend on how `to_delete` was initialised.
            to_delete.at[idx] = False
            times.append((idx, this_time))
            dupes.at[idx] = [idx]

In [None]:
# Number of touches flagged as duplicates.
to_delete.sum()

In [None]:
# Number of entries in dupes that hold no list; the loop only assigns a list to touches it keeps.
dupes.isna().sum()

In [None]:
# Write the duplicate lists to dupes.csv in the current working directory.
dupes.to_csv('dupes.csv')

In [None]:
touch_reduced_deduped = touch_reduced_multiple[~to_delete].copy()

In [None]:
counts_by_user_deduped = touch_reduced_deduped.person_id.value_counts()

In [None]:
appears_more_than_once_deduped = (counts_by_user_deduped[touch_reduced_deduped['person_id']] > 1)
appears_more_than_once_deduped.index = touch_reduced_deduped.index

In [None]:
touch_reduced_deduped = touch_reduced_deduped[appears_more_than_once_deduped].copy()

In [None]:
touch_reduced_deduped

In [None]:
counts_by_user_deduped = touch_reduced_deduped.person_id.value_counts()

In [None]:
people_appear_more_than_once = touch_reduced_deduped.person_id.unique()

In [None]:
with open('person_ids_appear_more_than_once.pkl', 'wb') as file:
    pickle.dump(people_appear_more_than_once, file)

In [None]:
counts_by_user_deduped

In [None]:
counts_of_counts_deduped = counts_by_user_deduped.value_counts()

In [None]:
counts_of_counts_deduped

In [None]:
plt.hist(counts_by_user_deduped)
plt.yscale('log')

In [None]:
touch_reduced_deduped

In [None]:
# Persist the duplicate lists (one list of touch ids per kept touch, NaN elsewhere).
dupes.to_pickle('../data/processed/dupes_lists.pkl')

In [None]:
# Persist the boolean mask that was built over touch_reduced (before de-duplication).
appears_more_than_once.to_pickle('../data/processed/touch_person_appears_more_than_once.pkl')

In [None]:
# Persist the touch ids that remain after de-duplication and the second filter.
with open('../data/processed/touch_deduped_index_naive.pkl', 'wb') as f:
    pickle.dump(touch_reduced_deduped.index, f)

In [None]:
touch_hour = pd.DatetimeIndex(touch_reduced_deduped['touch_date']).hour

In [None]:
hours = touch_hour.value_counts()

In [None]:
plt.bar(hours.index, hours.values)
plt.title('At what time does ILAO get the most traffic?')
plt.xlabel('Time of day')
plt.tight_layout()
plt.savefig('touches_by_hour.pdf', dpi=500)
plt.savefig('touches_by_hour.png', dpi=500)
plt.show()

In [None]:
n = 10
appears_more_than_n = (counts_by_user_deduped[touch_reduced_deduped['person_id']] > n)
appears_more_than_n.index = touch_reduced_deduped.index

In [None]:
appears_more_than_n.sum()

In [None]:
len(appears_more_than_n)

In [None]:
hours_large_users = touch_hour[appears_more_than_n].value_counts()
hours_small_users = touch_hour[~appears_more_than_n].value_counts()

In [None]:
plt.bar(hours_large_users.index, hours_large_users.values)
plt.title('Traffic from users with > 10 touches')
plt.xlabel('Time of day')
plt.tight_layout()
plt.savefig('touches_by_hour_large_users.pdf', dpi=500)
plt.savefig('touches_by_hour_large_users.png', dpi=500)
plt.show()

In [None]:
plt.bar(hours_small_users.index, hours_small_users.values)
plt.title('Traffic from users with 2-10 touches')
plt.xlabel('Time of day')
plt.tight_layout()
plt.savefig('touches_by_hour_small_users.pdf', dpi=500)
plt.savefig('touches_by_hour_small_users.png', dpi=500)
plt.show()

In [None]:
n = 100
appears_more_than_n = (counts_by_user_deduped[touch_reduced_deduped['person_id']] > n)
appears_more_than_n.index = touch_reduced_deduped.index

In [None]:
appears_more_than_n.sum()

In [None]:
hours_xl_users = touch_hour[appears_more_than_n].value_counts()

In [None]:
plt.bar(hours_xl_users.index, hours_xl_users.values)
plt.title('Traffic from users with > 10 touches')
plt.xlabel('Time of day')
plt.tight_layout()
plt.savefig('touches_by_hour_xl_users.pdf', dpi=500)
plt.savefig('touches_by_hour_xl_users.png', dpi=500)
plt.show()

In [None]:
counts_by_user_deduped[counts_by_user_deduped > 10]

In [None]:
person_identifier = dd.read_csv('Z:\\person_identifier.tsv', delimiter='\t', 
                                names=['person_id', 'customer_id', 'identifier',
                                       'identifier_type', 'person_id_2', 'active', 'last_modified',
                                       'db_last_modified'])

In [None]:
person_has_email = person_identifier[person_identifier['identifier_type']==1].compute()

In [None]:
person_has_email.head()

In [None]:
users_with_email = pd.Series(counts_by_user_deduped[counts_by_user_deduped > 10].index).isin(person_has_email.person_id)

In [None]:
users_with_email

In [None]:
np.sum(users_with_email)

In [None]:
counts_by_user_deduped

In [None]:
# Range of person ids seen in the touch data (max, then min).
touch_reduced_deduped.person_id.max()

In [None]:
touch_reduced_deduped.person_id.min()

In [None]:
# Range of the person_id column of the identifier table, for comparison with the range above.
person_ids_from_person = person_identifier.person_id.compute()

In [None]:
person_ids_from_person.max()

In [None]:
person_ids_from_person.min()

In [None]:
# Same check for the person_id_2 column; note that this rebinds person_ids_from_person.
person_ids_from_person = person_identifier.person_id_2.compute()

In [None]:
person_ids_from_person.max()

In [None]:
person_ids_from_person.min()

In [None]:
# Same membership test as the earlier users_with_email cell, but matching the ids of
# persons with more than 10 touches against person_id_2 instead of person_id.
users_with_email = pd.Series(counts_by_user_deduped[counts_by_user_deduped > 10].index).isin(person_has_email.person_id_2)

In [None]:
users_with_email

In [None]:
# Number of type-1 identifier rows.
len(person_has_email)

In [None]:
# Number of persons with more than 10 touches that were matched.
np.sum(users_with_email)

In [None]:
# NOTE(review): hard-coded ratio, presumably copied from the outputs of earlier cells
# (matched users / some total). Confirm which total 16145 refers to and compute the ratio
# from the variables so that it stays correct on re-run.
1385/16145

In [None]:
# Distinct identifier_type values present in the identifier table.
person_identifier.identifier_type.unique().compute()

In [None]:
# Peek at the first rows of the identifier table.
person_identifier.head()

Some thoughts: next, we would like to understand more about these touches and start understanding the relationships between different touches. However, for this we need to take a more detailed look at the deduping algorithm: right now it simply keeps one touch from each group of near-simultaneous touches (the first one encountered) and discards the rest, so we lose information this way.

`dupes` holds, for each touch that was kept, the list of touch ids in its duplicate group, so we can use it to go through the groups and keep only the most relevant information from each.

In [None]:
# Duplicate lists built by the de-duplication loop, keyed by touch id.
dupes

In [None]:
# First rows of the full touch table, with all columns.
touch.head()