In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd
from datetime import timedelta
import tqdm
import pickle
import math

# Do not truncate wide DataFrames when displaying them: None removes the column limit.
pd.set_option('display.max_columns', None)

In [None]:
# Read the tab-separated header file; its column labels become the column
# names passed to the touch table reader.
touch_headers = pd.read_csv(
    '../data/raw/acquia_samples/scripts-headers/touch_headers.txt',
    sep='\t',
)
touch_columns = touch_headers.columns

In [None]:
# Type inference guesses wrong for several columns, so their dtypes are given explicitly:
#   * the utm_... columns are strings, but look like floats at first because they are all NA;
#   * dma_code looks like an int at first but contains NA;
#   * postal_code is sometimes an int (US) and sometimes a str (UK), so str is the generic choice.
touch = dd.read_csv(
    'Z:\\touch_fixed.tsv',
    delimiter='\t',
    encoding='utf-8',
    names=touch_columns,
    parse_dates=['touch_date', 'db_last_modified_date', 'db_last_modified_date.1'],
    dtype={
        **dict.fromkeys(
            ['utm_medium', 'utm_terms', 'utm_content', 'utm_name', 'url_domain',
             'dma_code', 'postal_code', 'search_terms'],
            str,
        ),
        'touch_duration.1': 'float64',
    },
).set_index('id')

In [None]:
# Load the precomputed selector that is used below to filter `touch`.
# NOTE(review): presumably a boolean Series indexed by touch id, since it is used as a row mask — confirm.
appears_more_than_once = pd.read_pickle('../data/processed/touch_person_appears_more_than_once.pkl')

In [None]:
# Inspect the selector's index (to compare against the index of `touch`).
appears_more_than_once.index

In [None]:
# Inspect the index of the dask frame (set to the 'id' column when it was read).
touch.index

In [None]:
# Keep only the rows of `touch` selected by appears_more_than_once.
# NOTE(review): relies on the selector lining up with touch's 'id' index — confirm.
touch_multiple = touch[appears_more_than_once]

In [None]:
# Number of selected rows. touch_multiple is a lazy dask frame, so len() has to evaluate it.
len(touch_multiple)

In [None]:
# Remove the twenty columns custom_field1 ... custom_field20.
touch_multiple = touch_multiple.drop(
    ['custom_field{}'.format(n) for n in range(1, 21)],
    axis=1,
)

In [None]:
# Load the precomputed duplicate groups. Later cells treat each element as
# either a list of touch ids or nan.
# NOTE(review): pickle.load can execute arbitrary code; only load this file from a trusted source.
with open('../data/processed/dupes_lists.pkl', 'rb') as f:
    dupes_lists = pickle.load(f)

In [None]:
# Display the loaded duplicate groups.
dupes_lists

In [None]:
# Materialise two specific touch ids as a small pandas frame to use as a worked example.
# NOTE(review): presumably these two ids are duplicates of one another — confirm.
a_repeat = touch_multiple.loc[[3881904223, 3881904227]].compute()

In [None]:
# Display the example rows.
a_repeat

In [None]:
# Show the page-view counts of the example rows.
# (The original line was missing its closing parenthesis and did not parse.)
print(a_repeat['number_of_page_views'])

In [None]:
# Index label (touch id) of the example row with the most page views.
a_repeat['number_of_page_views'].idxmax()

In [None]:
# Number of entries in dupes_lists (list entries and nan entries alike).
len(dupes_lists)

In [None]:
# Print the rows of the first ten duplicate groups that have more than two members.
ex = 0  # number of example groups printed so far
for dupe_group in dupes_lists:
    # Elements of dupes_lists are either lists or nan; nan has no len().
    # Only the len() call is guarded, so a TypeError raised while computing or
    # printing the rows is no longer silently swallowed.
    try:
        group_size = len(dupe_group)
    except TypeError:
        continue
    if group_size > 2:
        ex += 1
        print(touch_multiple.loc[dupe_group].compute())
        if ex == 10:
            break

In [None]:
# Flatten every duplicate group with more than two members into one list of ids.
# Groups of exactly two ids are not included.
all_the_dupes = []
for dupe_group in dupes_lists:
    # Elements of dupes_lists are either lists or nan; nan has no len().
    # Only the len() call is guarded so unrelated TypeErrors are not hidden.
    try:
        group_size = len(dupe_group)
    except TypeError:
        continue
    if group_size > 2:
        all_the_dupes += dupe_group

In [None]:
# One slot per entry of dupes_lists (sharing its index); the loop in the next cell
# fills in the first id of each duplicate group.
# NOTE(review): no data is passed, so confirm which initial values/dtype pandas
# produces for an int64 Series here.
id_deduped = pd.Series(index=dupes_lists.index, dtype='int64')

In [None]:
# Label every id in a duplicate group with the first id of that group.
for dupe_group in tqdm.tqdm(dupes_lists):
    # Elements of dupes_lists are either lists or nan; nan is not iterable.
    # Only the iter() call is guarded, so a TypeError raised by the assignment
    # itself is no longer silently swallowed.
    try:
        members = iter(dupe_group)
    except TypeError:
        continue
    for touch_id in members:
        id_deduped.loc[touch_id] = dupe_group[0]

In [None]:
# Attach the group label to the dask frame as a new column, used as the groupby key below.
touch_multiple['id_deduped_prelim'] = id_deduped 

In [None]:
# Per customer_id, the index label of the example row with the most page views.
# pandas has no top-level `pd.idxmax`; the aggregation is requested by name instead.
# NOTE(review): assumes a_repeat has a 'customer_id' column — confirm.
a_repeat.groupby('customer_id').agg({'number_of_page_views': 'idxmax'})

In [None]:
# a_repeat was materialised before 'id_deduped_prelim' was attached to
# touch_multiple, so fetch the example rows again to pick up the new column
# (otherwise the groupby raises KeyError on a fresh top-to-bottom run).
a_repeat = touch_multiple.loc[[3881904223, 3881904227]].compute()
# Select the column before calling idxmax so it is only evaluated for
# number_of_page_views, not for every column.
a_repeat.groupby('id_deduped_prelim')['number_of_page_views'].idxmax()

In [None]:
# Order the example rows from most to fewest page views, so that a 'first'
# aggregation picks the row with the most page views.
a_repeat = a_repeat.sort_values('number_of_page_views', ascending=False)

In [None]:
# Display the example rows after sorting.
a_repeat

In [None]:
# How to collapse a duplicate group into one row: take the first value of every
# column, except the duration / page-view measures, which take the group maximum.
aggregation_dict = {col: 'first' for col in a_repeat.columns}
# Aggregations are named by string ('max') rather than passing the builtin
# `max`, which pandas only accepts as a deprecated alias for the same thing.
aggregation_dict.update(dict.fromkeys(
    ['touch_duration', 'touch_duration.1', 'number_of_page_views', 'touch_duration_in_seconds'],
    'max',
))

# Try the aggregation on the small example frame first.
a_repeat.groupby('id_deduped_prelim').agg(aggregation_dict)

In [None]:
# Peek at the first rows of the dask frame, which now includes id_deduped_prelim.
touch_multiple.head()

In [None]:
# Apply the same aggregation to the full dask frame: one row per id_deduped_prelim group.
# NOTE(review): unlike a_repeat, touch_multiple is not sorted by page views here,
# so 'first' takes whichever row dask encounters first in each group — confirm this is intended.
touch_deduped = touch_multiple.groupby('id_deduped_prelim').agg(aggregation_dict)

In [None]:
# Record, per group, the index label (touch id) of the row with the most page views.
# idxmax is evaluated for every column and number_of_page_views is picked afterwards.
# NOTE(review): a later remark in this notebook says this selection is not always
# correct; the ids are recomputed in pandas further down.
touch_deduped['id'] = touch_multiple.groupby('id_deduped_prelim').idxmax()['number_of_page_views']

In [None]:
# Write the de-duplicated dask frame as tab-separated text.
# NOTE(review): touch_deduped is a dask frame; check whether this produces a single
# file or a set of per-partition part files (see dask's `single_file` option).
touch_deduped.to_csv('Z:\\touch_deduped.tsv', sep='\t')

In [None]:
# Materialise the de-duplicated dask result as an in-memory pandas DataFrame.
touch_deduped_pd = touch_deduped.compute()

In [None]:
# Display the materialised de-duplicated table.
touch_deduped_pd

In [None]:
# After de-duplication, keep only touches whose person still has more than one touch.
counts_by_user_deduped = touch_deduped_pd.person_id.value_counts()
# map() looks up each row's person_id in the counts and keeps the row index, so
# the mask lines up with touch_deduped_pd without reassigning its index. Rows with
# a missing person_id map to NaN and are excluded rather than failing the lookup.
appears_more_than_once_deduped = touch_deduped_pd['person_id'].map(counts_by_user_deduped) > 1
touch_deduped_pd = touch_deduped_pd[appears_more_than_once_deduped].copy()
touch_deduped_pd

In [None]:
# Index the de-duplicated touches by 'id' (the touch id chosen per group).
# The original cell also called drop('id_deduped_prelim', inplace=True) without
# axis=1, which looks for a ROW labelled 'id_deduped_prelim' and raises KeyError;
# the column is dropped correctly (with axis=1) in the following cell.
touch_deduped_pd.set_index('id', inplace=True)

In [None]:
# The helper grouping column is no longer needed; remove it in place.
touch_deduped_pd.drop(columns=['id_deduped_prelim'], inplace=True)

In [None]:
# Save the filtered, de-duplicated pandas frame as tab-separated text.
touch_deduped_pd.to_csv('Z:\\touch_multiple_deduped.tsv', sep='\t')

In [None]:
# Display the frame that was just written.
touch_deduped_pd

In [None]:
# Spot-check: look up one of the two example ids used for a_repeat in the de-duplicated table.
touch_deduped_pd.loc[3881904223]

In [None]:
# Spot-check another touch id in the de-duplicated table.
touch_deduped_pd.loc[2502639089]

In [None]:
# Per id_deduped_prelim group, the index label (touch id) of the row with the
# most page views, computed by dask and materialised as a pandas Series.
# idxmax is evaluated for every column and number_of_page_views is picked afterwards.
right_ids = touch_multiple.groupby('id_deduped_prelim').idxmax()['number_of_page_views'].compute()

In [None]:
# Display the ids selected by dask.
right_ids

In [None]:
# Inspect the source row(s) for touch id 3881872905.
touch_multiple.loc[3881872905].compute()

In [None]:
# Inspect the source row(s) for touch id 3881872901, to compare with 3881872905.
touch_multiple.loc[3881872901].compute()

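The id-selection code above (`groupby('id_deduped_prelim').idxmax()` on the dask frame) is somehow not always returning the right id :/ The cells below recompute the ids in pandas and correct the mismatches.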

In [None]:
# Pull just the page-view column into pandas (indexed by touch id) so the
# per-group maximum can be recomputed without dask.
touch_page_views = touch_multiple[['number_of_page_views']].compute()

In [None]:
# Display the id -> group-label mapping built earlier.
id_deduped

In [None]:
# Container for the recomputed ids: one slot per distinct value in id_deduped.
# NOTE(review): no data is passed, so confirm which initial values/dtype pandas
# produces for an int64 Series here, and whether id_deduped.unique() can include NaN.
right_ids_again = pd.Series(index=id_deduped.unique(), dtype='int64')

In [None]:
# Display the (still unfilled) container.
right_ids_again

In [None]:
# Recompute, in pandas, the id with the most page views for every duplicate group.
for dupe_group in tqdm.tqdm(dupes_lists):
    # Elements of dupes_lists are either lists or nan; nan cannot be subscripted.
    # Only the subscript is guarded, so a TypeError raised by the pandas lookup
    # below is no longer silently swallowed.
    try:
        id_ded = dupe_group[0]  # the group's label: its first id
    except TypeError:
        continue
    # Index label of the group's row with the largest number_of_page_views.
    id_max = touch_page_views.loc[dupe_group].idxmax().values[0]
    right_ids_again.loc[id_ded] = id_max

In [None]:
# Groups where the pandas recomputation disagrees with the dask result.
# NOTE(review): an element-wise Series comparison needs both Series to be
# identically labelled — confirm the two indexes line up.
right_ids_again[right_ids_again != right_ids]

In [None]:
# For every group where the two computations disagree, map the id chosen by
# dask (right_ids) to the id recomputed in pandas (right_ids_again).
replace_dict = dict(zip(
    right_ids[right_ids_again != right_ids].values,
    right_ids_again[right_ids_again != right_ids].values,
))

In [None]:
# Display the old-id -> corrected-id mapping.
replace_dict

In [None]:
# Relabel the affected rows in place: index labels found in replace_dict are
# swapped for the corrected id; all other labels are left unchanged.
touch_deduped_pd.rename(index=replace_dict, inplace=True)

In [None]:
# Display the table after correcting the ids.
touch_deduped_pd

In [None]:
# No cell visible in this notebook creates a 'new_id' column, so an
# unconditional drop raises KeyError on a fresh top-to-bottom run.
# Remove the column only if it is present.
touch_deduped_pd.drop(columns='new_id', errors='ignore', inplace=True)

In [None]:
# Overwrite the earlier export with the id-corrected table (same path as before).
touch_deduped_pd.to_csv('Z:\\touch_multiple_deduped.tsv', sep='\t')

In [None]:
# Load the pickled person ids that appear more than once.
# NOTE: this rebinds appears_more_than_once, which earlier in the notebook held
# the object loaded from touch_person_appears_more_than_once.pkl.
appears_more_than_once = pd.read_pickle('../data/processed/person_ids_appear_more_than_once.pkl')

In [None]:
# Re-export the same data as plain text next to the pickle.
# NOTE(review): np.savetxt's default format writes floating-point scientific
# notation; confirm that is acceptable for these ids.
np.savetxt('../data/processed/person_ids_appear_more_than_once.csv', appears_more_than_once)