In [None]:
import math
import pickle
from datetime import timedelta
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Load the mobile event log (tab-separated). The listed columns are forced to
# str, event_date is parsed as a datetime and event_id becomes the index.
mobile_str_columns = ['account_id', 'content_section.1', 'download_link', 'role',
                      'get-legal-help', 'keywords', 'Outgoing_link', 'Language',
                      'legal_position']
mobile = pd.read_csv(
    'Z:\\event_mobile_ppl.tsv',
    sep='\t',
    dtype=dict.fromkeys(mobile_str_columns, str),
    parse_dates=['event_date'],
    index_col='event_id',
)

In [None]:
# Fraction of mobile events that carry a content_section value.
mobile['content_section'].notna().mean()

In [None]:
# Keep only the mobile events that have a content_section. The copy lets a
# later cell add a column without writing into a slice of `mobile`.
mobile_contsect = mobile.loc[mobile['content_section'].notna()].copy()

In [None]:
# For each person, build the set of unordered pairs (a, b) with a < b of the
# distinct content sections that person has events in.
# Sections are de-duplicated before pairing, so the cost depends on the number
# of distinct sections instead of the square of the person's event count.
# The resulting sets are the same as pairing every event with every event.
event_pairs = mobile_contsect.groupby('person_id').agg(
    {'content_section': lambda sections: set(combinations(sorted(set(sections)), 2))})

In [None]:
# Inspect the per-person sets of content-section pairs.
event_pairs

In [None]:
# Flatten the per-person pair sets into one list; a pair appears once for
# every person whose set contains it.
all_pairs = []
for person_pairs in event_pairs['content_section']:
    all_pairs.extend(person_pairs)

In [None]:
from collections import Counter

In [None]:
# Number of mobile events per content section, most frequent first.
section_counts = Counter(mobile_contsect['content_section'])
section_counts.most_common()

In [None]:
# Tally how many people contributed each section pair.
pair_counts = Counter()
pair_counts.update(all_pairs)

In [None]:
# List every section pair with its count, highest count first.
pair_counts.most_common()

In [None]:
# Attach a (content_section, event_date) tuple to every event so both values
# travel together through the per-person aggregation below.
mobile_contsect['content_date'] = list(
    zip(mobile_contsect['content_section'], mobile_contsect['event_date']))

In [None]:
# Mobile events whose content section is anything other than 'Procedure'.
not_procedure = mobile_contsect.loc[mobile_contsect['content_section'].ne('Procedure')]

In [None]:
# Repeat the unordered-pair count on the mobile events without 'Procedure'.
# Sections are de-duplicated per person before pairing, so the work depends on
# the number of distinct sections, not on the square of the event count.
# The per-person pair sets are unchanged.
event_pairs = not_procedure.groupby('person_id').agg(
    {'content_section': lambda sections: set(combinations(sorted(set(sections)), 2))})
# One list entry per person per pair, then count people per pair.
all_pairs = [pair for person_pairs in event_pairs['content_section'] for pair in person_pairs]
pair_counts = Counter(all_pairs)
pair_counts.most_common()

In [None]:
# For each person, collect the ordered pairs (first, second) of different
# content sections where some event in `second` is dated more than 24 hours
# after some event in `first`.
min_gap = pd.Timedelta('24:00:00')
event_date = mobile_contsect.groupby('person_id').agg({
    'content_date': lambda events: {
        (first_section, second_section)
        for first_section, first_time in events
        for second_section, second_time in events
        if (second_time - first_time > min_gap) and (first_section != second_section)
    }
})

In [None]:
# Count, for every ordered section pair, how many people's sets contain it.
pair_causal_counts = Counter(
    pair
    for person_pairs in event_date['content_date']
    for pair in person_pairs
)

In [None]:
# List every ordered section pair with its count, highest count first.
pair_causal_counts.most_common()

In [None]:
# Same ordered-pair extraction (gap of more than 24 hours), restricted to the
# mobile events that are not in the 'Procedure' section.
min_gap = pd.Timedelta('24:00:00')
event_date = not_procedure.groupby('person_id').agg({
    'content_date': lambda events: {
        (first_section, second_section)
        for first_section, first_time in events
        for second_section, second_time in events
        if (second_time - first_time > min_gap) and (first_section != second_section)
    }
})

In [None]:
# Count people per ordered section pair and list the pairs by descending count.
pair_causal_counts = Counter(
    pair
    for person_pairs in event_date['content_date']
    for pair in person_pairs
)
pair_causal_counts.most_common()

In [None]:
# Ordered section pairs for non-'Procedure' mobile events, now requiring the
# second event to be dated more than 7 days (7 x 24 hours) after the first.
min_gap = 7 * pd.Timedelta('24:00:00')
event_date = not_procedure.groupby('person_id').agg({
    'content_date': lambda events: {
        (first_section, second_section)
        for first_section, first_time in events
        for second_section, second_time in events
        if (second_time - first_time > min_gap) and (first_section != second_section)
    }
})

In [None]:
# Count people per ordered section pair for the 7-day gap and list the pairs
# by descending count.
pair_causal_counts = Counter(
    pair
    for person_pairs in event_date['content_date']
    for pair in person_pairs
)
pair_causal_counts.most_common()

In [None]:
# Turn the 7-day mobile pair counts into a DataFrame: the Counter keys (the
# section pairs) become the row index and the counts fill a single column.
mobile_counts_7d = pd.DataFrame.from_dict(pair_causal_counts, orient='index')

In [None]:
# Name the single count column so it stays distinguishable after the join
# with the business counts.
mobile_counts_7d.columns = ['counts_mobile']

In [None]:
# Load the desktop business event log (tab-separated). The listed columns are
# forced to str, event_date is parsed as a datetime and event_id becomes the
# index.
business_str_columns = ['account_id', 'content_section.1', 'download_link', 'role',
                        'get-legal-help', 'keywords', 'Outgoing_link', 'Language',
                        'legal_position']
business = pd.read_csv(
    'Z:\\event_desktop_business_ppl.tsv',
    sep='\t',
    dtype=dict.fromkeys(business_str_columns, str),
    parse_dates=['event_date'],
    index_col='event_id',
)

In [None]:
# Keep only the business events that have a content_section (copied so a
# column can be added later), then show what share of all business events
# that is.
business_contsect = business.loc[business['content_section'].notna()].copy()
len(business_contsect) / len(business)

In [None]:
# Number of business events per content section, most frequent first.
section_counts = Counter(business_contsect['content_section'])
section_counts.most_common()

In [None]:
# Unordered pairs (a, b) with a < b of the distinct content sections each
# person has business events in, counted across people.
# Sections are de-duplicated per person before pairing, so the work depends on
# the number of distinct sections, not on the square of the event count.
# The per-person pair sets are unchanged.
event_pairs = business_contsect.groupby('person_id').agg(
    {'content_section': lambda sections: set(combinations(sorted(set(sections)), 2))})
# One list entry per person per pair, then count people per pair.
all_pairs = [pair for person_pairs in event_pairs['content_section'] for pair in person_pairs]
pair_counts = Counter(all_pairs)
pair_counts.most_common()

In [None]:
# Attach a (content_section, event_date) tuple to every business event so both
# values travel together through the per-person aggregation below.
business_contsect['content_date'] = list(
    zip(business_contsect['content_section'], business_contsect['event_date']))

In [None]:
# Business events whose content section is anything other than 'Procedure'.
business_not_procedure = business_contsect.loc[
    business_contsect['content_section'].ne('Procedure')]

In [None]:
# Repeat the unordered-pair count on the business events without 'Procedure'.
# Sections are de-duplicated per person before pairing, so the work depends on
# the number of distinct sections, not on the square of the event count.
# The per-person pair sets are unchanged.
event_pairs = business_not_procedure.groupby('person_id').agg(
    {'content_section': lambda sections: set(combinations(sorted(set(sections)), 2))})
# One list entry per person per pair, then count people per pair.
all_pairs = [pair for person_pairs in event_pairs['content_section'] for pair in person_pairs]
pair_counts = Counter(all_pairs)
pair_counts.most_common()

In [None]:
# Ordered pairs (first, second) of different content sections per person where
# some event in `second` is dated more than 24 hours after some event in
# `first`, for business events that are not in the 'Procedure' section.
min_gap = pd.Timedelta('24:00:00')
event_date = business_not_procedure.groupby('person_id').agg({
    'content_date': lambda events: {
        (first_section, second_section)
        for first_section, first_time in events
        for second_section, second_time in events
        if (second_time - first_time > min_gap) and (first_section != second_section)
    }
})
# Count people per ordered pair and list the pairs by descending count.
pair_causal_counts = Counter(
    pair
    for person_pairs in event_date['content_date']
    for pair in person_pairs
)
pair_causal_counts.most_common()

In [None]:
# Same ordered-pair extraction for non-'Procedure' business events, now
# requiring the second event to be dated more than 7 days (7 x 24 hours)
# after the first.
min_gap = 7 * pd.Timedelta('24:00:00')
event_date = business_not_procedure.groupby('person_id').agg({
    'content_date': lambda events: {
        (first_section, second_section)
        for first_section, first_time in events
        for second_section, second_time in events
        if (second_time - first_time > min_gap) and (first_section != second_section)
    }
})
# Count people per ordered pair and list the pairs by descending count.
pair_causal_counts = Counter(
    pair
    for person_pairs in event_date['content_date']
    for pair in person_pairs
)
pair_causal_counts.most_common()

In [None]:
# Turn the 7-day business pair counts into a DataFrame: the Counter keys (the
# section pairs) become the row index and the counts fill a single column.
business_counts_7d = pd.DataFrame.from_dict(pair_causal_counts, orient='index')

In [None]:
# Name the single count column so it stays distinguishable after the join
# with the mobile counts.
business_counts_7d.columns = ['counts_business']

In [None]:
# Inspect the 7-day ordered-pair counts for the business data.
business_counts_7d

In [None]:
# Inspect the 7-day ordered-pair counts for the mobile data.
mobile_counts_7d

In [None]:
# Put the mobile and business counts side by side, matched on the pair index.
# The inner join keeps only pairs that occur in both tables.
# NOTE(review): pairs seen in only one of the two datasets are dropped here,
# before the ratios are computed - confirm that this is intended.
counts = mobile_counts_7d.join(business_counts_7d, how='inner')

In [None]:
# Each pair's share of the total mobile count over the joined pairs.
counts['mobile_ratio'] = counts['counts_mobile'].div(counts['counts_mobile'].sum())

In [None]:
# Each pair's share of the total business count over the joined pairs.
counts['business_ratio'] = counts['counts_business'].div(counts['counts_business'].sum())

In [None]:
# Mobile share minus business share: positive means the pair is relatively
# more common in the mobile data. The column must be read as counts['diff'],
# because the attribute `counts.diff` is the DataFrame method.
counts['diff'] = counts['mobile_ratio'].sub(counts['business_ratio'])

In [None]:
# Show the pairs ordered from most business-leaning (lowest diff) to most
# mobile-leaning (highest diff).
counts.sort_values('diff', ascending=True)

In [None]:
# Write the comparison table (counts, ratios and diff) to a tab-separated file.
diff_counts_path = 'Z:\\diff_counts.tsv'
counts.to_csv(diff_counts_path, sep='\t')