# Analyze Google Chrome history

Code taken and minimally adapted from the [Analyzing Browser History Using Python and Pandas](https://applecrazy.github.io/blog/posts/analyzing-browser-hist-using-python/) blogpost by __AppleCrazy__.

In [None]:
%matplotlib inline

In [None]:
import os
import pandas as pd
import numpy as np
from IPython.display import display
from urllib.parse import urlparse
import matplotlib.pyplot as plt
import seaborn as sns
sns.set('notebook', style = 'white')

In [None]:
# Path of the exported browser history that the rest of the notebook reads.
HISTORY_FILE = 'data/history.txt'

# Fail early with a clear message when the export is missing. An explicit raise
# is used instead of `assert` because assertions are skipped under `python -O`.
if not os.path.exists(HISTORY_FILE):
    raise FileNotFoundError('History file "{}" does not exist! Please run get_chrome_history.sh'.format(HISTORY_FILE))

In [None]:
def get_history_from_file_as_df(history_file):
    """Load an exported browser history file into a DataFrame.

    Each usable line of the file has the form ``<datetime>|<url>``. Only the
    first pipe separates the two fields, so URLs may contain '|' themselves.
    Blank lines and lines without any pipe are skipped.

    Parameters
    ----------
    history_file : path of the text file to read.

    Returns
    -------
    DataFrame sorted by timestamp with the columns 'datetime' (parsed with
    ``pd.to_datetime``), 'url' and 'domain' (the network location of the URL).
    """
    rows = []
    # Decode explicitly instead of relying on the platform default; undecodable
    # bytes are replaced so that a single stray byte does not abort the load.
    with open(history_file, encoding='utf-8', errors='replace') as f:
        for line in f:
            timestamp, separator, url = line.strip().partition('|')
            if not separator:
                # Blank or malformed line: there is no timestamp/url pair to keep.
                continue
            rows.append((timestamp, url))

    # The timestamps are still strings here, so this is a lexicographic sort.
    data = pd.DataFrame(rows, columns=['datetime', 'url']).sort_values('datetime')

    # Drop entries dated 1601-01-01, which cannot be real visits.
    # NOTE(review): presumably these are records whose stored timestamp is 0
    # (Chrome counts time from 1601-01-01) - confirm against the export script.
    # The copy makes the filtered frame independent, so adding a column below
    # does not write into a slice of the unfiltered frame.
    data = data[data.datetime != '1601-01-01 00:00:00'].copy()

    data['domain'] = data.url.apply(lambda u: urlparse(u).netloc)
    data['datetime'] = pd.to_datetime(data.datetime)
    return data

def get_domain_visit_counts(data):
    """Count how many history entries belong to each domain.

    Parameters
    ----------
    data : frame whose ``domain`` column holds one domain per visit.

    Returns
    -------
    DataFrame with the columns 'domain' and 'count', ordered as produced by
    ``value_counts`` (most visited domain first) and indexed 0..n-1.
    """
    visits_per_domain = data.domain.value_counts()
    # Name the index and the counts explicitly so the resulting columns are
    # exactly 'domain' and 'count', whatever names pandas gave the Series.
    return visits_per_domain.rename_axis('domain').reset_index(name='count')

def plot_domain_visit_counts_as_piechart(site_frequencies, with_labels = True, topN = 20, figsize = (14, 14)):
    """Draw a pie chart of the first ``topN`` rows of ``site_frequencies``.

    Parameters
    ----------
    site_frequencies : DataFrame with the columns 'domain' and 'count'; the
        rows are plotted in the order given, so it should be sorted already.
    with_labels : when true, each wedge is labelled '<domain> (<count>)'.
    topN : number of leading rows to plot.
    figsize : figure size passed to ``plt.subplots``.

    Returns
    -------
    (fig, ax) : the created figure and axes.

    The total shown in the title is summed over ALL rows, while the wedge
    percentages are relative to the plotted rows only.
    """
    fig, ax = plt.subplots(figsize=figsize)
    ax.set_title('Top {} Sites Visited\n({} visits in total)'.format(topN, site_frequencies['count'].sum()))

    # Truncate first so that labels are only formatted for the rows that are
    # actually plotted, not for every domain in the history.
    top_sites = site_frequencies.head(topN)
    pie_data = top_sites['count'].tolist()

    pie_labels = None
    if with_labels:
        pie_labels = ['{} ({})'.format(domain, count) for domain, count in zip(top_sites['domain'], pie_data)]

    ax.pie(pie_data, autopct='%1.1f%%', labels=pie_labels)
    return fig, ax

# Run the pipeline: load the history export, count the visits per domain and
# draw the pie chart with the default settings.
data = get_history_from_file_as_df(HISTORY_FILE)
site_frequencies = get_domain_visit_counts(data)
fig, ax = plot_domain_visit_counts_as_piechart(site_frequencies)

In [None]:
# Summary statistics of the visit timestamps, transposed into a single row for display.
data.datetime.describe().to_frame().T

In [None]:
# Bar chart of the number of visits that fall into each hour of the day.
# (An (hour, minute) breakdown would be:
#  data.datetime.apply(lambda x: (x.hour, x.minute)).value_counts())
visits_per_hour = data.datetime.dt.hour.value_counts().sort_index(ascending = True)

fig, ax = plt.subplots(figsize = (20, 6))
visits_per_hour.plot(kind = 'bar', ax = ax)
ax.set_xlabel('hour of the day')
# Trailing semicolon keeps the notebook from echoing the returned Text object.
ax.set_title('Visits per hour');

In [None]:
import collections

# One [hour, minute, domain] entry per history row, built row by row with apply.
day_time = data.apply(lambda x: [x.datetime.hour, x.datetime.minute, x.domain], axis = 1).values
def time_to_x_y(t, factor = np.pi / 12):
    """Map a time value onto the unit circle, clock-face style.

    ``t * factor`` is the angle in radians, wrapped into [0, 2*pi). With the
    default factor, 24 units of ``t`` make one full turn. The angle is measured
    from the positive y axis towards the positive x axis, so t = 0 lands on
    (0, 1) and increasing ``t`` moves clockwise.

    Returns the pair (x, y) = (sin(angle), cos(angle)).
    """
    theta = (factor * t) % (np.pi * 2)
    return np.sin(theta), np.cos(theta)

# Collect, column by column, the clock position and the domain of every visit.
# The keys are created in the order 'x', 'y', 'domain' on the first visit.
values = collections.defaultdict(list)
for hour, minute, domain in day_time:
    # Time of day as fractional hours, e.g. 13:30 -> 13.5.
    t = hour + minute / 60
    x, y = time_to_x_y(t)
    for column, entry in (('x', x), ('y', y), ('domain', domain)):
        values[column].append(entry)

In [None]:
# Spread the points around their exact clock position so that visits at the
# same time of day do not all end up on top of each other.
variance = 0.3
new_vals = {}
for key, val in values.items():
    if isinstance(val[0], str):
        # Non-numeric column (the domains): keep unchanged, no noise needed.
        new_vals[key] = val
    else:
        # Numeric column (x / y coordinates): add uniform noise of at most +/- variance.
        ran = np.random.uniform(low = -variance, high = variance, size=len(val))
        new_vals[key] = np.array(val) + ran

# One colour per distinct domain, cycling through the 'Paired' palette when
# there are more domains than colours. dict.fromkeys de-duplicates in
# first-seen order, so a given history always yields the same colour mapping
# (the iteration order of a set of strings can change between interpreter runs).
colors = plt.get_cmap('Paired').colors
num_colors = len(colors)
cmap_domain_2_idx = {domain: colors[idx % num_colors] for idx, domain in enumerate(dict.fromkeys(new_vals['domain']))}
domain_colors = [cmap_domain_2_idx[domain] for domain in new_vals['domain']]

fig, ax = plt.subplots(figsize = (10, 10))
pd.DataFrame(new_vals).plot(kind = 'scatter', x = 'x', y = 'y', ax = ax, s = 2, alpha = 0.6, c = domain_colors)

# Bare canvas: no grid, no frame, no axes. grid() expects a boolean; the
# string 'off' is a truthy value and does not reliably mean "no grid".
ax.grid(False)
for spine in ax.spines.values():
    spine.set_visible(False)

ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)

# Label the 24 hours at their exact (noise-free) positions on the clock face.
for hour in range(24):
    x, y = time_to_x_y(hour)
    ax.text(x = x, y = y, s = hour, fontdict={'horizontalalignment': 'center', 'weight': 'bold'}, color = 'red')

ax.set_title('Page visits per hour')

# Lay out once, after every artist has been added, then write the image.
fig.tight_layout()
fig.savefig('data/visits_per_hour.png')