# Analyze Google Chrome history

Code taken and minimally adapted from the [Analyzing Browser History Using Python and Pandas](https://applecrazy.github.io/blog/posts/analyzing-browser-hist-using-python/) blog post by __AppleCrazy__.

In [None]:
%matplotlib inline

In [None]:
import os
import pandas as pd
import numpy as np
from IPython.display import display
from urllib.parse import urlparse
import matplotlib.pyplot as plt
import seaborn as sns
sns.set('notebook')

In [None]:
# Path of the exported Chrome history dump that the cells below read.
HISTORY_FILE = 'data/history.txt'

# Stop right away, with a hint on how to create the file, when the export
# has not been generated yet.
assert os.path.exists(HISTORY_FILE), (
    'History file "{}" does not exist! Please run get_chrome_history.sh'
    .format(HISTORY_FILE)
)

In [None]:
def get_history_from_file_as_df(history_file):
    """Load an exported history file into a DataFrame.

    Each useful line of the file has the form ``<datetime>|<url>``; only the
    first pipe character separates the two fields, so URLs may themselves
    contain ``|``. Lines without any pipe (blank or malformed lines) are
    skipped instead of producing rows that cannot be parsed later.

    Args:
        history_file: Path of the text file to read.

    Returns:
        A DataFrame sorted by timestamp with the columns ``datetime``
        (converted with ``pd.to_datetime``), ``url`` and ``domain`` (the
        network location part of the URL).
    """
    rows = []
    # Decode explicitly instead of relying on the platform default encoding;
    # undecodable bytes are replaced so one bad line cannot abort the load.
    with open(history_file, encoding='utf-8', errors='replace') as f:
        for line in f:
            # Strip whitespace, then split on the first pipe character only.
            timestamp, separator, url = line.strip().partition('|')
            if not separator:
                # No pipe at all: not a "datetime|url" record.
                continue
            rows.append((timestamp, url))

    data = pd.DataFrame(rows, columns=['datetime', 'url']).sort_values('datetime')

    # Entries stamped 1601-01-01 are export artifacts, not real visits
    # (presumably an unset timestamp rendered at the start of Chrome's time
    # scale). `.copy()` makes the filtered result an independent frame so the
    # column assignments below do not trigger SettingWithCopyWarning.
    data = data[data['datetime'] != '1601-01-01 00:00:00'].copy()

    data['domain'] = data['url'].apply(lambda u: urlparse(u).netloc)
    data['datetime'] = pd.to_datetime(data['datetime'])
    return data

def get_domain_visit_counts(data):
    """Count how often each domain occurs in ``data``.

    Args:
        data: DataFrame with a ``domain`` column.

    Returns:
        A DataFrame with the columns ``domain`` and ``count``, one row per
        distinct domain, ordered from most to least frequent.
    """
    visits_per_domain = data.domain.value_counts()
    # Label the index so that it becomes the 'domain' column, and store the
    # tallies under 'count', when the Series is flattened into a frame.
    return visits_per_domain.rename_axis('domain').reset_index(name='count')

def plot_domain_visit_counts_as_piechart(site_frequencies, with_labels = True, topN = 20, figsize = (10, 10)):
    """Draw a pie chart of the most visited domains.

    Args:
        site_frequencies: DataFrame with the columns ``domain`` and
            ``count``; its first ``topN`` rows are plotted, so it is
            expected to be ordered by descending count already.
        with_labels: When true, every wedge is labelled
            ``"<domain> (<count>)"``; otherwise wedges carry no label.
        topN: Number of leading rows to include in the chart.
        figsize: Figure size handed to ``plt.subplots``.

    Returns:
        The ``(fig, ax)`` pair holding the chart.
    """
    fig, ax = plt.subplots(figsize=figsize)
    ax.set_title('Top {} Sites Visited'.format(topN))

    # Cut down to the plotted rows first so labels are only built for them.
    top_sites = site_frequencies.head(topN)
    pie_data = top_sites['count'].tolist()

    pie_labels = None
    if with_labels:
        # The count has to be read with item access: attribute access
        # (`row.count`) would resolve to the pandas `count` method instead.
        pie_labels = [
            '{} ({})'.format(domain, count)
            for domain, count in zip(top_sites['domain'], top_sites['count'])
        ]

    ax.pie(pie_data, autopct='%1.1f%%', labels=pie_labels)
    return fig, ax

# Load the exported history into a DataFrame (datetime, url and domain columns).
data = get_history_from_file_as_df(HISTORY_FILE)
# Tally visits per domain into a frame with 'domain' and 'count' columns.
site_frequencies = get_domain_visit_counts(data)
# Plot with the function's defaults: labelled wedges, top 20 sites, 10x10 figure.
fig, ax = plot_domain_visit_counts_as_piechart(site_frequencies)
