In [24]:
import mailbox
import os

import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dateutil.parser import parse as parse_datetime

%matplotlib inline

In [25]:
# load the mbox file
def load_mbox_file(path=""):
    """Open the mbox archive at ``path``, report its size, and return it.

    Parameters
    ----------
    path : str
        Filesystem path of the mbox file.

    Returns
    -------
    mailbox.mbox
        The opened mailbox. It is returned so that later cells can iterate
        over its messages; keeping it in a local variable only would make
        it unreachable once this function exits.
    """
    mbox = mailbox.mbox(path)
    print('There are {:,} messages in the archive.'.format(len(mbox)))
    return mbox

In [26]:
# get a list of the dates/times of all the messages in the mbox
all_dates = []
all_times = []
def get_list_of_dates_time(messages=None):
    """Collect the date and time of every non-chat message.

    The module-level lists ``all_dates`` and ``all_times`` are emptied and
    then refilled, so calling this function again does not double count.

    Parameters
    ----------
    messages : iterable of email messages, optional
        Messages to scan. Defaults to the module-level ``mbox``.

    Returns
    -------
    tuple of (list, list)
        ``(all_dates, all_times)``: the two halves of ``str(datetime)``
        split on the space, one entry per message that has a parseable date.
    """
    source = mbox if messages is None else messages

    # reset in place so other references to the lists stay valid
    all_dates.clear()
    all_times.clear()

    for message in source:
        # it's an email and not a chat if there's no label, or if there's a label but it's not 'Chat'
        labels = message['X-Gmail-Labels']
        if labels is not None and 'Chat' in str(labels):
            continue
        try:
            date, time = str(parse_datetime(message['Date'])).split(' ')
        except (AttributeError, TypeError, ValueError, OverflowError):
            # messages without a usable Date header are skipped; current
            # dateutil raises TypeError (not AttributeError) when given None
            continue
        all_dates.append(date)
        all_times.append(time)

    print('There are {:,} messages with dates.'.format(len(all_dates)))
    return all_dates, all_times

In [27]:
# get the count per date
def get_count(dates=None):
    """Count how many messages fall on each date.

    Parameters
    ----------
    dates : iterable of str, optional
        One date string per message. Defaults to the module-level
        ``all_dates``.

    Returns
    -------
    pandas.Series
        Number of messages per date, indexed by the date string and sorted
        by that index.
    """
    source = all_dates if dates is None else dates
    # dtype=object keeps an empty input from being inferred as float
    date_counts = pd.Series(list(source), dtype=object).value_counts().sort_index()
    print('There are {:,} dates with messages.'.format(len(date_counts)))
    return date_counts

In [28]:
# not every date necessarily has a message, so fill in missing dates in the range with zeros
def fill_missing_dates(counts=None):
    """Insert a zero count for every date in the range that has no messages.

    Parameters
    ----------
    counts : pandas.Series, optional
        Message counts indexed by 'YYYY-MM-DD' strings. Defaults to the
        module-level ``date_counts``.

    Returns
    -------
    pandas.Series
        The counts reindexed over every day from the earliest to the latest
        date in ``counts``; days that were absent get 0. An empty input is
        returned unchanged.
    """
    source = date_counts if counts is None else counts

    if len(source) == 0:
        filled = source
    else:
        # the range is taken from the counts' own index
        every_day = pd.date_range(start=min(source.index), end=max(source.index), freq='D')
        filled = source.reindex(every_day.strftime('%Y-%m-%d'), fill_value=0)

    print('There are {:,} dates total in the range, with or without messages.'.format(len(filled)))
    return filled

In [29]:
# create a series of labels for the plot: each new year's day
def xlabels(counts=None):
    """Build the x-axis labels for the per-day plot: one per New Year's Day.

    Parameters
    ----------
    counts : pandas.Series, optional
        Message counts indexed by 'YYYY-MM-DD' strings. Defaults to the
        module-level ``date_counts``.

    Returns
    -------
    pandas.Series
        The date strings that end in '-01-01', indexed by their integer
        position within ``counts``.
    """
    source = date_counts if counts is None else counts
    # endswith, not a substring test: '01-01' also occurs inside '2001-01-15'
    return pd.Series(
        {position: label for position, label in enumerate(source.index)
         if str(label).endswith('-01-01')},
        dtype=object,
    )

In [30]:
def plot_graph_day(counts=None, tick_labels=None):
    """Plot the number of emails per day as a line chart and save it as a PNG.

    Parameters
    ----------
    counts : pandas.Series, optional
        Message counts indexed by 'YYYY-MM-DD' strings. Defaults to the
        module-level ``date_counts``.
    tick_labels : pandas.Series, optional
        Labels to draw on the x axis, indexed by integer position within
        ``counts``. Defaults to one label for every date ending in '-01-01'.
    """
    if counts is None:
        counts = date_counts
    if tick_labels is None:
        # computed here because the module-level name `xlabels` is a function
        tick_labels = pd.Series(
            {position: label for position, label in enumerate(counts.index)
             if str(label).endswith('-01-01')},
            dtype=object,
        )

    # NOTE(review): ticks_font, label_font and title_font are not defined in
    # the visible cells; confirm they exist before this is called.
    fig, ax = plt.subplots(figsize=[15, 5])
    counts.plot(kind='line', linewidth=1, alpha=0.5, color='m', ax=ax)

    ax.set_ylim(bottom=0)
    ax.grid(True, linestyle='--')
    ax.set_xticks(list(tick_labels.index))
    ax.set_xticklabels(list(tick_labels), rotation=35, rotation_mode='anchor', ha='right', fontproperties=ticks_font)
    ax.set_ylabel('Number of emails', fontproperties=label_font)
    ax.set_title('Gmail traffic per day', fontproperties=title_font)

    fig.tight_layout()
    # savefig does not create missing directories
    os.makedirs('images', exist_ok=True)
    fig.savefig('images/gmail-traffic-day.png', dpi=96)
    plt.show()

In [31]:
# get the count per month
def count_per_month(dates=None):
    """Count how many messages fall in each month.

    Parameters
    ----------
    dates : iterable of str, optional
        One 'YYYY-MM-DD' string per message. Defaults to the module-level
        ``all_dates``.

    Returns
    -------
    pandas.Series
        Number of messages per month, indexed by the date string with its
        last three characters (the '-DD' part) removed, sorted by index.
    """
    source = all_dates if dates is None else dates
    all_months = [day[:-3] for day in source]
    return pd.Series(all_months, dtype=object).value_counts().sort_index()

In [32]:
# not every month necessarily has a message, so fill in missing months in the range with zeros
def fill_missing_month(counts=None):
    """Insert a zero count for every month in the range that has no messages.

    Parameters
    ----------
    counts : pandas.Series, optional
        Message counts indexed by 'YYYY-MM' strings. Defaults to the
        module-level ``month_counts``.

    Returns
    -------
    pandas.Series
        The counts reindexed over every month from the earliest to the
        latest one in ``counts``; absent months get 0. An empty input is
        returned unchanged.
    """
    source = month_counts if counts is None else counts
    if len(source) == 0:
        return source

    # step month by month ('MS' = month start) between first and last month
    first_month = str(min(source.index)) + '-01'
    last_month = str(max(source.index)) + '-01'
    every_month = pd.date_range(start=first_month, end=last_month, freq='MS')
    return source.reindex(every_month.strftime('%Y-%m'), fill_value=0)

In [33]:
# create a series of labels for the plot: each january
def xlabels_month(counts=None):
    """Build the x-axis labels for the per-month plot: one per January.

    Parameters
    ----------
    counts : pandas.Series, optional
        Message counts indexed by 'YYYY-MM' strings. Defaults to the
        module-level ``month_counts``.

    Returns
    -------
    pandas.Series
        The month strings that end in '-01', indexed by their integer
        position within ``counts``.
    """
    source = month_counts if counts is None else counts
    return pd.Series(
        {position: label for position, label in enumerate(source.index)
         if str(label).endswith('-01')},
        dtype=object,
    )

In [34]:
# plot the counts per month
def plot_graph_month(counts=None, tick_labels=None):
    """Plot the number of emails per month as a line chart and save it as a PNG.

    Parameters
    ----------
    counts : pandas.Series, optional
        Message counts indexed by 'YYYY-MM' strings. Defaults to the
        module-level ``month_counts``.
    tick_labels : pandas.Series, optional
        Labels to draw on the x axis, indexed by integer position within
        ``counts``. Defaults to one label for every month ending in '-01'.
    """
    if counts is None:
        counts = month_counts
    if tick_labels is None:
        # computed here because the module-level name `xlabels` is a function
        tick_labels = pd.Series(
            {position: label for position, label in enumerate(counts.index)
             if str(label).endswith('-01')},
            dtype=object,
        )

    # NOTE(review): ticks_font, label_font and title_font are not defined in
    # the visible cells; confirm they exist before this is called.
    fig, ax = plt.subplots(figsize=[10, 5])
    counts.plot(kind='line', linewidth=3, alpha=0.5, color='m', marker='o', markeredgecolor='m', ax=ax)

    ax.set_ylim(bottom=0)
    ax.grid(True, linestyle='--')
    ax.set_xticks(list(tick_labels.index))
    ax.set_xticklabels(list(tick_labels), rotation=35, rotation_mode='anchor', ha='right', fontproperties=ticks_font)
    ax.set_ylabel('Number of emails', fontproperties=label_font)
    ax.set_title('Gmail traffic per month', fontproperties=title_font)

    fig.tight_layout()
    # savefig does not create missing directories
    os.makedirs('images', exist_ok=True)
    fig.savefig('images/gmail-traffic-month.png', dpi=96)
    plt.show()

In [35]:
# get the count per day of the week
def count_per_weekday(counts=None):
    """Average the per-day message counts by day of the week.

    Parameters
    ----------
    counts : pandas.Series, optional
        Message counts indexed by 'YYYY-MM-DD' strings. Defaults to the
        module-level ``date_counts``.

    Returns
    -------
    pandas.Series
        Mean number of messages per weekday, indexed by weekday number
        (0 = Monday ... 6 = Sunday). Only weekdays that occur in ``counts``
        are present.
    """
    source = date_counts if counts is None else counts
    day_counts = pd.DataFrame({
        'count': source.to_numpy(),
        # parse the whole index in one vectorised call
        'day_of_week': pd.to_datetime(source.index).dayofweek,
    })
    return day_counts.groupby('day_of_week')['count'].mean()

In [36]:
def plot_graph_weekday(mean_counts=None):
    """Plot the mean number of emails per weekday as a bar chart and save it as a PNG.

    Parameters
    ----------
    mean_counts : pandas.Series, optional
        Mean message counts indexed by weekday number (0 = Monday ...
        6 = Sunday). Defaults to the module-level ``mean_day_counts``.
    """
    if mean_counts is None:
        mean_counts = mean_day_counts

    weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    # label only the weekdays actually present so bars and labels line up
    tick_labels = [weekday_names[int(day)] for day in mean_counts.index]

    # NOTE(review): ticks_font, label_font and title_font are not defined in
    # the visible cells; confirm they exist before this is called.
    fig, ax = plt.subplots(figsize=[7, 5])
    mean_counts.plot(kind='bar', width=0.6, alpha=0.5, color='#003399', edgecolor='#333333', zorder=2, ax=ax)

    ax.yaxis.grid(True, linestyle='--')
    ax.set_xticklabels(tick_labels, rotation=35, rotation_mode='anchor', ha='right', fontproperties=ticks_font)
    for label in ax.get_yticklabels():
        label.set_fontproperties(ticks_font)

    ax.set_title('Gmail traffic by day of the week', fontproperties=title_font)
    ax.set_xlabel('')
    ax.set_ylabel('Mean number of emails', fontproperties=label_font)

    fig.tight_layout()
    # savefig does not create missing directories
    os.makedirs('images', exist_ok=True)
    fig.savefig('images/gmail-traffic-day-week.png', dpi=96)
    plt.show()

In [37]:
# get the count per hour of the day
def count_per_dayhour(times=None):
    """Count how many messages were sent in each hour of the day.

    Parameters
    ----------
    times : iterable of str, optional
        One time-of-day string per message. Defaults to the module-level
        ``all_times``.

    Returns
    -------
    pandas.Series
        Number of messages per hour, indexed by 'HH:00' strings and sorted
        by that index. Hours with no messages are absent.
    """
    source = all_times if times is None else times
    # dtype=object keeps an empty input from being inferred as float
    hours = pd.Series(list(source), dtype=object).map(
        lambda value: '{:02}:00'.format(parse_datetime(value).hour)
    )
    return hours.value_counts().sort_index()

In [38]:
def plot_graph_dayhour(counts=None):
    """Plot the number of emails per hour of the day as a bar chart and save it as a PNG.

    Parameters
    ----------
    counts : pandas.Series, optional
        Message counts indexed by 'HH:00' strings. Defaults to the
        module-level ``time_counts``.
    """
    if counts is None:
        counts = time_counts

    # NOTE(review): ticks_font, label_font and title_font are not defined in
    # the visible cells; confirm they exist before this is called.
    fig, ax = plt.subplots(figsize=[10, 5])
    counts.plot(kind='bar', width=0.8, alpha=0.5, color='#003399', edgecolor='#333333', zorder=2, ax=ax)

    ax.yaxis.grid(True, linestyle='--')
    ax.set_xticklabels(counts.index, rotation=45, rotation_mode='anchor', ha='right', fontproperties=ticks_font)
    for label in ax.get_yticklabels():
        label.set_fontproperties(ticks_font)

    ax.set_title('Gmail traffic by hour of the day', fontproperties=title_font)
    ax.set_ylabel('Number of emails', fontproperties=label_font)

    fig.tight_layout()
    # savefig does not create missing directories
    os.makedirs('images', exist_ok=True)
    fig.savefig('images/gmail-traffic-hour.png', dpi=96)
    plt.show()

In [39]:
# get the count per minute of the day, as hh:mm
def count_per_dayminute(times=None):
    """Count how many messages were sent in each minute of the day.

    Parameters
    ----------
    times : iterable of str, optional
        One time-of-day string per message. Defaults to the module-level
        ``all_times``.

    Returns
    -------
    pandas.Series
        Number of messages per minute, indexed by 'HH:MM' strings and
        sorted by that index. Minutes with no messages are absent.
    """
    def to_minute(value):
        # parse once and reuse the result for both fields
        parsed = parse_datetime(value)
        return '{:02}:{:02}'.format(parsed.hour, parsed.minute)

    source = all_times if times is None else times
    # dtype=object keeps an empty input from being inferred as float
    minutes = pd.Series(list(source), dtype=object).map(to_minute)
    return minutes.value_counts().sort_index()

In [40]:
# not every minute necessarily has a message, so fill in missing times with zeros
def fill_missing_time(counts=None):
    """Insert a zero count for every minute of the day that has no messages.

    Parameters
    ----------
    counts : pandas.Series, optional
        Message counts indexed by 'HH:MM' strings. Defaults to the
        module-level ``minute_counts``.

    Returns
    -------
    pandas.Series
        The counts reindexed over all 1440 minutes from '00:00' to '23:59';
        minutes that were absent get 0.
    """
    source = minute_counts if counts is None else counts
    every_minute = ['{:02}:{:02}'.format(hour, minute)
                    for hour in range(24) for minute in range(60)]
    return source.reindex(every_minute, fill_value=0)

In [41]:
# create a series of labels for the plot: each new hour
def xlabel_minute(counts=None):
    """Build the x-axis labels for the per-minute plot: one per full hour.

    Parameters
    ----------
    counts : pandas.Series, optional
        Message counts indexed by 'HH:MM' strings. Defaults to the
        module-level ``minute_counts``.

    Returns
    -------
    pandas.Series
        The time strings that end in ':00', indexed by their integer
        position within ``counts``.
    """
    source = minute_counts if counts is None else counts
    return pd.Series(
        {position: label for position, label in enumerate(source.index)
         if str(label).endswith(':00')},
        dtype=object,
    )

In [42]:
# plot the counts per minute
def plot_graph_minute(counts=None, tick_labels=None):
    """Plot the number of emails per minute of the day as a line chart and save it as a PNG.

    Parameters
    ----------
    counts : pandas.Series, optional
        Message counts indexed by 'HH:MM' strings. Defaults to the
        module-level ``minute_counts``.
    tick_labels : pandas.Series, optional
        Labels to draw on the x axis, indexed by integer position within
        ``counts``. Defaults to one label for every time ending in ':00'.
    """
    if counts is None:
        counts = minute_counts
    if tick_labels is None:
        # computed here because the module-level name `xlabels` is a function
        tick_labels = pd.Series(
            {position: label for position, label in enumerate(counts.index)
             if str(label).endswith(':00')},
            dtype=object,
        )

    # NOTE(review): ticks_font, label_font and title_font are not defined in
    # the visible cells; confirm they exist before this is called.
    fig, ax = plt.subplots(figsize=[15, 5])
    counts.plot(kind='line', linewidth=0.7, alpha=0.7, color='m', ax=ax)

    ax.set_ylim(bottom=0)
    ax.grid(True, linestyle='--')
    ax.set_xticks(list(tick_labels.index))
    ax.set_xticklabels(list(tick_labels), rotation=45, rotation_mode='anchor', ha='right', fontproperties=ticks_font)
    ax.set_ylabel('Number of emails', fontproperties=label_font)
    ax.set_title('Gmail traffic by minute of the day', fontproperties=title_font)

    fig.tight_layout()
    # savefig does not create missing directories
    os.makedirs('images', exist_ok=True)
    fig.savefig('images/gmail-traffic-minute.png', dpi=96)
    plt.show()