In [0]:
# Copyright 2019, The TensorFlow Federated Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import collections
import os
import re
import time
import sys
from IPython import display

# This notebook only supports Python 3; bail out early on older interpreters.
if sys.version_info < (3, 0):
  sys.stdout.write("This notebook requires Python 3.x\n")
  sys.exit(1)

# Directory containing the experiment log files; adjust to your own setup.
LOG_DIR = "/tmp/sc_paper/logs"

def set_matplotlib_params(font_size=8, params=None):
  """Restores matplotlib's defaults, then applies publication-friendly settings.

  Args:
    font_size: Font size used for axis labels, titles, tick labels, the legend
      and general text.
    params: Optional dict of further rcParams that take precedence over the
      settings chosen here, e.g. {'figure.figsize': [7, 5]}.
  """
  # Start from matplotlib's own defaults before layering ours on top.
  matplotlib.rcdefaults()

  # Every text element gets the same font size.
  font_size_keys = (
      'axes.labelsize',
      'axes.titlesize',
      'font.size',
      'legend.fontsize',
      'xtick.labelsize',
      'ytick.labelsize',
  )
  settings = dict.fromkeys(font_size_keys, font_size)
  settings.update({
      'font.family': 'sans-serif',
      # Avoid Type 3 fonts in publication plots (both PDF and PS backends).
      'pdf.fonttype': 42,
      'ps.fonttype': 42,
      'legend.frameon': False,
  })
  # Caller-supplied values win over the defaults above.
  if params:
    settings.update(params)
  matplotlib.rcParams.update(settings)

# Notebook-wide plot defaults: font size 10 plus a few layout overrides.
set_matplotlib_params(
    font_size=10,
    params={
        'legend.handlelength': 3.0,
        'figure.figsize': [6, 4],
        'lines.linewidth': 1.5,
    })

def get_mean_and_std(x):
  """Computes the column-wise mean and standard deviation of a matrix.

  Args:
    x: a list of lists of numbers; all inner lists must have the same length.

  Returns:
    A dict with the keys 'mean' and 'std'. Each value is a plain Python list
    holding the statistic computed along axis 0, i.e. across the outer list.
  """
  stats = {}
  for name, reducer in (('mean', np.mean), ('std', np.std)):
    stats[name] = reducer(x, axis=0).tolist()
  return stats


In [0]:
def find_key_val(lines, key, num_results=1):
  """Gets the value(s) of "key=value" lines from a list of lines.

  Args:
    lines: a list of strings containing key=value pairs.
    key: the prefix to search for, including the trailing '=' (e.g. 'lr=').
    num_results: Expected number of lines starting with `key`.

  Returns:
    If num_results == 1, the text following `key` on the single matching line.
    Otherwise a list with the text following `key` on every matching line, in
    the order in which the lines appear.

  Raises:
    AssertionError: if the number of lines starting with `key` differs from
      num_results.
  """
  # Slice the prefix off rather than calling str.lstrip(key): lstrip treats its
  # argument as a set of characters, so it would also remove leading characters
  # of the value (e.g. 'mode=demo'.lstrip('mode=') == '').
  values = [l[len(key):] for l in lines if l.startswith(key)]
  # Raised explicitly (instead of an assert statement) so the check is kept
  # even when Python runs with assertions disabled.
  if len(values) != num_results:
    raise AssertionError(
        'expected %d line(s) starting with %r, found %d' %
        (num_results, key, len(values)))
  return values[0] if num_results == 1 else values

def parse_logfile(f, results, abort_function):
  """Parses a log file from an experiment run and stores its accuracies.

  Args:
    f: log file path.
    results: existing dictionary where results will be stored. See below for
      details on the contents of this dictionary.
    abort_function: function that takes the log file contents as list of lines
      and returns true if this file - based on the contents - should be ignored.
      This can be used to filter out files that do not match certain criteria,
      e.g. log files that do not contain a certain configuration such as
      "bias=0.5".

  Returns:
    None. If the file is not ignored, a new entry is stored in
    results[replica][learning_rate][mode], where replica, learning_rate and
    mode are read from the "replica=", "lr=" and "mode=" lines of the log file.
    The entry is only stored once the file has been parsed successfully. Its
    contents depend on the mode (all lists are indexed by day):

    for mode==iid (old log format, lines starting with "iid<N>:"):
      entry['avg'] = [...] average over test groups
      entry['raw'][test_group] = [...]
    for mode==iid (new log format, lines starting with "iid "):
      entry['avg'] = [...] average over all (train_group, test_group) pairs
      entry['raw'][train_group][test_group] = [...]
    for mode==sep:
      entry['avg'] = [...] average over matrix diagonal
      entry['raw'][train_group][test_group] = [...]
    for mode==sc:
      entry['sc'] = [...] average over matrix
      entry['pl'] = [...] average over matrix diagonal
      entry['raw'][train_group][test_group] = [...]

  Raises:
    AssertionError: if the file is empty or does not end with 'END_MARKER', if
      a log line reports inconsistent test groups, or (via find_key_val) if a
      configuration key does not occur exactly once.
    ValueError: if the mode is not one of 'iid', 'sc' or 'sep'.
  """

  def get_losses(lines, num_groups, num_examples_per_day_per_group):
    """Extracts losses and corresponding #examples from logs. Ignores epochs.

    Not called by parse_logfile at the moment; kept for ad-hoc analysis.
    """
    num_examples = [[] for _ in range(num_groups)]
    losses = [[] for _ in range(num_groups)]
    pattern = re.compile(r'^day (\d+), group (\d+): trained on (\d+) examples, loss=([\d.]+)$')
    for l in lines:
      m = pattern.search(l)
      if m:
        day = int(m.group(1))
        group = int(m.group(2))
        num_examples[group].append(day * num_examples_per_day_per_group + int(m.group(3)))
        # Group 4 is the loss; group 3 is the number of examples.
        losses[group].append(float(m.group(4)))
    return num_examples, losses

  def get_iid_accuracies_old(lines, num_groups, num_days):
    """Parses old-format iid logs, which carry no training group."""
    pattern = re.compile(r'iid\d+: day (\d+), group (\d+): num_train_examples (\d+) \(dt=\d+s\): num correct: \d+/\d+ \(([\d.]+)\)$')
    # accuracies[test_group][day]; dicts (rather than appending to lists) make
    # the result invariant to the order of lines in the file.
    accuracies = {}
    for l in lines:
      m = pattern.search(l)
      if m:
        day = int(m.group(1))
        test_group = int(m.group(2))
        accuracies.setdefault(test_group, {})[day] = float(m.group(4))
    avg = [np.mean([accuracies[g][d] for g in range(num_groups)]) for d in range(num_days)]
    raw = {g: [accuracies[g][d] for d in range(num_days)] for g in range(num_groups)}
    return {'avg': avg, 'raw': raw}

  def parse_pairwise_accuracies(lines, tag):
    """Returns accuracies[train_group][test_group][day] for '<tag> i on j: ...' lines."""
    pattern = re.compile(
        tag + r' (\d+) on (\d+): day (\d+), group (\d+): '
        r'num_train_examples (\d+) \(dt=\d+s\): '
        r'num correct: \d+/\d+ \(([\d.]+)\)$')
    accuracies = {}
    for l in lines:
      m = pattern.search(l)
      if not m:
        continue
      train_group = int(m.group(1))
      test_group = int(m.group(2))
      day = int(m.group(3))
      # The test group is logged twice on each line; both must agree.
      assert test_group == int(m.group(4))
      accuracies.setdefault(train_group, {}).setdefault(test_group, {})[day] = float(m.group(6))
    return accuracies

  def raw_traces(accuracies):
    """Turns accuracies[trg][tsg][day] into day-ordered lists per (trg, tsg)."""
    # Sorting by day keeps the traces independent of the line order in the file.
    return {
        trg: {tsg: [by_day[d] for d in sorted(by_day)] for tsg, by_day in by_test.items()}
        for trg, by_test in accuracies.items()
    }

  def mean_per_day(accuracies, pairs, num_days):
    """Averages the accuracy over the given (train, test) pairs for every day."""
    return [np.mean([accuracies[trg][tsg][d] for trg, tsg in pairs]) for d in range(num_days)]

  # Open the log file, check for integrity, and extract the configuration from
  # this run (learning rate, replica number, etc.).
  with open(f) as log_file:
    lines = [l.rstrip('\n') for l in log_file]
  assert lines and lines[-1] == 'END_MARKER', 'incomplete log file: %s' % f
  if abort_function(lines):
    return
  learning_rate = float(find_key_val(lines, 'lr='))
  # vocab_size, num_train_examples_per_day and batch_size are not needed for the
  # accuracies, but are still parsed so that a file missing them (or holding a
  # non-integer value) is rejected.
  int(find_key_val(lines, 'vocab_size='))
  mode = find_key_val(lines, 'mode=')
  int(find_key_val(lines, 'num_train_examples_per_day='))
  num_days = int(find_key_val(lines, 'num_days='))
  num_groups = int(find_key_val(lines, 'num_groups='))
  replica = int(find_key_val(lines, 'replica='))
  int(find_key_val(lines, 'batch_size='))

  all_pairs = [(trg, tsg) for trg in range(num_groups) for tsg in range(num_groups)]
  diagonal = [(g, g) for g in range(num_groups)]

  # Extract the results from this run, depending on what mode was used.
  if mode == 'iid':
    if any(l.startswith('iid ') for l in lines):
      accuracies = parse_pairwise_accuracies(lines, 'iid')
      entry = {
          'avg': mean_per_day(accuracies, all_pairs, num_days),
          'raw': raw_traces(accuracies),
      }
    else:
      entry = get_iid_accuracies_old(lines, num_groups, num_days)
  elif mode == 'sc':
    accuracies = parse_pairwise_accuracies(lines, 'sc')
    entry = {
        'raw': raw_traces(accuracies),
        'sc': mean_per_day(accuracies, all_pairs, num_days),
        'pl': mean_per_day(accuracies, diagonal, num_days),
    }
  elif mode == 'sep':
    accuracies = parse_pairwise_accuracies(lines, 'sep')
    entry = {
        'raw': raw_traces(accuracies),
        'avg': mean_per_day(accuracies, diagonal, num_days),
    }
  else:
    raise ValueError('unknown mode %s' % mode)

  results.setdefault(replica, {}).setdefault(learning_rate, {})[mode] = entry


In [0]:
# Parse log files, plot results.
files = sorted(f for f in os.listdir(LOG_DIR) if f.endswith('.log'))
# Only use the results from runs that use a data bias of 0.5.
for bias in [0.5]:
  r = {}
  for f in files:
    parse_logfile(os.path.join(LOG_DIR, f), r,
                  lambda lines: float(find_key_val(lines, 'bias=')) != bias)
  if not r:
    raise ValueError('no log files with bias=%s found in %s' % (bias, LOG_DIR))
  # The lowest-numbered replica (replica 0 when present) serves as the reference
  # for the set of learning rates and for the number of days and groups.
  ref = r[min(r)]
  days = range(1, len(ref[next(iter(ref))]['iid']['avg']) + 1)

  # Holds results with average and std values from across replicas.
  ravg = {}
  for lr in sorted(ref):
    ravg[lr] = {
        'iid': get_mean_and_std([r[rep][lr]['iid']['avg'] for rep in r]),
        'sc': {
            'sc': get_mean_and_std([r[rep][lr]['sc']['sc'] for rep in r]),
            'pl': get_mean_and_std([r[rep][lr]['sc']['pl'] for rep in r]),
        },
        'sep': get_mean_and_std([r[rep][lr]['sep']['avg'] for rep in r]),
    }
    num_groups = len(ref[lr]['sc']['raw'])
    num_days = len(ref[lr]['sc']['raw'][0][0])
    # plot test accuracy as a function of days, for the four different modes.
    plt.figure(figsize=(5,3))
    plt.title('learning_rate: %f' % lr)
    plt.errorbar(days, ravg[lr]['iid']['mean'], ravg[lr]['iid']['std'], label='i.i.d. baseline')
    plt.errorbar(days, ravg[lr]['sc']['sc']['mean'], ravg[lr]['sc']['sc']['std'], label='consensus model')
    plt.errorbar(days, ravg[lr]['sc']['pl']['mean'], ravg[lr]['sc']['pl']['std'], label='pluralistic, 1 SGD chain')
    plt.errorbar(days, ravg[lr]['sep']['mean'], ravg[lr]['sep']['std'], label='pluralistic, %d SGD chains' % num_groups)
    plt.legend(loc='lower right')
    plt.ylim([0.6, 0.8])
    plt.xlabel('day')
    plt.ylabel('test accuracy')
    # Plot test accuracy for the single SGD chain model on two blocks of test
    # data.
    plt.figure(figsize=(5, 3))
    # One x position per (day, training group). Built from an integer range
    # because np.arange with a fractional step may return one element more than
    # expected, which would no longer match the length of the y values.
    cycle_x = 1 + np.arange(num_days * num_groups) / float(num_groups)
    for t in [0, num_groups//2]:
      cycles = get_mean_and_std([[r[rep][lr]['sc']['raw'][trg][t][d] for d in range(num_days) for trg in range(num_groups)] for rep in r])
      plt.errorbar(cycle_x, cycles['mean'], cycles['std'], label="test on %d:00h block" % (t * (24//num_groups)))
    plt.legend(loc='best')
    plt.xlim([1,num_days])
    plt.xlabel('day')
    plt.ylim([0.45, 0.8])
    plt.ylabel('test accuracy')

plt.show()

In [0]:
# Plot results again (don't re-parse log files) as used in the paper. Mostly the
# same as above, but
# - also saves the plots as PDFs,
# - modifies one plot to include a different learning rate (see comment below
#   and in the paper),
# - uses a different layout (no figure title, different figure size).
for bias in [0.5]:
  if not r:
    raise ValueError('no parsed results available; parse the log files first')
  # The lowest-numbered replica (replica 0 when present) serves as the reference
  # for the set of learning rates and for the number of days and groups.
  ref = r[min(r)]
  days = range(1, len(ref[next(iter(ref))]['iid']['avg']) + 1)

  # now compute mean + std for every accuracy trace.
  ravg = {}
  # For the comparison of convergence, we want to use LR=0.464 for idealized,
  # block cyclic consensus, and pluralistic single chain SGD; but for per-component
  # SGD, we want to plot LR=1.0, because that LR works better there (due to fewer
  # examples being processed). So in the LR=0.464 figure the per-component curve
  # is taken from the LR=1.0 runs.
  for lr in sorted(ref, reverse=True):
    ravg[lr] = {
        'iid': get_mean_and_std([r[rep][lr]['iid']['avg'] for rep in r]),
        'sc': {
            'sc': get_mean_and_std([r[rep][lr]['sc']['sc'] for rep in r]),
            'pl': get_mean_and_std([r[rep][lr]['sc']['pl'] for rep in r]),
        },
        'sep': get_mean_and_std([r[rep][lr]['sep']['avg'] for rep in r]),
    }
    num_groups = len(ref[lr]['sc']['raw'])
    num_days = len(ref[lr]['sc']['raw'][0][0])
    # test accuracy for different modes.
    plt.figure(figsize=(5,2.8))
    plt.errorbar(days, ravg[lr]['iid']['mean'], ravg[lr]['iid']['std'], label='idealized i.i.d. SGD')
    plt.errorbar(days, ravg[lr]['sc']['sc']['mean'], ravg[lr]['sc']['sc']['std'], label='block-cyclic consensus SGD')
    plt.errorbar(days, ravg[lr]['sc']['pl']['mean'], ravg[lr]['sc']['pl']['std'], label='pluralistic single chain SGD')
    print(lr)
    # In the plot for LR=0.464, use the per-component SGD results for LR=1.0 instead
    # for a fairer comparison because that LR yielded better results there. The
    # LR=1.0 statistics are computed right here so that this does not depend on
    # the iteration order; without LR=1.0 runs, the plot's own LR is used.
    if lr == 0.464 and 1.0 in ref:
      sep_stats = get_mean_and_std([r[rep][1.0]['sep']['avg'] for rep in r])
    else:
      sep_stats = ravg[lr]['sep']
    plt.errorbar(days, sep_stats['mean'], sep_stats['std'], label='per-component SGD')
    plt.legend(loc='lower right', numpoints=1, fontsize=9)
    plt.ylim([0.6, 0.78])
    plt.xlim([1, num_days])
    plt.xlabel('day')
    plt.ylabel('test accuracy')
    with open(os.path.join(LOG_DIR, "test_lr=%f.pdf" % lr), 'wb') as fout:
      plt.savefig(fout, format="pdf", bbox_inches='tight', transparent=True)
    # plot cycles: from SC, how model at different times performs on same test data
    plt.figure(figsize=(5, 2))
    # One x position per (day, training group). Built from an integer range
    # because np.arange with a fractional step may return one element more than
    # expected, which would no longer match the length of the y values.
    cycle_x = 1 + np.arange(num_days * num_groups) / float(num_groups)
    for t in [0, num_groups//2]:
      cycles = get_mean_and_std([[r[rep][lr]['sc']['raw'][trg][t][d] for d in range(num_days) for trg in range(num_groups)] for rep in r])
      plt.errorbar(cycle_x, cycles['mean'], cycles['std'], label="test on %d:00h block" % (t * (24//num_groups)))
    plt.legend(loc='best', numpoints=1, fontsize=9)
    plt.xlim([1,num_days])
    plt.xlabel('day')
    plt.ylim([0.44, 0.82])
    plt.ylabel('test accuracy')
    with open(os.path.join(LOG_DIR, "cycles_lr=%f.pdf" % lr), 'wb') as fout:
      plt.savefig(fout, format="pdf", bbox_inches='tight', transparent=True)

plt.show()

In [0]:
# Show some stats about the original training data (before we preprocess
# it by adding bias and shuffling).
import csv
import random
import re
from dateutil.parser import parse
import datetime

# Location of the raw training CSV file that is read further down in this cell.
path = '/tmp/sc_paper/raw_data/training.1600000.processed.noemoticon.csv'


def split_line(text):
  """Tokenizes text into words and punctuation marks.

  Args:
    text: the string to split.

  Returns:
    A list of tokens in order of appearance. Each token is either a run of
    word characters and apostrophes, or a single one of . , ! ? ;
    Everything else (e.g. whitespace) is dropped.
  """
  token_pattern = re.compile(r"[\w']+|[.,!?;]")
  return token_pattern.findall(text)

# Read the raw CSV: column 0 is the label ('0' or '4'), column 2 the timestamp
# and column 5 the text. `lines` collects all rows (with label and timestamp
# converted), `unigrams` counts how often every token occurs.
# newline='' is what the csv module expects of files handed to csv.reader.
with open(path, errors='ignore', newline='') as csv_file:
  csv_reader = csv.reader(csv_file, delimiter=',')
  lines = []
  print('reading CSV file')
  unigrams = {}
  label_map = {'0': 0, '4': 1}
  i = 0
  for row in csv_reader:
    if not row:
      # csv.reader yields an empty list for blank lines; nothing to parse.
      continue
    if row[0] not in label_map:
      raise ValueError('Invalid label: %s' % row[0])
    row[0] = label_map[row[0]]
    # The timestamp has six whitespace-separated fields:
    # weekday, month, day, time, time zone name, year.
    # strptime's %Z only accepts UTC, GMT and the time zone names of the local
    # machine, so parsing the zone name would make this cell depend on where it
    # runs. Drop the zone name instead: %Z never attaches a tzinfo, so the
    # resulting naive datetime is unchanged.
    weekday, month, day, clock, _tz_name, year = row[2].split()
    row[2] = datetime.datetime.strptime(
        ' '.join((weekday, month, day, clock, year)), '%a %b %d %H:%M:%S %Y')
    lines.append(row)
    for w in split_line(row[5]):
      unigrams[w] = unigrams.get(w, 0) + 1
    i = i + 1
    if i % 100000 == 0:
      print('read %d rows' % i)

# Plot the number of posts per hour of day, separately for rows labeled 1
# (positive sentiment) and rows labeled 0 (negative sentiment).
hours_pos, hours_neg = [], []
for row in lines:
  if row[0] == 1:
    hours_pos.append(row[2].hour)
  elif row[0] == 0:
    hours_neg.append(row[2].hour)

# One histogram bin per hour.
hourly_bins = {'bins': 24, 'range': [0, 24]}
h_p, b_p = np.histogram(hours_pos, **hourly_bins)
h_n, b_n = np.histogram(hours_neg, **hourly_bins)

plt.figure(figsize=(5, 2.5))
for edges, counts, label in ((b_p, h_p, 'positive sentiment'),
                             (b_n, h_n, 'negative sentiment')):
  # Each count is drawn at the left edge of its bin.
  plt.plot(edges[:-1], counts, '.-', label=label)
plt.legend(loc='best', frameon=True)
plt.xlabel('time of day')
plt.ylabel('# posts')
plt.xlim([0, 23])
plt.ylim(bottom=0)
plt.show()


In [0]:
# Show post counts per block of the day (6 blocks) before and after adding
# bias; this is the figure depicted in the Appendix of the paper.
num_groups = 6

# Raw post counts per block, split by sentiment label.
hours_pos = [row[2].hour for row in lines if row[0] == 1]
hours_neg = [row[2].hour for row in lines if row[0] == 0]
h_p, b_p = np.histogram(hours_pos, bins=num_groups, range=[0,24])
h_n, b_n = np.histogram(hours_neg, bins=num_groups, range=[0,24])

plt.figure(figsize=(5, 2.0))
# Dashed vertical lines mark the left edge of every block.
for g in range(num_groups):
  plt.plot([b_p[g], b_p[g]], [0, max(h_p)], '--', color='gray')
# Counts are drawn at the center of their block.
xshift = (24/num_groups/2)
plt.plot(b_p[:-1]+xshift, h_p, 'b.-', label='raw, positive sentiment')
plt.plot(b_n[:-1]+xshift, h_n, 'g.-', label='raw, negative sentiment')

# Counts after adding bias. The formula is copied from the experiment script,
# since the biased data is not persisted (bias is added at run time for added
# flexibility).
bias = 0.5
biases = np.interp(range(num_groups // 2 + 1), [0, num_groups / 2], [-bias, bias]).tolist()
biases.extend(biases[-2:0:-1])
h_n_biased = []
h_p_biased = []

for g in range(num_groups):
  b = biases[g]
  print(b)
  keep_fraction = 1-abs(b)
  if b < 0:
    # Negative bias: drop abs(b)*100 % of the negative examples.
    h_n_biased.append(h_n[g] * keep_fraction)
    h_p_biased.append(h_p[g])
  else:
    # Otherwise: drop abs(b)*100 % of the positive examples.
    h_p_biased.append(h_p[g] * keep_fraction)
    h_n_biased.append(h_n[g])

plt.plot(b_p[:-1]+xshift, h_p_biased, 'b.--', label='modified, positive')
plt.plot(b_n[:-1]+xshift, h_n_biased, 'g.--', label='modified, negative')
plt.legend(loc='lower center', ncol=2, fontsize = 8, frameon=True)
plt.xlabel('time of day')
plt.ylabel('# posts')
# One tick per block boundary, labeled as a time of day.
ticks = range(0, 24, 24//num_groups)
plt.xticks(ticks, [str(t) + ":00" for t in ticks])
plt.xlim([0, 24])
plt.ylim(bottom=-10000)
with open('/tmp/sentiment140_bias.pdf', 'wb') as f:
  plt.savefig(f, format="pdf", bbox_inches='tight', transparent=True)
plt.show()