In [50]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

def read_csv(file_url):
  """Read the CSV at ``file_url`` through the notebook's ``spark`` session.

  The ``header`` option is switched on, so the first line of the file
  supplies the column names.

  Args:
    file_url: Location of the CSV file, passed straight to the Spark reader.

  Returns:
    Whatever the Spark reader's ``csv`` call returns for that location.
  """
  header_aware_reader = spark.read.option('header', True)
  return header_aware_reader.csv(file_url)

# Load the two state-income CSV files (variants "a" and "b") that the
# comparison cells below diff against each other.
file_a = read_csv('gs://artifacts.gcp-cloud-datalab.appspot.com/state-income-a.csv')
file_b = read_csv('gs://artifacts.gcp-cloud-datalab.appspot.com/state-income-b.csv')

In [88]:
import uuid
from pyspark.sql.functions import hash, when, col, lit

def compare(left, right, key, cols='*'):
  """Compare two DataFrames row by row and label every row with the outcome.

  Rows are paired on ``key`` with a full outer join. On each side a hash of
  the ``cols`` columns is computed, and the two hashes decide the verdict.

  Args:
    left: First DataFrame.
    right: Second DataFrame. Its ``columns`` list must equal ``left.columns``
      (same names in the same order).
    key: Name of the join column, or a list/tuple of names. A bare string is
      treated as ONE column name rather than being unpacked per character.
    cols: Columns whose values are compared: a list or tuple of names, or a
      single column name given as a string. The default ``'*'`` (and any
      other value, e.g. ``None``) compares every column of ``left``.

  Returns:
    The rows of ``left`` unioned with the rows of ``right``, each carrying
    two extra columns:
      * ``COMPARISON``: ``ONLY_LEFT``, ``ONLY_RIGHT``, ``DIFFERENT`` or
        ``IDENTICAL``.
      * ``SIDE``: ``LEFT`` or ``RIGHT``, telling which input the row is from.

  Raises:
    AttributeError: If ``left.columns`` and ``right.columns`` differ.
  """
  if left.columns != right.columns:
    raise AttributeError('The columns are different in the left and right dataframes.')

  # A bare string must become a one-element list: unpacking it with `*` in
  # select() below would otherwise ask for one column per character.
  key = [key] if isinstance(key, str) else list(key)

  # Honour tuples and single column names instead of silently falling back to
  # "all columns"; '*' and anything unrecognised keep the original fallback.
  if isinstance(cols, (list, tuple)):
    cols = list(cols)
  elif isinstance(cols, str) and cols != '*':
    cols = [cols]
  else:
    cols = list(left.columns)

  # Random prefix so the temporary helper columns cannot clash with real ones.
  unique_id = uuid.uuid1().hex
  left_hash = unique_id + '_left'
  right_hash = unique_id + '_right'
  comp_result = unique_id + '_result'
  comp_side = unique_id + '_side'

  def hashed(frame, hash_col):
    # Reduce a frame to its key columns plus one hash over the compared
    # columns. `hash` is the function imported from pyspark.sql.functions.
    return frame.withColumn(hash_col, hash(*cols)).select(*key, hash_col)

  # One verdict per key. After the full outer join a null hash on one side
  # means that side has no row for the key.
  # NOTE(review): assumes `key` identifies at most one row per side; duplicate
  # keys would multiply rows through the joins - confirm with callers.
  # NOTE(review): equal hashes are taken to mean equal values; a hash
  # collision would be reported as IDENTICAL.
  verdicts = (
    hashed(left, left_hash)
      .join(hashed(right, right_hash), on=key, how='full_outer')
      .withColumn(comp_result,
                   when(col(left_hash).isNull(), 'ONLY_RIGHT')
                  .when(col(right_hash).isNull(), 'ONLY_LEFT')
                  .when(col(left_hash) != col(right_hash), 'DIFFERENT')
                  .otherwise('IDENTICAL')
                 )
      .select(*key, comp_result)
  )

  def labelled(frame, side):
    # Attach the verdict to the original rows and record their origin.
    return (
      frame
        .join(verdicts, on=key, how='inner')
        .withColumn(comp_side, lit(side))
    )

  # union() matches columns by position; both sides line up because the
  # column lists were checked for equality and go through the same joins.
  return (
    labelled(left, 'LEFT')
      .union(labelled(right, 'RIGHT'))
      .withColumnRenamed(comp_result, 'COMPARISON')
      .withColumnRenamed(comp_side, 'SIDE')
  )

In [93]:
# Diff the two files, pairing rows on (Rank, State) and comparing only the
# '2014' column.
result = compare(file_a, file_b, ['Rank', 'State'], cols=['2014'])
# Display just the rows whose compared value differs; each such key shows up
# twice, once per SIDE.
result.where(result.COMPARISON == 'DIFFERENT').show()

+----+-----------+------+------+------+------+------+----------+-----+
|Rank|      State|  2014|  2010|  2009|  2007|  2000|COMPARISON| SIDE|
+----+-----------+------+------+------+------+------+----------+-----+
|   4|Connecticut|$65753|$66953|$68460|$81333|$67639| DIFFERENT| LEFT|
|   4|Connecticut|  null|$66953|$68460|$81333|$67639| DIFFERENT|RIGHT|
+----+-----------+------+------+------+------+------+----------+-----+

