In [2]:
# Configuration properties of Apache Spark
#sc.stop()
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp, from_unixtime

APP_NAME = 'pyspark_python'
MASTER = 'local[*]'

# Build the configuration in one chained expression, then obtain (or reuse)
# the session and expose its SparkContext as `sc`.
conf = SparkConf().setAppName(APP_NAME).setMaster(MASTER)
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

In [3]:
%load_ext autoreload
%autoreload 2
# load my own functions
from utils.complete_missing_dates import *
from utils.partitions import *

In [4]:
import pyspark.sql.functions as psf
from pyspark.sql import Window
from datetime import datetime, timedelta
from itertools import product

In [5]:
from utils.basic_df_analysis import *

## Create example data

In [6]:
# Sample readings as (person, timestamp, weight) tuples. One reading for
# person 1 appears twice and one weight for person 2 is missing.
data = (
    (1, datetime(2019, 12, 2, 14, 54, 17), 49.94),
    (1, datetime(2019, 11, 3, 8, 58, 39), 50.49),
    (1, datetime(2019, 8, 6, 10, 44, 1), 50.24),
    (2, datetime(2019, 8, 2, 8, 58, 39), 62.32),
    (1, datetime(2019, 11, 3, 8, 58, 39), 50.49),  # repeat of the second row
    (2, datetime(2019, 5, 4, 10, 44, 1), None),  # missing weight
)
df = spark.createDataFrame(data=data, schema=("person", "timestamp", "weight"))

In [7]:
# Print the rows of the example DataFrame.
df.show()

+------+-------------------+------+
|person|          timestamp|weight|
+------+-------------------+------+
|     1|2019-12-02 14:54:17| 49.94|
|     1|2019-11-03 08:58:39| 50.49|
|     1|2019-08-06 10:44:01| 50.24|
|     2|2019-08-02 08:58:39| 62.32|
|     1|2019-11-03 08:58:39| 50.49|
|     2|2019-05-04 10:44:01|  null|
+------+-------------------+------+



In [8]:
# Null counts for the selected columns. The timestamp column is left out on
# purpose: per the original note, passing it to this helper raises an error.
count_nulls_by_column(df, ['person', 'weight']).show()

+------+------+------+------+
|person|weight|person|weight|
+------+------+------+------+
|     2| 62.32|     0|     0|
|     1| 50.24|     0|     0|
|     1| 49.94|     0|     0|
|     1| 50.49|     0|     0|
|     2|  null|     0|     1|
+------+------+------+------+



### check for duplicates

In [9]:
# Report duplicated (person, weight) combinations: the helper prints a message
# plus a table of the duplicated values with their occurrence counts.
check_duplicates(df, ['person', 'weight'])

Data has duplicates:
+------+------+-----+
|person|weight|count|
+------+------+-----+
|1     |50.49 |2    |
+------+------+-----+



In [10]:
# Percentage of rows in which `person` and `weight` hold the same value.
# Uses the `psf` alias imported explicitly in this notebook instead of the
# bare `sf` / `col` names, which are not imported explicitly anywhere above
# this cell (they could only arrive through the `utils` star imports).
total_rows = df.count()
(df.withColumn("cw", psf.when(psf.col('person') == psf.col('weight'), 1).otherwise(0))
 .agg({'cw': 'sum'})
 .withColumn('Similarity(%)', 100 * psf.col('sum(cw)') / total_rows)
 .select('Similarity(%)')
 .show())

+-------------+
|Similarity(%)|
+-------------+
|          0.0|
+-------------+



In [11]:
# Distinct `person` values that do not occur among the `weight` values.
persons = df.select('person')
weights = df.select('weight')
persons.subtract(weights).show()

+------+
|person|
+------+
|   1.0|
|   2.0|
+------+



In [12]:
# Number of rows whose `person` value equals its `weight` value.
df.filter(df['person'] == df['weight']).count()

0

In [13]:
import pandas as pd
# Two small pandas frames that share the column names c2 and c3.
df1 = pd.DataFrame(
    {
        'c1': [1, 4, 7],
        'c2': [2, 5, 1],
        'c3': [3, 1, 1],
    }
)
df2 = pd.DataFrame(
    {
        'c4': [1, 4, 7],
        'c2': [3, 5, 2],
        'c3': [3, 7, 5],
    }
)
# Values of column c2 that appear in both frames.
set(df1['c2']) & set(df2['c2'])

{2, 5}

In [14]:
# Show the first pandas frame (columns c1, c2, c3).
df1

Unnamed: 0,c1,c2,c3
0,1,2,3
1,4,5,1
2,7,1,1


In [15]:
# Show the second pandas frame (columns c4, c2, c3).
df2

Unnamed: 0,c4,c2,c3
0,1,3,3
1,4,5,7
2,7,2,5


In [16]:
# Flag each c2 value of df1 that also occurs in df2's c2, then tally the flags.
c2_is_shared = df1['c2'].isin(df2['c2'])
c2_is_shared.value_counts()

True     2
False    1
Name: c2, dtype: int64

One idea would be to transpose the DataFrames and check whether any rows are duplicated.

In [17]:
# Point both names at the same Spark DataFrame `df`; this replaces the pandas
# frames that were held under `df1` and `df2` until now.
df1 = df2 = df

In [18]:
from pyspark.sql.functions import array, col, explode, struct, lit
def Transposedf(df, by, colheader):
    """Unpivot every column of ``df`` not listed in ``by`` into long format.

    Each input row yields one output row per remaining column, carrying the
    ``by`` columns, a ``Field`` column with the source column's name, and a
    ``colheader`` column with that column's value.

    Args:
        df: DataFrame to reshape; only its ``dtypes`` and ``select`` are used.
        by: Name, or list/tuple of names, of the key columns kept as-is.
        colheader: Name given to the output column that holds the values.

    Returns:
        DataFrame with the columns ``by + ["Field", colheader]``.

    Raises:
        ValueError: If no columns remain once ``by`` is excluded.
        AssertionError: If the remaining columns do not share one dtype.
    """
    # Normalise `by` to a list: a bare string would otherwise be treated as a
    # sequence of characters, and a tuple cannot be concatenated with a list.
    by = [by] if isinstance(by, str) else list(by)

    # Filter dtypes and split into column names and type description
    value_columns = [(c, t) for (c, t) in df.dtypes if c not in by]
    if not value_columns:
        raise ValueError("No columns left to transpose after excluding %r" % (by,))
    cols, dtypes = zip(*value_columns)

    # Spark SQL supports only homogeneous columns
    assert len(set(dtypes)) == 1, "All columns have to be of the same type"

    # Create and explode an array of (column_name, column_value) structs
    kvs = explode(array([
        struct(lit(c).alias("Field"), col(c).alias(colheader)) for c in cols
    ])).alias("kvs")

    return df.select(by + [kvs]).select(by + ["kvs.Field", "kvs." + colheader])

In [19]:
def Compare_df(df_Expected, df_Actual):
    """Return the fields whose actual value differs from the expected value.

    Both inputs are expected in long format with the columns ``id``, ``Field``
    and a value column (``Expected_value`` / ``Actual_value`` respectively).

    Args:
        df_Expected: DataFrame with columns ``id``, ``Field``, ``Expected_value``.
        df_Actual: DataFrame with columns ``id``, ``Field``, ``Actual_value``.

    Returns:
        DataFrame with columns ``id``, ``Field``, ``Actual_value`` and
        ``Expected_value``, one row per mismatching (id, Field) pair.
    """
    # Same key and field on both sides, but a different value.
    # NOTE(review): with `!=`, pairs in which either value is null presumably
    # evaluate to null and drop out of the join - confirm whether null
    # mismatches should be reported as well.
    mismatch_condition = (
        (df_Actual.id == df_Expected.id)
        & (df_Actual.Field == df_Expected.Field)
        & (df_Actual.Actual_value != df_Expected.Expected_value)
    )
    df_combined = (
        df_Actual
        .join(df_Expected, mismatch_condition)
        # Select the same `id` key the join uses; the previous
        # `account_unique_id` is not a column of the long-format inputs.
        .select([df_Actual.id, df_Actual.Field, df_Actual.Actual_value, df_Expected.Expected_value])
    )
    return df_combined

In [21]:
#df11=Transposedf(df1, ["id"],'Actual_value')

In [23]:
#df11=Transposedf(df1, ["id"],'Actual_value')
#df_Expected=Transposedf(df_Expected, ["id"],'Expected_value')

In [24]:
#Compare the expected and actual
#df_result=Compare_df(df_Expected,df_Actual)

In [None]:
#reduce: apply a function to several columns
#df_fixed = (reduce(lambda df, col_name: df.withColumn(col_name, sf.abs(col(col_name))),
#                           _FIXED_COSTS_INPUT,
#                           df_fixed))