In [1]:
import pandas as pd
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Pyspark Day 4").getOrCreate()
from pyspark.sql import functions as F

In [46]:
df1 = spark.read.csv("emp1.csv",header=True)
df2 = spark.read.csv("emp2.csv",header=True)
df3 = spark.read.csv("emp3.csv",header=True)

In [16]:
df1.show()

+------+---+
| ename|eno|
+------+---+
|  Amar|  1|
| Akash|  2|
|Pankaj|  3|
| Saran|  4|
|Mangal|  5|
+------+---+



In [17]:
df2.show()

+---+------+
|eno| ename|
+---+------+
|  6|Mahesh|
|  7|Rakesh|
|  8|Charan|
|  4| Saran|
|  5|Mangal|
+---+------+



In [18]:
df3.show()

+---+------+
|eno| ename|
+---+------+
|  1|   Ali|
|  2|Thomas|
|  3|  Guru|
|  4| Saran|
|  5|Mangal|
+---+------+



In [45]:
# union between df2 and df3
df2.union(df3).show()

df2.unionAll(df3).show()




+----+------+
|enum| ename|
+----+------+
| 6.7|Mahesh|
|   7|Rakesh|
|   8|Charan|
|   4| Saran|
|   5|Mangal|
|   1|   Ali|
|   2|Thomas|
|   3|  Guru|
|   4| Saran|
|   5|Mangal|
+----+------+

+----+------+
|enum| ename|
+----+------+
| 6.7|Mahesh|
|   7|Rakesh|
|   8|Charan|
|   4| Saran|
|   5|Mangal|
|   1|   Ali|
|   2|Thomas|
|   3|  Guru|
|   4| Saran|
|   5|Mangal|
+----+------+



# Union ( merging 2 dataframes vertically)

In [30]:
# way 1
df1.select('eno','ename').union(df2.select('eno','ename')).distinct().show()

# way 2
df1.unionByName(df2).distinct().show()

# way 2
df2.unionByName(df1).distinct().show()

+---+------+
|eno| ename|
+---+------+
|  1|  Amar|
|  3|Pankaj|
|  2| Akash|
|  5|Mangal|
|  4| Saran|
|  6|Mahesh|
|  8|Charan|
|  7|Rakesh|
+---+------+

+------+---+
| ename|eno|
+------+---+
| Akash|  2|
| Saran|  4|
|Mangal|  5|
|  Amar|  1|
|Pankaj|  3|
|Rakesh|  7|
|Charan|  8|
|Mahesh|  6|
+------+---+

+---+------+
|eno| ename|
+---+------+
|  6|Mahesh|
|  8|Charan|
|  5|Mangal|
|  4| Saran|
|  7|Rakesh|
|  1|  Amar|
|  3|Pankaj|
|  2| Akash|
+---+------+



# Interection

In [39]:
df2.show()

+----+------+
|enum| ename|
+----+------+
|   6|Mahesh|
|   7|Rakesh|
|   8|Charan|
|   4| Saran|
|   5|Mangal|
+----+------+



In [38]:
df3.show()

+---+------+
|eno| ename|
+---+------+
|  1|   Ali|
|  2|Thomas|
|  3|  Guru|
|  4| Saran|
|  5|Mangal|
+---+------+



In [41]:
df2.intersect(df3).show()

+----+------+
|enum| ename|
+----+------+
|   5|Mangal|
|   4| Saran|
+----+------+



In [42]:
df2.intersectAll(df3).show()

+----+------+
|enum| ename|
+----+------+
|   5|Mangal|
|   4| Saran|
+----+------+



In [49]:
df2.show()
df3.show()

+---+------+
|eno| ename|
+---+------+
|  6|Mahesh|
|  7|Rakesh|
|  8|Charan|
|  4| Saran|
|  5|Mangal|
+---+------+

+---+------+
|eno| ename|
+---+------+
|  1|   Ali|
|  2|Thomas|
|  3|  Guru|
|  4| Saran|
|  5|Mangal|
+---+------+



# minus ( except)

In [52]:
df2.exceptAll(df3).show()

+---+------+
|eno| ename|
+---+------+
|  6|Mahesh|
|  8|Charan|
|  7|Rakesh|
+---+------+



# Compare data between pandas and pyspark dataframes

In [70]:
df_pandas = pd.read_csv("emp.csv")
df_pyspark = spark.read.csv("emp.csv",inferSchema=True,header=True)
df_pyspark = df_pyspark.toPandas()
df_pyspark.dtypes


eno        int32
ename     object
salary     int32
deptno     int32
doj       object
dtype: object

In [59]:
import pytest

In [72]:
df_pyspark.dtypes

eno        int32
ename     object
salary     int32
deptno     int32
doj       object
dtype: object

In [73]:
df_pandas.dtypes

eno        int64
ename     object
salary     int64
deptno     int64
doj       object
dtype: object

In [74]:
assert df_pandas.equals(df_pyspark),"data bwteen frames are not matching"

AssertionError: data bwteen frames are not matching

In [75]:
df_expected = spark.read.option("multiline","true").option("inferSchema","true").json("salary.json")

In [76]:
df_expected.show()

+----------+---+------+
|commission|eno|salary|
+----------+---+------+
|      5000|  1| 60000|
|      7000|  2| 70000|
|      3000|  3| 55000|
|      6000|  4| 72000|
|      2000|  5| 48000|
|      4000|  6| 64000|
|      3500|  7| 58000|
|      4500|  8| 62000|
|      8000|  9| 75000|
|      5500| 10| 67000|
|      6200| 11| 73000|
|      NULL| 12| 49000|
|      NULL| 13| 71000|
|      3100| 14| 54000|
|      2900| 15| 59000|
|      3600| 16| 65000|
|      7500| 17| 77000|
|      2500| 18| 52000|
|      3200| 19| 64000|
|      5000| 20| 60000|
+----------+---+------+

