## Spark SQL

### Block diagram - Double Click to expand
<!--    
+----------------------------------------------+
|                  User Program                |
+----------------------------------------------+
                        |
                        v
+----------------+    +-----------------------+
|    Data Source  |    |      SparkSession      |
+----------------+    +-----------------------+
                        |
                        v
+----------------+    +-----------------------+
|  JDBC/ODBC API |    | Spark SQL Engine / API |
+----------------+    +-----------------------+
                        |
                        v
+----------------+    +-----------------------+
|     Dataset    |    |    DataFrame / SQL     |
+----------------+    +-----------------------+
                        |
                        v
+----------------+    +-----------------------+
|    Catalyst     |    |  Spark Core / Cluster  |
|    Optimizer    |    +-----------------------+
+----------------+               |
                                 v
+----------------------------------------------+
|                    RDDs                      |
+----------------------------------------------+

 -->

In [1]:
!pip install findspark
import findspark
findspark.init()



In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
spark=SparkSession.builder.appName('SparkSQL_UseCase').master('local[2]').getOrCreate()

In [4]:
spark

In [5]:
rangeDF=spark.range(100).toDF('number')

In [6]:
rangeDF

DataFrame[number: bigint]

In [7]:
rangeDF.show(4)

+------+
|number|
+------+
|     0|
|     1|
|     2|
|     3|
+------+
only showing top 4 rows



In [8]:
rangeDF.count()

100

In [9]:
evenDF=rangeDF.where('number%2==0')

In [10]:
evenDF

DataFrame[number: bigint]

In [11]:
evenDF.show(5)

+------+
|number|
+------+
|     0|
|     2|
|     4|
|     6|
|     8|
+------+
only showing top 5 rows



In [12]:
# spark.createDataFrame function

nameDF=spark.createDataFrame([[1,'Alice',30,'Female'],
                              [2,'Beneth',30,'Male'],
                              [3,'Charlie',30,'Male'],
                              [4,'Dharan',30,'Male']],['Id','Name','Age','Gender'])

In [13]:
nameDF.show(3)

+---+-------+---+------+
| Id|   Name|Age|Gender|
+---+-------+---+------+
|  1|  Alice| 30|Female|
|  2| Beneth| 30|  Male|
|  3|Charlie| 30|  Male|
+---+-------+---+------+
only showing top 3 rows



In [14]:
# constructing DataFrame from RDD

sc=spark.sparkContext

In [15]:
sc

In [16]:
# infer schema by using RDD

import os
os.getcwd()

'C:\\Users\\pdharantej\\OneDrive - ALLEGIS GROUP\\Desktop\\TEK_Training\\5. Data Analysis'

In [17]:
tempRDD=sc.textFile('./temp_data.txt')

In [18]:
tempRDD.count()

13131

In [19]:
type(tempRDD)

pyspark.rdd.RDD

In [20]:
tempRDD.take(3)

['1901\t-78\t1', '1901\t-72\t1', '1901\t-94\t1']

In [21]:
splitRDD=tempRDD.map(lambda record: record.split('\t'))
splitRDD.take(3)

[['1901', '-78', '1'], ['1901', '-72', '1'], ['1901', '-94', '1']]

In [22]:
# constructing the RDD using the Row object

from pyspark.sql import Row

In [23]:
schemaRDD=splitRDD.map(lambda line: Row(year=line[0],temp=int(line[1]),status=int(line[2])))

In [24]:
schemaRDD.take(3)

[Row(year='1901', temp=-78, status=1),
 Row(year='1901', temp=-72, status=1),
 Row(year='1901', temp=-94, status=1)]

In [25]:
tempDF=spark.createDataFrame(schemaRDD)
tempDF.show(3)

+----+----+------+
|year|temp|status|
+----+----+------+
|1901| -78|     1|
|1901| -72|     1|
|1901| -94|     1|
+----+----+------+
only showing top 3 rows



In [26]:
tempDF.head(3)

[Row(year='1901', temp=-78, status=1),
 Row(year='1901', temp=-72, status=1),
 Row(year='1901', temp=-94, status=1)]

In [27]:
tempDF.printSchema()

root
 |-- year: string (nullable = true)
 |-- temp: long (nullable = true)
 |-- status: long (nullable = true)



In [28]:
# reading a csv file as an RDD and then building the RDD as a dataframe

# # read test.csv as RDD and convert it to dataframe
testRDD=sc.textFile('./test.csv')
testRDD.count()

233600

In [29]:
testRDD.take(3)

['User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3',
 '1000004,P00128942,M,46-50,7,B,2,1,1,11,',
 '1000009,P00113442,M,26-35,17,C,0,0,3,5,']

In [30]:
header=testRDD.first()

In [31]:
testRDD=testRDD.filter(lambda line: line!=header)

In [32]:
print('After the header record is removed ')
testRDD.first()

After the header record is removed 


'1000004,P00128942,M,46-50,7,B,2,1,1,11,'

In [33]:
# split data based on the separator
splitRDD=testRDD.map(lambda line: line.split(','))
print('After splitting the records are : \n')
splitRDD.take(2)

After splitting the records are : 



[['1000004', 'P00128942', 'M', '46-50', '7', 'B', '2', '1', '1', '11', ''],
 ['1000009', 'P00113442', 'M', '26-35', '17', 'C', '0', '0', '3', '5', '']]

In [34]:
from pyspark.sql.types import *
testRDDSchema=StructType([
    StructField('User_Id', StringType(),True),
    StructField('ProductId', StringType(),True),
    StructField('Gender', StringType(),True),
    StructField('Age', StringType(),True),
    StructField('Occupation', StringType(),True),
    StructField('City_Category', StringType(),True),
    StructField('Stay_In_Current_City_Years', StringType(),True),
    StructField('Marital_Status', StringType(),True),
    StructField('Product_Category_1', StringType(),True),
    StructField('Product_Category_2', StringType(),True),
    StructField('Product_Category_3', StringType(),True),
])

In [35]:
testDF=spark.createDataFrame(data=splitRDD,schema=testRDDSchema)

In [36]:
testDF.show(2)

+-------+---------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+
|User_Id|ProductId|Gender|  Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|
+-------+---------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+
|1000004|P00128942|     M|46-50|         7|            B|                         2|             1|                 1|                11|                  |
|1000009|P00113442|     M|26-35|        17|            C|                         0|             0|                 3|                 5|                  |
+-------+---------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+
only showing top 2 rows



In [37]:
testDF.count()

233599

In [38]:
testSample=testDF.sample(False,0.1,98) # <without_duplication, sample_percentage, seed_value>

In [39]:
testSample.count()

23403

In [40]:
trainDF=spark.read.format('csv').option('header','true').option('inferSchema','true').load('./train.csv')

In [41]:
trainDF

DataFrame[User_ID: int, Product_ID: string, Gender: string, Age: string, Occupation: int, City_Category: string, Stay_In_Current_City_Years: string, Marital_Status: int, Product_Category_1: int, Product_Category_2: int, Product_Category_3: int, Purchase: int]

In [42]:
trainDF.show()

+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
|User_ID|Product_ID|Gender|  Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|
+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
|1000001| P00069042|     F| 0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|
|1000001| P00248942|     F| 0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|
|1000001| P00087842|     F| 0-17|        10|            A|                         2|             0|                12|              null|              null|    1422

In [43]:
trainSample=trainDF.sample(False, 0.1, 192)
trainSample.count()

54729

In [44]:
trainSamplePD=trainSample.toPandas()
print(type(trainSample))
print(type(trainSamplePD))

<class 'pyspark.sql.dataframe.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [45]:
# trainSamplePD = trainSamplePD.set_index('User_ID')

In [46]:
trainSamplePD.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000005,P00031342,M,26-35,20,A,1,1,8,,,6073
1,1000006,P00231342,F,51-55,9,A,1,0,5,8.0,14.0,5378
2,1000006,P00190242,F,51-55,9,A,1,0,4,5.0,,2079
3,1000008,P00220442,M,26-35,12,C,4+,1,5,14.0,,8584
4,1000010,P00297942,F,36-45,1,B,4+,1,8,,,5875


In [47]:
trainSamplePD.to_csv('./2023_train_sample.csv')

In [48]:
trainDF.printSchema()

root
 |-- User_ID: integer (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Occupation: integer (nullable = true)
 |-- City_Category: string (nullable = true)
 |-- Stay_In_Current_City_Years: string (nullable = true)
 |-- Marital_Status: integer (nullable = true)
 |-- Product_Category_1: integer (nullable = true)
 |-- Product_Category_2: integer (nullable = true)
 |-- Product_Category_3: integer (nullable = true)
 |-- Purchase: integer (nullable = true)



In [49]:
trainDF.head(2)

[Row(User_ID=1000001, Product_ID='P00069042', Gender='F', Age='0-17', Occupation=10, City_Category='A', Stay_In_Current_City_Years='2', Marital_Status=0, Product_Category_1=3, Product_Category_2=None, Product_Category_3=None, Purchase=8370),
 Row(User_ID=1000001, Product_ID='P00248942', Gender='F', Age='0-17', Occupation=10, City_Category='A', Stay_In_Current_City_Years='2', Marital_Status=0, Product_Category_1=1, Product_Category_2=6, Product_Category_3=14, Purchase=15200)]

In [50]:
trainDF.show(2)

+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
|User_ID|Product_ID|Gender| Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
|1000001| P00069042|     F|0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|
|1000001| P00248942|     F|0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
only

In [51]:
print('Number of records in Training Dataset {}'.format(trainDF.count()))
print('Number of records in Testing Dataset {}'.format(testDF.count()))

Number of records in Training Dataset 550068
Number of records in Testing Dataset 233599


In [52]:
trainDF.describe().show()

+-------+------------------+----------+------+------+-----------------+-------------+--------------------------+-------------------+------------------+------------------+------------------+-----------------+
|summary|           User_ID|Product_ID|Gender|   Age|       Occupation|City_Category|Stay_In_Current_City_Years|     Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|         Purchase|
+-------+------------------+----------+------+------+-----------------+-------------+--------------------------+-------------------+------------------+------------------+------------------+-----------------+
|  count|            550068|    550068|550068|550068|           550068|       550068|                    550068|             550068|            550068|            376430|            166821|           550068|
|   mean|1003028.8424013031|      null|  null|  null|8.076706879876669|         null|         1.468494139793958|0.40965298835780306| 5.404270017525106| 9.84232925112238

In [53]:
testDF.describe().show()

+-------+------------------+---------+------+------+-----------------+-------------+--------------------------+------------------+------------------+------------------+------------------+
|summary|           User_Id|ProductId|Gender|   Age|       Occupation|City_Category|Stay_In_Current_City_Years|    Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|
+-------+------------------+---------+------+------+-----------------+-------------+--------------------------+------------------+------------------+------------------+------------------+
|  count|            233599|   233599|233599|233599|           233599|       233599|                    233599|            233599|            233599|            233599|            233599|
|   mean|1003029.3568594044|     null|  null|  null|8.085407043694536|         null|        1.4682778997642345|0.4100702485883929| 5.276542279718663| 9.849586059346997|12.669453946534905|
| stddev|  1726.50496799554|     null|  null|  null|6.521146

In [54]:
trainDF.describe('Purchase').show()

+-------+-----------------+
|summary|         Purchase|
+-------+-----------------+
|  count|           550068|
|   mean|9263.968712959126|
| stddev|5023.065393820575|
|    min|               12|
|    max|            23961|
+-------+-----------------+



In [55]:
trainDF.createOrReplaceTempView('trainDFTable')

In [56]:
trainDF.show(2)

+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
|User_ID|Product_ID|Gender| Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
|1000001| P00069042|     F|0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|
|1000001| P00248942|     F|0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
only

In [57]:
spark.sql("select * from trainDFTable limit 2")

DataFrame[User_ID: int, Product_ID: string, Gender: string, Age: string, Occupation: int, City_Category: string, Stay_In_Current_City_Years: string, Marital_Status: int, Product_Category_1: int, Product_Category_2: int, Product_Category_3: int, Purchase: int]

In [58]:
spark.sql("select * from trainDFTable limit 2").show()

+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
|User_ID|Product_ID|Gender| Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
|1000001| P00069042|     F|0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|
|1000001| P00248942|     F|0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+



In [59]:
dataFrameWay=trainDF.groupBy('Age').count()
dataFrameWay

DataFrame[Age: string, count: bigint]

In [60]:
dataFrameWay.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[Age#183], functions=[count(1)])
   +- Exchange hashpartitioning(Age#183, 200), ENSURE_REQUIREMENTS, [plan_id=485]
      +- HashAggregate(keys=[Age#183], functions=[partial_count(1)])
         +- FileScan csv [Age#183] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/C:/Users/pdharantej/OneDrive - ALLEGIS GROUP/Desktop/TEK_Trainin..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<Age:string>




In [61]:
sqlWay=spark.sql('select Age, count(1) from trainDFTable group by Age')

In [62]:
sqlWay.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[Age#183], functions=[count(1)])
   +- Exchange hashpartitioning(Age#183, 200), ENSURE_REQUIREMENTS, [plan_id=498]
      +- HashAggregate(keys=[Age#183], functions=[partial_count(1)])
         +- FileScan csv [Age#183] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/C:/Users/pdharantej/OneDrive - ALLEGIS GROUP/Desktop/TEK_Trainin..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<Age:string>




In [63]:
# The above two statements give the same output 

In [64]:
from pyspark.sql.functions import expr, col, column
dfWay=trainDF.filter(col('Age')!='0-17').groupBy('Age').count()

In [65]:
dfWay.show()

+-----+------+
|  Age| count|
+-----+------+
|18-25| 99660|
|26-35|219587|
|46-50| 45701|
|51-55| 38501|
|36-45|110013|
|  55+| 21504|
+-----+------+



In [66]:
sqlWay=spark.sql('select Age from trainDFTable')

In [67]:
trainDF.select(expr('User_Id as userID'),col('User_ID'),'User_ID').show(3)

+-------+-------+-------+
| userID|User_ID|User_ID|
+-------+-------+-------+
|1000001|1000001|1000001|
|1000001|1000001|1000001|
|1000001|1000001|1000001|
+-------+-------+-------+
only showing top 3 rows



In [68]:
trainDF.select(expr('User_ID as user')).show(2)

+-------+
|   user|
+-------+
|1000001|
|1000001|
+-------+
only showing top 2 rows



In [69]:
spark.sql('select User_ID as userID from trainDFTable').show(2)

+-------+
| userID|
+-------+
|1000001|
|1000001|
+-------+
only showing top 2 rows



In [70]:
trainDF.selectExpr('User_ID as userId', 'product_ID as productID').show(2)

+-------+---------+
| userId|productID|
+-------+---------+
|1000001|P00069042|
|1000001|P00248942|
+-------+---------+
only showing top 2 rows



In [71]:
trainDF.select('User_ID', 'product_ID', 'Age').show()

+-------+----------+-----+
|User_ID|product_ID|  Age|
+-------+----------+-----+
|1000001| P00069042| 0-17|
|1000001| P00248942| 0-17|
|1000001| P00087842| 0-17|
|1000001| P00085442| 0-17|
|1000002| P00285442|  55+|
|1000003| P00193542|26-35|
|1000004| P00184942|46-50|
|1000004| P00346142|46-50|
|1000004|  P0097242|46-50|
|1000005| P00274942|26-35|
|1000005| P00251242|26-35|
|1000005| P00014542|26-35|
|1000005| P00031342|26-35|
|1000005| P00145042|26-35|
|1000006| P00231342|51-55|
|1000006| P00190242|51-55|
|1000006|  P0096642|51-55|
|1000006| P00058442|51-55|
|1000007| P00036842|36-45|
|1000008| P00249542|26-35|
+-------+----------+-----+
only showing top 20 rows



In [72]:
from pyspark.sql.functions import lit
trainDF.select('*', lit(1).alias('ConstantOne')).show(3)

+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|User_ID|Product_ID|Gender| Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|1000001| P00069042|     F|0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|          1|
|1000001| P00248942|     F|0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|
|1000001| P00087842|     F|0-17|        10|            A|                         2|             0|           

In [73]:
# pyspark datastructures are immutable 

In [74]:
spark.sql('select *, 1 as ConstantOne from trainDFTable limit 3').show()

+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|User_ID|Product_ID|Gender| Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|1000001| P00069042|     F|0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|          1|
|1000001| P00248942|     F|0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|
|1000001| P00087842|     F|0-17|        10|            A|                         2|             0|           

In [75]:
trainDF=trainDF.withColumn('ConstantOne', lit(1))

In [76]:
trainDF.show(3)

+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|User_ID|Product_ID|Gender| Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|1000001| P00069042|     F|0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|          1|
|1000001| P00248942|     F|0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|
|1000001| P00087842|     F|0-17|        10|            A|                         2|             0|           

In [77]:
trainDF.withColumn('OccuopationOne', trainDF.Occupation+1).show()

+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+--------------+
|User_ID|Product_ID|Gender|  Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|OccuopationOne|
+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+--------------+
|1000001| P00069042|     F| 0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|          1|            11|
|1000001| P00248942|     F| 0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|            11|
|1000001| P00087842|     F| 0-

In [78]:
tempDF=trainDF.withColumn('SameCategoryCode',
                          trainDF['Product_Category_1']==trainDF['Product_Category_2'])
tempDF.show(2)

+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+----------------+
|User_ID|Product_ID|Gender| Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|SameCategoryCode|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+----------------+
|1000001| P00069042|     F|0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|          1|            null|
|1000001| P00248942|     F|0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|           false|
+-------+----------+-----

In [79]:
tempDF.filter(col('SameCategoryCode')==False).show(3)

+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+----------------+
|User_ID|Product_ID|Gender|  Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|SameCategoryCode|
+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+----------------+
|1000001| P00248942|     F| 0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|           false|
|1000001| P00085442|     F| 0-17|        10|            A|                         2|             0|                12|                14|              null|    1057|          1|           false|
|1000003| P00193542|

In [80]:
tempDF.withColumnRenamed('SameCategoryCode', 'SimilarCategoryCode').show()

+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+-------------------+
|User_ID|Product_ID|Gender|  Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|SimilarCategoryCode|
+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+-------------------+
|1000001| P00069042|     F| 0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|          1|               null|
|1000001| P00248942|     F| 0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|              false|
|1000

In [81]:
tempDF.drop('SameCategoryCode').show()

+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|User_ID|Product_ID|Gender|  Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|
+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|1000001| P00069042|     F| 0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|          1|
|1000001| P00248942|     F| 0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|
|1000001| P00087842|     F| 0-17|        10|            A|                         2|             0|     

In [82]:
tempDF.printSchema()

root
 |-- User_ID: integer (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Occupation: integer (nullable = true)
 |-- City_Category: string (nullable = true)
 |-- Stay_In_Current_City_Years: string (nullable = true)
 |-- Marital_Status: integer (nullable = true)
 |-- Product_Category_1: integer (nullable = true)
 |-- Product_Category_2: integer (nullable = true)
 |-- Product_Category_3: integer (nullable = true)
 |-- Purchase: integer (nullable = true)
 |-- ConstantOne: integer (nullable = false)
 |-- SameCategoryCode: boolean (nullable = true)



In [83]:
tempDF.withColumn('Purchase', col('Purchase').cast('String')).printSchema()

root
 |-- User_ID: integer (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Occupation: integer (nullable = true)
 |-- City_Category: string (nullable = true)
 |-- Stay_In_Current_City_Years: string (nullable = true)
 |-- Marital_Status: integer (nullable = true)
 |-- Product_Category_1: integer (nullable = true)
 |-- Product_Category_2: integer (nullable = true)
 |-- Product_Category_3: integer (nullable = true)
 |-- Purchase: string (nullable = true)
 |-- ConstantOne: integer (nullable = false)
 |-- SameCategoryCode: boolean (nullable = true)



In [84]:
trainDF.select('Product_ID').distinct().count()

3631

In [85]:
testDF=testDF.withColumn('Product_ID', col('ProductID'))

In [86]:
testDF.select('Product_ID').distinct().count()

3491

In [87]:
diff_cat_in_test_train=testDF.select('Product_ID').subtract(trainDF.select('Product_ID'))
diff_cat_in_test_train.count()

46

In [88]:
diff_cat_in_train_test=trainDF.select('Product_ID').subtract(testDF.select('Product_ID'))
diff_cat_in_train_test.count()

186

In [89]:
trainDF.crosstab('Age', 'Gender').show()

+----------+-----+------+
|Age_Gender|    F|     M|
+----------+-----+------+
|      0-17| 5083| 10019|
|     46-50|13199| 32502|
|     18-25|24628| 75032|
|     36-45|27170| 82843|
|       55+| 5083| 16421|
|     51-55| 9894| 28607|
|     26-35|50752|168835|
+----------+-----+------+



In [90]:
trainDF.crosstab('Gender', 'Age').show()

+----------+-----+-----+------+-----+-----+-----+-----+
|Gender_Age| 0-17|18-25| 26-35|36-45|46-50|51-55|  55+|
+----------+-----+-----+------+-----+-----+-----+-----+
|         F| 5083|24628| 50752|27170|13199| 9894| 5083|
|         M|10019|75032|168835|82843|32502|28607|16421|
+----------+-----+-----+------+-----+-----+-----+-----+



In [91]:
trainDF.groupBy('Age', 'Gender').count().show()

+-----+------+------+
|  Age|Gender| count|
+-----+------+------+
|51-55|     F|  9894|
|18-25|     M| 75032|
| 0-17|     F|  5083|
|46-50|     M| 32502|
|18-25|     F| 24628|
|  55+|     M| 16421|
|  55+|     F|  5083|
|36-45|     M| 82843|
|26-35|     F| 50752|
| 0-17|     M| 10019|
|36-45|     F| 27170|
|51-55|     M| 28607|
|26-35|     M|168835|
|46-50|     F| 13199|
+-----+------+------+



In [92]:
trainDF.groupBy('Gender', 'Age').count().show()

+------+-----+------+
|Gender|  Age| count|
+------+-----+------+
|     F|46-50| 13199|
|     M| 0-17| 10019|
|     M|26-35|168835|
|     M|51-55| 28607|
|     M|18-25| 75032|
|     M|  55+| 16421|
|     F|51-55|  9894|
|     F|36-45| 27170|
|     F|18-25| 24628|
|     F|  55+|  5083|
|     M|36-45| 82843|
|     F| 0-17|  5083|
|     M|46-50| 32502|
|     F|26-35| 50752|
+------+-----+------+



In [93]:
spark.sql('''
    select age, 
    sum(case when gender='F' then 1 else 0 end) as Female,
    sum(case when gender='M' then 1 else 0 end) as Male
    from trainDFTable
    group by age
''').show()

+-----+------+------+
|  age|Female|  Male|
+-----+------+------+
|18-25| 24628| 75032|
|26-35| 50752|168835|
| 0-17|  5083| 10019|
|46-50| 13199| 32502|
|51-55|  9894| 28607|
|36-45| 27170| 82843|
|  55+|  5083| 16421|
+-----+------+------+



In [94]:
trainDF.select('Age', 'Gender').dropDuplicates().show(20)

+-----+------+
|  Age|Gender|
+-----+------+
|51-55|     F|
|18-25|     M|
| 0-17|     F|
|46-50|     M|
|18-25|     F|
|  55+|     M|
|  55+|     F|
|36-45|     M|
|26-35|     F|
| 0-17|     M|
|36-45|     F|
|51-55|     M|
|26-35|     M|
|46-50|     F|
+-----+------+



In [95]:
trainDF.dropna().count()

166821

In [96]:
trainDF.na.drop('any').count()
# any column has the null values, the whole row gets to drop

166821

In [97]:
trainDF.na.drop().count()

166821

In [98]:
trainDF.fillna(-1).show()

+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|User_ID|Product_ID|Gender|  Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|
+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|1000001| P00069042|     F| 0-17|        10|            A|                         2|             0|                 3|                -1|                -1|    8370|          1|
|1000001| P00248942|     F| 0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|
|1000001| P00087842|     F| 0-17|        10|            A|                         2|             0|     

In [99]:
trainDF.fillna().show(2)

TypeError: fillna() missing 1 required positional argument: 'value'

In [100]:
fill_col_values={
    "Gender": 'M',
    'Purchase': 9999999,
    'Product_Category_3': -1
}
trainDF.na.fill(fill_col_values).show(3)

+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|User_ID|Product_ID|Gender| Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|1000001| P00069042|     F|0-17|        10|            A|                         2|             0|                 3|              null|                -1|    8370|          1|
|1000001| P00248942|     F|0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|
|1000001| P00087842|     F|0-17|        10|            A|                         2|             0|           

In [101]:
trainDF.na.replace([''], ['UNKNOWN'], ['Gender']).count()

550068

In [102]:
print('Purchase amount greater than 15000 {}')

Purchase amount greater than 15000 {}


In [103]:
trainDF.where('Purchase > 15000').where('Gender = "F"').count()

21429

In [104]:
trainDF.filter('Purchase > 15000').where('Gender = "F"').count()

21429

In [105]:
trainDF.where((col('Purchase') > 15000) & (col('Gender')=='F')).count()

21429

In [106]:
trainDF.filter((col('Purchase') > 15000) & (col('Gender')=='F')).count()

21429

In [107]:
spark.sql('select * from trainDFTable where Purchase > 15000 and Gender = "F"').count()

21429

In [108]:
from pyspark.sql.functions import countDistinct
trainDF.select(countDistinct('Age').alias('DISTINCT_Age')).show()

+------------+
|DISTINCT_Age|
+------------+
|           7|
+------------+



In [109]:
trainDF.agg(countDistinct('Age').alias('DISTINCT_Age')).show()

+------------+
|DISTINCT_Age|
+------------+
|           7|
+------------+



In [110]:
from pyspark.sql import functions as func

In [111]:
trainDF.show(2)

+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|User_ID|Product_ID|Gender| Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|1000001| P00069042|     F|0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|          1|
|1000001| P00248942|     F|0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+-----------

In [112]:
trainDF.agg(func.min('Age'), func.avg('Purchase')).show()

+--------+-----------------+
|min(Age)|    avg(Purchase)|
+--------+-----------------+
|    0-17|9263.968712959126|
+--------+-----------------+



In [113]:
trainDF.groupBy('Age').count().show()

+-----+------+
|  Age| count|
+-----+------+
|18-25| 99660|
|26-35|219587|
| 0-17| 15102|
|46-50| 45701|
|51-55| 38501|
|36-45|110013|
|  55+| 21504|
+-----+------+



In [114]:
from pyspark.sql.functions import approx_count_distinct

In [115]:
?approx_count_distinct

In [116]:
trainDF.select(approx_count_distinct('Age',0.1)).show()

+--------------------------+
|approx_count_distinct(Age)|
+--------------------------+
|                         7|
+--------------------------+



In [117]:
from pyspark.sql.functions import first, last
trainDF.select(first('Product_ID',True),last('Product_ID',True)).show()

+-----------------+----------------+
|first(Product_ID)|last(Product_ID)|
+-----------------+----------------+
|        P00069042|       P00371644|
+-----------------+----------------+



In [118]:
from pyspark.sql.functions import min, max
trainDF.select(min('Purchase'),max('Purchase')).show()

+-------------+-------------+
|min(Purchase)|max(Purchase)|
+-------------+-------------+
|           12|        23961|
+-------------+-------------+



In [119]:
from pyspark.sql.functions import sum, sum_distinct

In [120]:
trainDF.select(sum('Purchase')).show()

+-------------+
|sum(Purchase)|
+-------------+
|   5095812742|
+-------------+



In [121]:
trainDF.select(sum_distinct('Purchase')).show()

+----------------------+
|sum(DISTINCT Purchase)|
+----------------------+
|             208520914|
+----------------------+



In [122]:
from pyspark.sql.functions import sum, count, avg, expr
trainDF.select(
    count('Purchase').alias('total_transactions'),
    sum('Purchase').alias('total_purchases'),
    avg('Purchase').alias('avg_purchases'),
    expr('mean(Purchase)').alias('mean_purchases')
).selectExpr(
    'total_purchases/total_transactions', 'avg_purchases', 'mean_purchases'
).show()

+--------------------------------------+-----------------+-----------------+
|(total_purchases / total_transactions)|    avg_purchases|   mean_purchases|
+--------------------------------------+-----------------+-----------------+
|                     9263.968712959126|9263.968712959126|9263.968712959126|
+--------------------------------------+-----------------+-----------------+



In [123]:
from pyspark.sql.functions import var_pop, var_samp, stddev_pop, stddev_samp
trainDF.select(var_pop('Purchase'), var_samp('Purchase'), stddev_pop('Purchase'), stddev_samp('Purchase')).show()

+--------------------+-------------------+--------------------+---------------------+
|   var_pop(Purchase)| var_samp(Purchase)|stddev_pop(Purchase)|stddev_samp(Purchase)|
+--------------------+-------------------+--------------------+---------------------+
|2.5231140081385408E7|2.523118595059785E7|   5023.060827959921|    5023.065393820575|
+--------------------+-------------------+--------------------+---------------------+



In [124]:
from pyspark.sql.functions import collect_list, collect_set
trainDF.agg(collect_set('Age'), collect_list('Age')).show()

+--------------------+--------------------+
|    collect_set(Age)|   collect_list(Age)|
+--------------------+--------------------+
|[55+, 51-55, 0-17...|[0-17, 0-17, 0-17...|
+--------------------+--------------------+



In [125]:
spark.sql('select collect_set(Age), collect_list(Age) from trainDFTable').show()

+--------------------+--------------------+
|    collect_set(Age)|   collect_list(Age)|
+--------------------+--------------------+
|[55+, 51-55, 0-17...|[0-17, 0-17, 0-17...|
+--------------------+--------------------+



In [126]:
trainDF.groupBy('Age').agg(count('Purchase').alias('quan'),
                          expr('count(Purchase)')).show()

+-----+------+---------------+
|  Age|  quan|count(Purchase)|
+-----+------+---------------+
|18-25| 99660|          99660|
|26-35|219587|         219587|
| 0-17| 15102|          15102|
|46-50| 45701|          45701|
|51-55| 38501|          38501|
|36-45|110013|         110013|
|  55+| 21504|          21504|
+-----+------+---------------+



In [127]:
trainDF.groupBy('Age').agg({'Purchase':'mean'}).show()

+-----+-----------------+
|  Age|    avg(Purchase)|
+-----+-----------------+
|18-25|9169.663606261289|
|26-35|9252.690632869888|
| 0-17|8933.464640444974|
|46-50|9208.625697468327|
|51-55|9534.808030960236|
|36-45|9331.350694917874|
|  55+|9336.280459449405|
+-----+-----------------+



In [128]:
trainDF.groupBy('Age').agg({'Purchase':'sum'}).show()

+-----+-------------+
|  Age|sum(Purchase)|
+-----+-------------+
|18-25|    913848675|
|26-35|   2031770578|
| 0-17|    134913183|
|46-50|    420843403|
|51-55|    367099644|
|36-45|   1026569884|
|  55+|    200767375|
+-----+-------------+



In [129]:
exprs={x: 'sum' for x in trainDF.columns}
trainDF.groupBy('Age').agg(exprs).show()

+-----+------------------+-----------------------+-------------------+-------------+------------+---------------+-------------------------------+-----------------------+--------+----------------+-----------+-----------------------+---------------+
|  Age|sum(City_Category)|sum(Product_Category_3)|sum(Marital_Status)|sum(Purchase)|sum(User_ID)|sum(Occupation)|sum(Stay_In_Current_City_Years)|sum(Product_Category_1)|sum(Age)|sum(ConstantOne)|sum(Gender)|sum(Product_Category_2)|sum(Product_ID)|
+-----+------------------+-----------------------+-------------------+-------------+------------+---------------+-------------------------------+-----------------------+--------+----------------+-----------+-----------------------+---------------+
|18-25|              null|                 388041|              21116|    913848675| 99939196632|         671348|                       116997.0|                 509371|    null|           99660|       null|                 654936|           null|
|26-35| 

In [130]:
### Joins

person=spark.createDataFrame(
    [
        (0,'Sundar Pichai',0,[250,100]),
        (1,'Sergery',1,[500,250,100]),
        (2,'William Oak',2,[100])
    ]
).toDF('id', 'name', 'graduate_program', 'role_status')


In [131]:
person

DataFrame[id: bigint, name: string, graduate_program: bigint, role_status: array<bigint>]

In [132]:
person.show()

+---+-------------+----------------+---------------+
| id|         name|graduate_program|    role_status|
+---+-------------+----------------+---------------+
|  0|Sundar Pichai|               0|     [250, 100]|
|  1|      Sergery|               1|[500, 250, 100]|
|  2|  William Oak|               2|          [100]|
+---+-------------+----------------+---------------+



In [133]:
graduateProgram = spark.createDataFrame(
    [
        (0, 'MBA', 'School of MBA', 'Penn State University'),
        (1, 'Ph.D', 'Computer Science', 'Stanford University'),
        (2, 'Ph.D', 'School of Information', 'Oklhama University')
    ]
).toDF('id', 'degree', 'dept', 'school')

In [134]:
graduateProgram

DataFrame[id: bigint, degree: string, dept: string, school: string]

In [135]:
graduateProgram.show(truncate=False)

+---+------+---------------------+---------------------+
|id |degree|dept                 |school               |
+---+------+---------------------+---------------------+
|0  |MBA   |School of MBA        |Penn State University|
|1  |Ph.D  |Computer Science     |Stanford University  |
|2  |Ph.D  |School of Information|Oklhama University   |
+---+------+---------------------+---------------------+



In [136]:
roleStatus=spark.createDataFrame(
    [
        (500, 'President'),
        (250, 'Founder'),
        (100, 'Director')
    ]
).toDF('id', 'status')

In [137]:
roleStatus

DataFrame[id: bigint, status: string]

In [138]:
roleStatus.show(truncate=False)

+---+---------+
|id |status   |
+---+---------+
|500|President|
|250|Founder  |
|100|Director |
+---+---------+



In [139]:
# tempView is for one spark session
# globalView is when you have multiple spark sessions
person.createOrReplaceTempView('personTbl')
graduateProgram.createOrReplaceTempView('graduateProgramTbl')
roleStatus.createOrReplaceTempView('roleStatusTbl')

In [140]:
joinExpression=person['graduate_program'] == graduateProgram['id']
person.join(graduateProgram, joinExpression).show(truncate=False)

+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|id |name         |graduate_program|role_status    |id |degree|dept                 |school               |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|0  |Sundar Pichai|0               |[250, 100]     |0  |MBA   |School of MBA        |Penn State University|
|1  |Sergery      |1               |[500, 250, 100]|1  |Ph.D  |Computer Science     |Stanford University  |
|2  |William Oak  |2               |[100]          |2  |Ph.D  |School of Information|Oklhama University   |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+



In [141]:
spark.sql('''
    select * from personTbl join graduateProgramTbl
    on personTbl.graduate_program=graduateProgramTbl.id
''').show(truncate=False)

+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|id |name         |graduate_program|role_status    |id |degree|dept                 |school               |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|0  |Sundar Pichai|0               |[250, 100]     |0  |MBA   |School of MBA        |Penn State University|
|1  |Sergery      |1               |[500, 250, 100]|1  |Ph.D  |Computer Science     |Stanford University  |
|2  |William Oak  |2               |[100]          |2  |Ph.D  |School of Information|Oklhama University   |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+



In [142]:
joinType='inner'
person.join(graduateProgram, joinExpression, joinType).show(truncate=False)

+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|id |name         |graduate_program|role_status    |id |degree|dept                 |school               |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|0  |Sundar Pichai|0               |[250, 100]     |0  |MBA   |School of MBA        |Penn State University|
|1  |Sergery      |1               |[500, 250, 100]|1  |Ph.D  |Computer Science     |Stanford University  |
|2  |William Oak  |2               |[100]          |2  |Ph.D  |School of Information|Oklhama University   |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+



In [143]:
joinType='outer'
person.join(graduateProgram, joinExpression, joinType).show(truncate=False)

+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|id |name         |graduate_program|role_status    |id |degree|dept                 |school               |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|0  |Sundar Pichai|0               |[250, 100]     |0  |MBA   |School of MBA        |Penn State University|
|1  |Sergery      |1               |[500, 250, 100]|1  |Ph.D  |Computer Science     |Stanford University  |
|2  |William Oak  |2               |[100]          |2  |Ph.D  |School of Information|Oklhama University   |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+



In [144]:
joinType='left_outer'
person.join(graduateProgram, joinExpression, joinType).show(truncate=False)

+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|id |name         |graduate_program|role_status    |id |degree|dept                 |school               |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|0  |Sundar Pichai|0               |[250, 100]     |0  |MBA   |School of MBA        |Penn State University|
|1  |Sergery      |1               |[500, 250, 100]|1  |Ph.D  |Computer Science     |Stanford University  |
|2  |William Oak  |2               |[100]          |2  |Ph.D  |School of Information|Oklhama University   |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+



In [145]:
joinType='right_outer'
person.join(graduateProgram, joinExpression, joinType).show(truncate=False)

+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|id |name         |graduate_program|role_status    |id |degree|dept                 |school               |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|0  |Sundar Pichai|0               |[250, 100]     |0  |MBA   |School of MBA        |Penn State University|
|1  |Sergery      |1               |[500, 250, 100]|1  |Ph.D  |Computer Science     |Stanford University  |
|2  |William Oak  |2               |[100]          |2  |Ph.D  |School of Information|Oklhama University   |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+



In [146]:
from pyspark.sql.functions import expr
person.withColumnRenamed('id', 'personId').join(roleStatus, expr('array_contains(role_status, id)')).show(truncate=False)

+--------+-------------+----------------+---------------+---+---------+
|personId|name         |graduate_program|role_status    |id |status   |
+--------+-------------+----------------+---------------+---+---------+
|0       |Sundar Pichai|0               |[250, 100]     |250|Founder  |
|0       |Sundar Pichai|0               |[250, 100]     |100|Director |
|1       |Sergery      |1               |[500, 250, 100]|500|President|
|1       |Sergery      |1               |[500, 250, 100]|250|Founder  |
|1       |Sergery      |1               |[500, 250, 100]|100|Director |
|2       |William Oak  |2               |[100]          |100|Director |
+--------+-------------+----------------+---------------+---+---------+



In [147]:
spark.sql('''
    select * from 
    (select id as personId, name, graduate_program, role_status from personTbl)
    inner join roleStatusTbl on array_contains(role_status, id)
''').show(truncate=False)

+--------+-------------+----------------+---------------+---+---------+
|personId|name         |graduate_program|role_status    |id |status   |
+--------+-------------+----------------+---------------+---+---------+
|0       |Sundar Pichai|0               |[250, 100]     |250|Founder  |
|0       |Sundar Pichai|0               |[250, 100]     |100|Director |
|1       |Sergery      |1               |[500, 250, 100]|500|President|
|1       |Sergery      |1               |[500, 250, 100]|250|Founder  |
|1       |Sergery      |1               |[500, 250, 100]|100|Director |
|2       |William Oak  |2               |[100]          |100|Director |
+--------+-------------+----------------+---------------+---+---------+



In [148]:
trainDF.count()

550068

In [149]:
sample1=trainDF.sample(False, 0.1, 1234)
sample2=trainDF.sample(False, 0.1, 2345)

In [150]:
sample1.count()

55488

In [151]:
sample2.count()

54712

In [152]:
splitDF=trainDF.randomSplit([0.7, 0.3], seed=8787)

In [153]:
type(splitDF)

list

In [154]:
splitDF

[DataFrame[User_ID: int, Product_ID: string, Gender: string, Age: string, Occupation: int, City_Category: string, Stay_In_Current_City_Years: string, Marital_Status: int, Product_Category_1: int, Product_Category_2: int, Product_Category_3: int, Purchase: int, ConstantOne: int],
 DataFrame[User_ID: int, Product_ID: string, Gender: string, Age: string, Occupation: int, City_Category: string, Stay_In_Current_City_Years: string, Marital_Status: int, Product_Category_1: int, Product_Category_2: int, Product_Category_3: int, Purchase: int, ConstantOne: int]]

In [155]:
trainDF.select('User_ID').rdd.map(lambda x: (x, 1)).take(5)

[(Row(User_ID=1000001), 1),
 (Row(User_ID=1000001), 1),
 (Row(User_ID=1000001), 1),
 (Row(User_ID=1000001), 1),
 (Row(User_ID=1000002), 1)]

In [156]:
trainDF.orderBy(trainDF.Purchase.desc()).show()

+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|User_ID|Product_ID|Gender|  Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|
+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|1002272| P00052842|     M|26-35|         0|            C|                         1|             0|                10|                15|              null|   23961|          1|
|1003160| P00052842|     M|26-35|        17|            C|                         3|             0|                10|                15|              null|   23961|          1|
|1001474| P00052842|     M|26-35|         4|            A|                         2|             1|     

In [157]:
not_count_cat = diff_cat_in_test_train.rdd.map(lambda x: x[0]).collect()

In [158]:
not_count_cat

['P00300142',
 'P00077642',
 'P00092742',
 'P00082142',
 'P00062542',
 'P00013042',
 'P00279042',
 'P00227242',
 'P00359842',
 'P00061642',
 'P0099542',
 'P00306842',
 'P00140842',
 'P00165542',
 'P00268942',
 'P00236842',
 'P00172942',
 'P00012642',
 'P00336842',
 'P00105742',
 'P00309842',
 'P00100242',
 'P00315342',
 'P00168242',
 'P00156942',
 'P00039042',
 'P00056942',
 'P00322642',
 'P00249942',
 'P00294942',
 'P00106242',
 'P00239542',
 'P00074942',
 'P00030342',
 'P00063942',
 'P00042642',
 'P00322842',
 'P00038942',
 'P00270342',
 'P00312642',
 'P00166542',
 'P00082642',
 'P00253842',
 'P00062242',
 'P00058842',
 'P00204642']

In [159]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

Function1=udf(lambda x: '-1' if x in not_count_cat else x, StringType())

In [160]:
k=testDF.withColumn('NewProductId', Function1(testDF['ProductId'])).select('NewProductId')

In [161]:
k.where(k['NewProductId']==-1).show(3)

+------------+
|NewProductId|
+------------+
|          -1|
|          -1|
|          -1|
+------------+
only showing top 3 rows



In [162]:
from pyspark.sql.functions import lit, round, bround
trainDF.select(round(lit('2.5')), bround(lit('2.5'))).show(2)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 2 rows



In [163]:
spark.sql('select round(2.9), round(2.5), bround(2.4), bround(2.9)').show(3)

+-------------+-------------+--------------+--------------+
|round(2.9, 0)|round(2.5, 0)|bround(2.4, 0)|bround(2.9, 0)|
+-------------+-------------+--------------+--------------+
|            3|            3|             2|             3|
+-------------+-------------+--------------+--------------+



In [164]:
spark.sql('select round(2.5), bround(2.5)').show(2)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|            3|             2|
+-------------+--------------+



In [165]:
# String Manipulations

from pyspark.sql.functions import lit, ltrim, rtrim, lpad, rpad, trim
trainDF.select(
    ltrim(lit(" HELLO ")).alias('ltrim'),
    rtrim(lit(" HELLO ")).alias('rtrim'),
    trim(lit(" HELLO ")).alias('trim'),
    lpad(lit("HELLO"),7," ").alias('lpad'),
    rpad(lit("HELLO"),7," ").alias('rpad')
).show(2)

+------+------+-----+-------+-------+
| ltrim| rtrim| trim|   lpad|   rpad|
+------+------+-----+-------+-------+
|HELLO | HELLO|HELLO|  HELLO|HELLO  |
|HELLO | HELLO|HELLO|  HELLO|HELLO  |
+------+------+-----+-------+-------+
only showing top 2 rows



In [166]:
spark.sql('''
    select 
    ltrim('  HELLO  '),
    rtrim('  HELLO  '),
    trim('  HELLO  '),
    lpad('HELLO',3," "),
    rpad('HELLO',10," ")
    from trainDFTable
''').show(3)

+----------------+----------------+---------------+-----------------+------------------+
|ltrim(  HELLO  )|rtrim(  HELLO  )|trim(  HELLO  )|lpad(HELLO, 3,  )|rpad(HELLO, 10,  )|
+----------------+----------------+---------------+-----------------+------------------+
|         HELLO  |           HELLO|          HELLO|              HEL|        HELLO     |
|         HELLO  |           HELLO|          HELLO|              HEL|        HELLO     |
|         HELLO  |           HELLO|          HELLO|              HEL|        HELLO     |
+----------------+----------------+---------------+-----------------+------------------+
only showing top 3 rows



In [167]:
from pyspark.sql.functions import regexp_replace
regex_string='F|M'

trainDF.select(
    regexp_replace(col('Gender'), regex_string, 'MALE_OR_FEMALE').alias('GENDER_DECODE'), col('GENDER')
).show(5)

+--------------+------+
| GENDER_DECODE|GENDER|
+--------------+------+
|MALE_OR_FEMALE|     F|
|MALE_OR_FEMALE|     F|
|MALE_OR_FEMALE|     F|
|MALE_OR_FEMALE|     F|
|MALE_OR_FEMALE|     M|
+--------------+------+
only showing top 5 rows



In [184]:
spark.sql('''
    select 
    regexp_replace(Gender, "F|M", "MALE_OR_FEMALE") as DECODE_GENDER, Gender from trainDFTable
''').show(4)

+--------------+------+
| DECODE_GENDER|Gender|
+--------------+------+
|MALE_OR_FEMALE|     F|
|MALE_OR_FEMALE|     F|
|MALE_OR_FEMALE|     F|
|MALE_OR_FEMALE|     F|
+--------------+------+
only showing top 4 rows



In [196]:
from pyspark.sql.functions import translate

trainDF.select(
    translate(col("Gender"), "FM", "01"),
    col("Gender")
).show(10)

+-------------------------+------+
|translate(Gender, FM, 01)|Gender|
+-------------------------+------+
|                        0|     F|
|                        0|     F|
|                        0|     F|
|                        0|     F|
|                        1|     M|
|                        1|     M|
|                        1|     M|
|                        1|     M|
|                        1|     M|
|                        1|     M|
+-------------------------+------+
only showing top 10 rows



In [185]:
from pyspark.sql.functions import translate

trainDF.select(
    translate(col("Gender"), "FM", "01").alias("DECODE_GENDER"),
    col("Gender")
).show(10)


+-------------+------+
|DECODE_GENDER|Gender|
+-------------+------+
|            0|     F|
|            0|     F|
|            0|     F|
|            0|     F|
|            1|     M|
|            1|     M|
|            1|     M|
|            1|     M|
|            1|     M|
|            1|     M|
+-------------+------+
only showing top 10 rows



In [170]:
from pyspark.sql.functions import current_date,current_timestamp
df_date =spark.range(10).withColumn("today",current_date()).withColumn("now",current_timestamp())
df_date.show(3,truncate=False)



+---+----------+--------------------------+
|id |today     |now                       |
+---+----------+--------------------------+
|0  |2023-04-13|2023-04-13 10:29:18.837382|
|1  |2023-04-13|2023-04-13 10:29:18.837382|
|2  |2023-04-13|2023-04-13 10:29:18.837382|
+---+----------+--------------------------+
only showing top 3 rows



In [206]:
df_date.createOrReplaceTempView('dataDFTable')

In [207]:
from pyspark.sql.functions import col
from pyspark.sql.functions import date_add,date_sub
df_date.select(date_sub(col("today"),5),date_add(col("today"),5)).show(5)

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
+------------------+------------------+
only showing top 5 rows



In [208]:
spark.sql('''
    select 
    date_sub(today, 5),
    date_add(today, 5)
    from dataDFTable
''').show()

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
+------------------+------------------+



In [209]:
from pyspark.sql.functions import datediff, months_between, to_date
df_date.withColumn('week_ago', date_sub(col('today'),7)).select(datediff(col('week_ago'),col('today'))).show(4)

+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
|                       -7|
|                       -7|
|                       -7|
+-------------------------+
only showing top 4 rows



In [210]:
# same as above.
df_date.select(
to_date(lit("2019-02-20")).alias("start"),
to_date(lit("2020-12-10")).alias("end"))\
.select(months_between(col("start"),col("end"))).show(5)

+--------------------------------+
|months_between(start, end, true)|
+--------------------------------+
|                    -21.67741935|
|                    -21.67741935|
|                    -21.67741935|
|                    -21.67741935|
|                    -21.67741935|
+--------------------------------+
only showing top 5 rows



In [213]:

spark.sql('''
    select 
    to_date('2019-02-20'), months_between('2019-02-20', '2020-11-10'), datediff('2020-02-20', '2020-11-10')
    from dataDFTable
''').show(5)

+-------------------+--------------------------------------------+--------------------------------+
|to_date(2019-02-20)|months_between(2019-02-20, 2020-11-10, true)|datediff(2020-02-20, 2020-11-10)|
+-------------------+--------------------------------------------+--------------------------------+
|         2019-02-20|                                -20.67741935|                            -264|
|         2019-02-20|                                -20.67741935|                            -264|
|         2019-02-20|                                -20.67741935|                            -264|
|         2019-02-20|                                -20.67741935|                            -264|
|         2019-02-20|                                -20.67741935|                            -264|
+-------------------+--------------------------------------------+--------------------------------+
only showing top 5 rows



In [214]:
df_date.select(to_date(lit('2020-20-11')), to_date(lit('2020-11-20'))).show(2)

+-------------------+-------------------+
|to_date(2020-20-11)|to_date(2020-11-20)|
+-------------------+-------------------+
|               null|         2020-11-20|
|               null|         2020-11-20|
+-------------------+-------------------+
only showing top 2 rows



In [215]:
from pyspark.sql.functions import unix_timestamp, from_unixtime
dateformat="yyyy-dd-MM"
cleandateDF=spark.range(1).select(to_date(unix_timestamp(lit("2020-12-11"),dateformat).cast("timestamp"))\
.alias("date"),
to_date(unix_timestamp(lit('2020-20-11'),dateformat).cast("timestamp")).alias("date_2"))

In [216]:
cleandateDF.show()

+----------+----------+
|      date|    date_2|
+----------+----------+
|2020-11-12|2020-11-20|
+----------+----------+



In [217]:
cleandateDF.printSchema()

root
 |-- date: date (nullable = true)
 |-- date_2: date (nullable = true)



In [218]:
from pyspark.sql.functions import month, year, dayofweek

cleanDF = cleandateDF.withColumn('month_col', month('date_2')) \
                    .withColumn('year_col', year('date_2')) \
                    .withColumn('dayOftheWeek', dayofweek('date_2'))


In [219]:
cleanDF.show()

+----------+----------+---------+--------+------------+
|      date|    date_2|month_col|year_col|dayOftheWeek|
+----------+----------+---------+--------+------------+
|2020-11-12|2020-11-20|       11|    2020|           6|
+----------+----------+---------+--------+------------+



In [220]:
cleanDF.createOrReplaceTempView('dateTable2')
spark.sql("""
    select
    to_date(cast(unix_timestamp(date_2, 'yyyy-dd-MM') as timestamp)), to_date(date)
    from dateTable2
""").show(2)

+--------------------------------------------------------------+-------------+
|to_date(CAST(unix_timestamp(date_2, yyyy-dd-MM) AS TIMESTAMP))|to_date(date)|
+--------------------------------------------------------------+-------------+
|                                                    2020-11-20|   2020-11-12|
+--------------------------------------------------------------+-------------+



In [221]:
textDF=spark.range(10).withColumn('Description', lit('we have a long sentence to be broken'))

In [222]:
my_collection = 'TEK Systems program is Data Engineering and Analytics'.split(' ')
words = spark.sparkContext.parallelize(my_collection)

In [223]:
words.take(3)

['TEK', 'Systems', 'program']

In [224]:
supplementData = {
    'Systems': 100,
    'program': 250,
    'Data': 225,
    'Engineering': -100,
    'Analytics': 1220
}

In [225]:
suppleBroadCast = spark.sparkContext.broadcast(supplementData)

In [226]:
suppleBroadCast.value

{'Systems': 100,
 'program': 250,
 'Data': 225,
 'Engineering': -100,
 'Analytics': 1220}

In [227]:
words.map(lambda x: (x, suppleBroadCast.value.get(x, 0))).sortBy(lambda wordPair: wordPair[1]).collect()

[('Engineering', -100),
 ('TEK', 0),
 ('is', 0),
 ('and', 0),
 ('Systems', 100),
 ('Data', 225),
 ('program', 250),
 ('Analytics', 1220)]

In [228]:
cwgDF = spark.read.format('csv').option('header', 'true').option('inferSchema', 'true').load('./XXI_Commonwealth_Games.csv')

In [229]:
cwgDF.show(2)

+---+----------+----------+----+------+------+-----+
|Seq|NationCode|NationName|Gold|Silver|Bronze|Total|
+---+----------+----------+----+------+------+-----+
|  1|       AUS| Australia|  60|    45|    46|  151|
|  2|       ENG|   England|  28|    31|    24|   83|
+---+----------+----------+----+------+------+-----+
only showing top 2 rows



In [230]:
cwgDF.printSchema()

root
 |-- Seq: integer (nullable = true)
 |-- NationCode: string (nullable = true)
 |-- NationName: string (nullable = true)
 |-- Gold: integer (nullable = true)
 |-- Silver: integer (nullable = true)
 |-- Bronze: integer (nullable = true)
 |-- Total: integer (nullable = true)



In [231]:
accInd = spark.sparkContext.accumulator(0)

In [232]:
accInd.value

0

In [234]:
def accIndFunc(each_row):
    countryCD=each_row['NationCode']
    list_ctrys=['IND','PAK','SRI','BAN']
    if countryCD in list_ctrys:
        accInd.add(each_row['Total'])

In [235]:
cwgDF.foreach(lambda each_row: accIndFunc(each_row))

In [236]:
accInd.value

38

In [237]:
tcs_bo_df = spark.read.format('csv')\
            .option('header', 'true')\
            .option('inferSchema', 'true')\
            .load('./TCS_BO.csv')

In [239]:
tcs_bo_df.show(5)

+-------------------+---------+---------+---------+---------+---------+------+
|               Date|     Open|     High|      Low|    Close|Adj Close|Volume|
+-------------------+---------+---------+---------+---------+---------+------+
|2002-01-14 00:00:00|38.500000|39.500000|38.062500|38.400002|20.948002| 83688|
|2002-01-15 00:00:00|38.112499|38.724998|37.150002|37.412498|20.409311| 47496|
|2002-01-16 00:00:00|38.049999|38.500000|37.125000|37.700001|20.566145| 51624|
|2002-01-17 00:00:00|36.250000|38.750000|36.250000|38.337502|20.913918| 85840|
|2002-01-18 00:00:00|38.750000|39.974998|38.150002|38.549999|21.029835| 78928|
+-------------------+---------+---------+---------+---------+---------+------+
only showing top 5 rows



In [240]:
tcs_bo_df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Adj Close: string (nullable = true)
 |-- Volume: string (nullable = true)



In [241]:
tcs_bo_df = tcs_bo_df.select(
    col('Date').cast('date'),
    col('Open').cast('double'),
    col('High').cast('double'),
    col('Low').cast('double'),
    col('Close').cast('double'),
    col('Adj Close').cast('double'),
    col('Volume').cast('int'),
)

In [242]:
tcs_bo_df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- Volume: integer (nullable = true)



In [243]:
tcs_bo_df = tcs_bo_df.withColumnRenamed('Adj Close', 'AdjClose')
tcs_bo_df = tcs_bo_df.withColumnRenamed('date', 'StockDate')

In [256]:
# tcs_bo_df.write.format('json').mode('overwrite').save('./tcs_bo.json')

In [257]:
# tcs_bo_df.write.format('parquet').mode('overwrite').save('./tcs_bo.parquet')

In [252]:
tcs_bo_json_df = spark.read.format('json')\
                .option('inferSchema','true')\
                .load('./part-00000-0f4cd396-8c43-4cb5-aef4-809b7d2f5536-c000.json')

In [253]:
tcs_bo_json_df.show(4)

+---------+---------+---------+---------+---------+----------+------+
| AdjClose|    Close|     High|      Low|     Open|Stock_Date|Volume|
+---------+---------+---------+---------+---------+----------+------+
|20.948002|38.400002|     39.5|  38.0625|     38.5|2002-01-14| 83688|
|20.409311|37.412498|38.724998|37.150002|38.112499|2002-01-15| 47496|
|20.566145|37.700001|     38.5|   37.125|38.049999|2002-01-16| 51624|
|20.913918|38.337502|    38.75|    36.25|    36.25|2002-01-17| 85840|
+---------+---------+---------+---------+---------+----------+------+
only showing top 4 rows



In [254]:
tcs_bo_parquet_df = spark.read.format('parquet')\
                .option('inferSchema','true')\
                .load('./part-00000-bfe36681-0e42-4820-8ed1-58d386cab0a8-c000.snappy.parquet')

In [255]:
tcs_bo_parquet_df.show(4)

+----------+---------+---------+---------+---------+---------+------+
|Stock_Date|     Open|     High|      Low|    Close| AdjClose|Volume|
+----------+---------+---------+---------+---------+---------+------+
|2002-01-14|     38.5|     39.5|  38.0625|38.400002|20.948002| 83688|
|2002-01-15|38.112499|38.724998|37.150002|37.412498|20.409311| 47496|
|2002-01-16|38.049999|     38.5|   37.125|37.700001|20.566145| 51624|
|2002-01-17|    36.25|    38.75|    36.25|38.337502|20.913918| 85840|
+----------+---------+---------+---------+---------+---------+------+
only showing top 4 rows

