## Spark SQL

### Block diagram - Double Click to expand
<!--    
+----------------------------------------------+
|                  User Program                |
+----------------------------------------------+
                        |
                        v
+----------------+    +-----------------------+
|    Data Source  |    |      SparkSession      |
+----------------+    +-----------------------+
                        |
                        v
+----------------+    +-----------------------+
|  JDBC/ODBC API |    | Spark SQL Engine / API |
+----------------+    +-----------------------+
                        |
                        v
+----------------+    +-----------------------+
|     Dataset    |    |    DataFrame / SQL     |
+----------------+    +-----------------------+
                        |
                        v
+----------------+    +-----------------------+
|    Catalyst     |    |  Spark Core / Cluster  |
|    Optimizer    |    +-----------------------+
+----------------+               |
                                 v
+----------------------------------------------+
|                    RDDs                      |
+----------------------------------------------+

 -->

In [1]:
!pip install findspark
import findspark
findspark.init()



In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
spark=SparkSession.builder.appName('SparkSQL_UseCase').master('local[2]').getOrCreate()

In [4]:
spark

In [5]:
rangeDF=spark.range(100).toDF('number')

In [6]:
rangeDF

DataFrame[number: bigint]

In [7]:
rangeDF.show(4)

+------+
|number|
+------+
|     0|
|     1|
|     2|
|     3|
+------+
only showing top 4 rows



In [8]:
rangeDF.count()

100

In [9]:
evenDF=rangeDF.where('number%2==0')

In [10]:
evenDF

DataFrame[number: bigint]

In [11]:
evenDF.show(5)

+------+
|number|
+------+
|     0|
|     2|
|     4|
|     6|
|     8|
+------+
only showing top 5 rows



In [12]:
# spark.createDataFrame function

nameDF=spark.createDataFrame([[1,'Alice',30,'Female'],
                              [2,'Beneth',30,'Male'],
                              [3,'Charlie',30,'Male'],
                              [4,'Dharan',30,'Male']],['Id','Name','Age','Gender'])

In [13]:
nameDF.show(3)

+---+-------+---+------+
| Id|   Name|Age|Gender|
+---+-------+---+------+
|  1|  Alice| 30|Female|
|  2| Beneth| 30|  Male|
|  3|Charlie| 30|  Male|
+---+-------+---+------+
only showing top 3 rows



In [14]:
# constructing DataFrame from RDD

sc=spark.sparkContext

In [15]:
sc

In [16]:
# infer schema by using RDD

import os
os.getcwd()

'C:\\Users\\pdharantej\\OneDrive - ALLEGIS GROUP\\Desktop\\TEK_Training\\5. Data Analysis'

In [17]:
tempRDD=sc.textFile('./temp_data.txt')

In [18]:
tempRDD.count()

13131

In [19]:
type(tempRDD)

pyspark.rdd.RDD

In [20]:
tempRDD.take(3)

['1901\t-78\t1', '1901\t-72\t1', '1901\t-94\t1']

In [21]:
splitRDD=tempRDD.map(lambda record: record.split('\t'))
splitRDD.take(3)

[['1901', '-78', '1'], ['1901', '-72', '1'], ['1901', '-94', '1']]

In [22]:
# constructing the RDD using the Row object

from pyspark.sql import Row

In [23]:
schemaRDD=splitRDD.map(lambda line: Row(year=line[0],temp=int(line[1]),status=int(line[2])))

In [24]:
schemaRDD.take(3)

[Row(year='1901', temp=-78, status=1),
 Row(year='1901', temp=-72, status=1),
 Row(year='1901', temp=-94, status=1)]

In [25]:
tempDF=spark.createDataFrame(schemaRDD)
tempDF.show(3)

+----+----+------+
|year|temp|status|
+----+----+------+
|1901| -78|     1|
|1901| -72|     1|
|1901| -94|     1|
+----+----+------+
only showing top 3 rows



In [26]:
tempDF.head(3)

[Row(year='1901', temp=-78, status=1),
 Row(year='1901', temp=-72, status=1),
 Row(year='1901', temp=-94, status=1)]

In [27]:
tempDF.printSchema()

root
 |-- year: string (nullable = true)
 |-- temp: long (nullable = true)
 |-- status: long (nullable = true)



In [28]:
# reading a csv file as an RDD and then building the RDD as a dataframe

# # read test.csv as RDD and convert it to dataframe
testRDD=sc.textFile('./test.csv')
testRDD.count()

233600

In [29]:
testRDD.take(3)

['User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3',
 '1000004,P00128942,M,46-50,7,B,2,1,1,11,',
 '1000009,P00113442,M,26-35,17,C,0,0,3,5,']

In [30]:
header=testRDD.first()

In [31]:
testRDD=testRDD.filter(lambda line: line!=header)

In [32]:
print('After the header record is removed ')
testRDD.first()

After the header record is removed 


'1000004,P00128942,M,46-50,7,B,2,1,1,11,'

In [33]:
# split data based on the separator
splitRDD=testRDD.map(lambda line: line.split(','))
print('After splitting the records are : \n')
splitRDD.take(2)

After splitting the records are : 



[['1000004', 'P00128942', 'M', '46-50', '7', 'B', '2', '1', '1', '11', ''],
 ['1000009', 'P00113442', 'M', '26-35', '17', 'C', '0', '0', '3', '5', '']]

In [34]:
from pyspark.sql.types import *
testRDDSchema=StructType([
    StructField('User_Id', StringType(),True),
    StructField('ProductId', StringType(),True),
    StructField('Gender', StringType(),True),
    StructField('Age', StringType(),True),
    StructField('Occupation', StringType(),True),
    StructField('City_Category', StringType(),True),
    StructField('Stay_In_Current_City_Years', StringType(),True),
    StructField('Marital_Status', StringType(),True),
    StructField('Product_Category_1', StringType(),True),
    StructField('Product_Category_2', StringType(),True),
    StructField('Product_Category_3', StringType(),True),
])

In [35]:
testDF=spark.createDataFrame(data=splitRDD,schema=testRDDSchema)

In [36]:
testDF.show(2)

+-------+---------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+
|User_Id|ProductId|Gender|  Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|
+-------+---------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+
|1000004|P00128942|     M|46-50|         7|            B|                         2|             1|                 1|                11|                  |
|1000009|P00113442|     M|26-35|        17|            C|                         0|             0|                 3|                 5|                  |
+-------+---------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+
only showing top 2 rows



In [37]:
testDF.count()

233599

In [38]:
testSample=testDF.sample(False,0.1,98) # <without_duplication, sample_percentage, seed_value>

In [39]:
testSample.count()

23403

In [40]:
trainDF=spark.read.format('csv').option('header','true').option('inferSchema','true').load('./train.csv')

In [41]:
trainDF

DataFrame[User_ID: int, Product_ID: string, Gender: string, Age: string, Occupation: int, City_Category: string, Stay_In_Current_City_Years: string, Marital_Status: int, Product_Category_1: int, Product_Category_2: int, Product_Category_3: int, Purchase: int]

In [42]:
trainDF.show()

+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
|User_ID|Product_ID|Gender|  Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|
+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
|1000001| P00069042|     F| 0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|
|1000001| P00248942|     F| 0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|
|1000001| P00087842|     F| 0-17|        10|            A|                         2|             0|                12|              null|              null|    1422

In [43]:
trainSample=trainDF.sample(False, 0.1, 192)
trainSample.count()

54729

In [44]:
trainSamplePD=trainSample.toPandas()
print(type(trainSample))
print(type(trainSamplePD))

<class 'pyspark.sql.dataframe.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [45]:
# trainSamplePD = trainSamplePD.set_index('User_ID')

In [46]:
trainSamplePD.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000005,P00031342,M,26-35,20,A,1,1,8,,,6073
1,1000006,P00231342,F,51-55,9,A,1,0,5,8.0,14.0,5378
2,1000006,P00190242,F,51-55,9,A,1,0,4,5.0,,2079
3,1000008,P00220442,M,26-35,12,C,4+,1,5,14.0,,8584
4,1000010,P00297942,F,36-45,1,B,4+,1,8,,,5875


In [47]:
trainSamplePD.to_csv('./2023_train_sample.csv')

In [48]:
trainDF.printSchema()

root
 |-- User_ID: integer (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Occupation: integer (nullable = true)
 |-- City_Category: string (nullable = true)
 |-- Stay_In_Current_City_Years: string (nullable = true)
 |-- Marital_Status: integer (nullable = true)
 |-- Product_Category_1: integer (nullable = true)
 |-- Product_Category_2: integer (nullable = true)
 |-- Product_Category_3: integer (nullable = true)
 |-- Purchase: integer (nullable = true)



In [49]:
trainDF.head(2)

[Row(User_ID=1000001, Product_ID='P00069042', Gender='F', Age='0-17', Occupation=10, City_Category='A', Stay_In_Current_City_Years='2', Marital_Status=0, Product_Category_1=3, Product_Category_2=None, Product_Category_3=None, Purchase=8370),
 Row(User_ID=1000001, Product_ID='P00248942', Gender='F', Age='0-17', Occupation=10, City_Category='A', Stay_In_Current_City_Years='2', Marital_Status=0, Product_Category_1=1, Product_Category_2=6, Product_Category_3=14, Purchase=15200)]

In [50]:
trainDF.show(2)

+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
|User_ID|Product_ID|Gender| Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
|1000001| P00069042|     F|0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|
|1000001| P00248942|     F|0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
only

In [51]:
print('Number of records in Training Dataset {}'.format(trainDF.count()))
print('Number of records in Testing Dataset {}'.format(testDF.count()))

Number of records in Training Dataset 550068
Number of records in Testing Dataset 233599


In [52]:
trainDF.describe().show()

+-------+------------------+----------+------+------+-----------------+-------------+--------------------------+-------------------+------------------+------------------+------------------+-----------------+
|summary|           User_ID|Product_ID|Gender|   Age|       Occupation|City_Category|Stay_In_Current_City_Years|     Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|         Purchase|
+-------+------------------+----------+------+------+-----------------+-------------+--------------------------+-------------------+------------------+------------------+------------------+-----------------+
|  count|            550068|    550068|550068|550068|           550068|       550068|                    550068|             550068|            550068|            376430|            166821|           550068|
|   mean|1003028.8424013031|      null|  null|  null|8.076706879876669|         null|         1.468494139793958|0.40965298835780306| 5.404270017525106| 9.84232925112238

In [53]:
testDF.describe().show()

+-------+------------------+---------+------+------+-----------------+-------------+--------------------------+------------------+------------------+------------------+------------------+
|summary|           User_Id|ProductId|Gender|   Age|       Occupation|City_Category|Stay_In_Current_City_Years|    Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|
+-------+------------------+---------+------+------+-----------------+-------------+--------------------------+------------------+------------------+------------------+------------------+
|  count|            233599|   233599|233599|233599|           233599|       233599|                    233599|            233599|            233599|            233599|            233599|
|   mean|1003029.3568594044|     null|  null|  null|8.085407043694536|         null|        1.4682778997642345|0.4100702485883929| 5.276542279718663| 9.849586059346997|12.669453946534905|
| stddev|  1726.50496799554|     null|  null|  null|6.521146

In [54]:
trainDF.describe('Purchase').show()

+-------+-----------------+
|summary|         Purchase|
+-------+-----------------+
|  count|           550068|
|   mean|9263.968712959126|
| stddev|5023.065393820575|
|    min|               12|
|    max|            23961|
+-------+-----------------+



In [55]:
trainDF.createOrReplaceTempView('trainDFTable')

In [56]:
trainDF.show(2)

+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
|User_ID|Product_ID|Gender| Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
|1000001| P00069042|     F|0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|
|1000001| P00248942|     F|0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
only

In [57]:
spark.sql("select * from trainDFTable limit 2")

DataFrame[User_ID: int, Product_ID: string, Gender: string, Age: string, Occupation: int, City_Category: string, Stay_In_Current_City_Years: string, Marital_Status: int, Product_Category_1: int, Product_Category_2: int, Product_Category_3: int, Purchase: int]

In [58]:
spark.sql("select * from trainDFTable limit 2").show()

+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
|User_ID|Product_ID|Gender| Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+
|1000001| P00069042|     F|0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|
|1000001| P00248942|     F|0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+



In [59]:
dataFrameWay=trainDF.groupBy('Age').count()
dataFrameWay

DataFrame[Age: string, count: bigint]

In [60]:
dataFrameWay.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[Age#183], functions=[count(1)])
   +- Exchange hashpartitioning(Age#183, 200), ENSURE_REQUIREMENTS, [plan_id=485]
      +- HashAggregate(keys=[Age#183], functions=[partial_count(1)])
         +- FileScan csv [Age#183] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/C:/Users/pdharantej/OneDrive - ALLEGIS GROUP/Desktop/TEK_Trainin..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<Age:string>




In [61]:
sqlWay=spark.sql('select Age, count(1) from trainDFTable group by Age')

In [62]:
sqlWay.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[Age#183], functions=[count(1)])
   +- Exchange hashpartitioning(Age#183, 200), ENSURE_REQUIREMENTS, [plan_id=498]
      +- HashAggregate(keys=[Age#183], functions=[partial_count(1)])
         +- FileScan csv [Age#183] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/C:/Users/pdharantej/OneDrive - ALLEGIS GROUP/Desktop/TEK_Trainin..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<Age:string>




In [63]:
# The above two statements give the same output 

In [64]:
from pyspark.sql.functions import expr, col, column
dfWay=trainDF.filter(col('Age')!='0-17').groupBy('Age').count()

In [65]:
dfWay.show()

+-----+------+
|  Age| count|
+-----+------+
|18-25| 99660|
|26-35|219587|
|46-50| 45701|
|51-55| 38501|
|36-45|110013|
|  55+| 21504|
+-----+------+



In [70]:
sqlWay=spark.sql('select Age from trainDFTable')

In [71]:
trainDF.select(expr('User_Id as userID'),col('User_ID'),'User_ID').show(3)

+-------+-------+-------+
| userID|User_ID|User_ID|
+-------+-------+-------+
|1000001|1000001|1000001|
|1000001|1000001|1000001|
|1000001|1000001|1000001|
+-------+-------+-------+
only showing top 3 rows



In [72]:
trainDF.select(expr('User_ID as user')).show(2)

+-------+
|   user|
+-------+
|1000001|
|1000001|
+-------+
only showing top 2 rows



In [73]:
spark.sql('select User_ID as userID from trainDFTable').show(2)

+-------+
| userID|
+-------+
|1000001|
|1000001|
+-------+
only showing top 2 rows



In [74]:
trainDF.selectExpr('User_ID as userId', 'product_ID as productID').show(2)

+-------+---------+
| userId|productID|
+-------+---------+
|1000001|P00069042|
|1000001|P00248942|
+-------+---------+
only showing top 2 rows



In [75]:
trainDF.select('User_ID', 'product_ID', 'Age').show()

+-------+----------+-----+
|User_ID|product_ID|  Age|
+-------+----------+-----+
|1000001| P00069042| 0-17|
|1000001| P00248942| 0-17|
|1000001| P00087842| 0-17|
|1000001| P00085442| 0-17|
|1000002| P00285442|  55+|
|1000003| P00193542|26-35|
|1000004| P00184942|46-50|
|1000004| P00346142|46-50|
|1000004|  P0097242|46-50|
|1000005| P00274942|26-35|
|1000005| P00251242|26-35|
|1000005| P00014542|26-35|
|1000005| P00031342|26-35|
|1000005| P00145042|26-35|
|1000006| P00231342|51-55|
|1000006| P00190242|51-55|
|1000006|  P0096642|51-55|
|1000006| P00058442|51-55|
|1000007| P00036842|36-45|
|1000008| P00249542|26-35|
+-------+----------+-----+
only showing top 20 rows



In [76]:
from pyspark.sql.functions import lit
trainDF.select('*', lit(1).alias('ConstantOne')).show(3)

+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|User_ID|Product_ID|Gender| Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|1000001| P00069042|     F|0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|          1|
|1000001| P00248942|     F|0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|
|1000001| P00087842|     F|0-17|        10|            A|                         2|             0|           

In [77]:
# pyspark datastructures are immutable 

In [84]:
spark.sql('select *, 1 as ConstantOne from trainDFTable limit 3').show()

+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|User_ID|Product_ID|Gender| Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|1000001| P00069042|     F|0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|          1|
|1000001| P00248942|     F|0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|
|1000001| P00087842|     F|0-17|        10|            A|                         2|             0|           

In [85]:
trainDF=trainDF.withColumn('ConstantOne', lit(1))

In [87]:
trainDF.show(3)

+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|User_ID|Product_ID|Gender| Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|1000001| P00069042|     F|0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|          1|
|1000001| P00248942|     F|0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|
|1000001| P00087842|     F|0-17|        10|            A|                         2|             0|           

In [88]:
trainDF.withColumn('OccuopationOne', trainDF.Occupation+1).show()

+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+--------------+
|User_ID|Product_ID|Gender|  Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|OccuopationOne|
+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+--------------+
|1000001| P00069042|     F| 0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|          1|            11|
|1000001| P00248942|     F| 0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|            11|
|1000001| P00087842|     F| 0-

In [90]:
tempDF=trainDF.withColumn('SameCategoryCode',
                          trainDF['Product_Category_1']==trainDF['Product_Category_2'])
tempDF.show(2)

+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+----------------+
|User_ID|Product_ID|Gender| Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|SameCategoryCode|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+----------------+
|1000001| P00069042|     F|0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|          1|            null|
|1000001| P00248942|     F|0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|           false|
+-------+----------+-----

In [91]:
tempDF.filter(col('SameCategoryCode')==False).show(3)

+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+----------------+
|User_ID|Product_ID|Gender|  Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|SameCategoryCode|
+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+----------------+
|1000001| P00248942|     F| 0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|           false|
|1000001| P00085442|     F| 0-17|        10|            A|                         2|             0|                12|                14|              null|    1057|          1|           false|
|1000003| P00193542|

In [94]:
tempDF.withColumnRenamed('SameCategoryCode', 'SimilarCategoryCode').show()

+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+-------------------+
|User_ID|Product_ID|Gender|  Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|SimilarCategoryCode|
+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+-------------------+
|1000001| P00069042|     F| 0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|          1|               null|
|1000001| P00248942|     F| 0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|              false|
|1000

In [95]:
tempDF.drop('SameCategoryCode').show()

+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|User_ID|Product_ID|Gender|  Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|
+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|1000001| P00069042|     F| 0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|          1|
|1000001| P00248942|     F| 0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|
|1000001| P00087842|     F| 0-17|        10|            A|                         2|             0|     

In [96]:
tempDF.printSchema()

root
 |-- User_ID: integer (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Occupation: integer (nullable = true)
 |-- City_Category: string (nullable = true)
 |-- Stay_In_Current_City_Years: string (nullable = true)
 |-- Marital_Status: integer (nullable = true)
 |-- Product_Category_1: integer (nullable = true)
 |-- Product_Category_2: integer (nullable = true)
 |-- Product_Category_3: integer (nullable = true)
 |-- Purchase: integer (nullable = true)
 |-- ConstantOne: integer (nullable = false)
 |-- SameCategoryCode: boolean (nullable = true)



In [98]:
tempDF.withColumn('Purchase', col('Purchase').cast('String')).printSchema()

root
 |-- User_ID: integer (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Occupation: integer (nullable = true)
 |-- City_Category: string (nullable = true)
 |-- Stay_In_Current_City_Years: string (nullable = true)
 |-- Marital_Status: integer (nullable = true)
 |-- Product_Category_1: integer (nullable = true)
 |-- Product_Category_2: integer (nullable = true)
 |-- Product_Category_3: integer (nullable = true)
 |-- Purchase: string (nullable = true)
 |-- ConstantOne: integer (nullable = false)
 |-- SameCategoryCode: boolean (nullable = true)



In [106]:
trainDF.select('Product_ID').distinct().count()

3631

In [107]:
testDF=testDF.withColumn('Product_ID', col('ProductID'))

In [108]:
testDF.select('Product_ID').distinct().count()

3491

In [109]:
diff_cat_in_test_train=testDF.select('Product_ID').subtract(trainDF.select('Product_ID'))
diff_cat_in_test_train.count()

46

In [116]:
diff_cat_in_train_test=trainDF.select('Product_ID').subtract(testDF.select('Product_ID'))
diff_cat_in_train_test.count()

186

In [121]:
trainDF.crosstab('Age', 'Gender').show()

+----------+-----+------+
|Age_Gender|    F|     M|
+----------+-----+------+
|      0-17| 5083| 10019|
|     46-50|13199| 32502|
|     18-25|24628| 75032|
|     36-45|27170| 82843|
|       55+| 5083| 16421|
|     51-55| 9894| 28607|
|     26-35|50752|168835|
+----------+-----+------+



In [123]:
trainDF.crosstab('Gender', 'Age').show()

+----------+-----+-----+------+-----+-----+-----+-----+
|Gender_Age| 0-17|18-25| 26-35|36-45|46-50|51-55|  55+|
+----------+-----+-----+------+-----+-----+-----+-----+
|         F| 5083|24628| 50752|27170|13199| 9894| 5083|
|         M|10019|75032|168835|82843|32502|28607|16421|
+----------+-----+-----+------+-----+-----+-----+-----+



In [125]:
trainDF.groupBy('Age', 'Gender').count().show()

+-----+------+------+
|  Age|Gender| count|
+-----+------+------+
|51-55|     F|  9894|
|18-25|     M| 75032|
| 0-17|     F|  5083|
|46-50|     M| 32502|
|18-25|     F| 24628|
|  55+|     M| 16421|
|  55+|     F|  5083|
|36-45|     M| 82843|
|26-35|     F| 50752|
| 0-17|     M| 10019|
|36-45|     F| 27170|
|51-55|     M| 28607|
|26-35|     M|168835|
|46-50|     F| 13199|
+-----+------+------+



In [126]:
trainDF.groupBy('Gender', 'Age').count().show()

+------+-----+------+
|Gender|  Age| count|
+------+-----+------+
|     F|46-50| 13199|
|     M| 0-17| 10019|
|     M|26-35|168835|
|     M|51-55| 28607|
|     M|18-25| 75032|
|     M|  55+| 16421|
|     F|51-55|  9894|
|     F|36-45| 27170|
|     F|18-25| 24628|
|     F|  55+|  5083|
|     M|36-45| 82843|
|     F| 0-17|  5083|
|     M|46-50| 32502|
|     F|26-35| 50752|
+------+-----+------+



In [131]:
spark.sql('''
    select age, 
    sum(case when gender='F' then 1 else 0 end) as Female,
    sum(case when gender='M' then 1 else 0 end) as Male
    from trainDFTable
    group by age
''').show()

+-----+------+------+
|  age|Female|  Male|
+-----+------+------+
|18-25| 24628| 75032|
|26-35| 50752|168835|
| 0-17|  5083| 10019|
|46-50| 13199| 32502|
|51-55|  9894| 28607|
|36-45| 27170| 82843|
|  55+|  5083| 16421|
+-----+------+------+



In [132]:
trainDF.select('Age', 'Gender').dropDuplicates().show(20)

+-----+------+
|  Age|Gender|
+-----+------+
|51-55|     F|
|18-25|     M|
| 0-17|     F|
|46-50|     M|
|18-25|     F|
|  55+|     M|
|  55+|     F|
|36-45|     M|
|26-35|     F|
| 0-17|     M|
|36-45|     F|
|51-55|     M|
|26-35|     M|
|46-50|     F|
+-----+------+



In [133]:
trainDF.dropna().count()

166821

In [134]:
trainDF.na.drop('any').count()
# any column has the null values, the whole row gets to drop

166821

In [135]:
trainDF.na.drop().count()

166821

In [136]:
trainDF.fillna(-1).show()

+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|User_ID|Product_ID|Gender|  Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|
+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|1000001| P00069042|     F| 0-17|        10|            A|                         2|             0|                 3|                -1|                -1|    8370|          1|
|1000001| P00248942|     F| 0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|
|1000001| P00087842|     F| 0-17|        10|            A|                         2|             0|     

In [138]:
trainDF.fillna().show(2)

+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|User_ID|Product_ID|Gender| Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|1000001| P00069042|     F|0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|          1|
|1000001| P00248942|     F|0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+-----------

In [139]:
fill_col_values={
    "Gender": 'M',
    'Purchase': 9999999,
    'Product_Category_3': -1
}
trainDF.na.fill(fill_col_values).show(3)

+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|User_ID|Product_ID|Gender| Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|1000001| P00069042|     F|0-17|        10|            A|                         2|             0|                 3|              null|                -1|    8370|          1|
|1000001| P00248942|     F|0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|
|1000001| P00087842|     F|0-17|        10|            A|                         2|             0|           

In [140]:
trainDF.na.replace([''], ['UNKNOWN'], ['Gender']).count()

550068

In [141]:
print('Purchase amount greater than 15000 {}')

Purchase amount greater than 15000 {}


In [142]:
trainDF.where('Purchase > 15000').where('Gender = "F"').count()

21429

In [143]:
trainDF.filter('Purchase > 15000').where('Gender = "F"').count()

21429

In [144]:
trainDF.where((col('Purchase') > 15000) & (col('Gender')=='F')).count()

21429

In [145]:
trainDF.filter((col('Purchase') > 15000) & (col('Gender')=='F')).count()

21429

In [146]:
spark.sql('select * from trainDFTable where Purchase > 15000 and Gender = "F"').count()

21429

In [147]:
from pyspark.sql.functions import countDistinct
trainDF.select(countDistinct('Age').alias('DISTINCT_Age')).show()

+------------+
|DISTINCT_Age|
+------------+
|           7|
+------------+



In [148]:
trainDF.agg(countDistinct('Age').alias('DISTINCT_Age')).show()

+------------+
|DISTINCT_Age|
+------------+
|           7|
+------------+



In [149]:
from pyspark.sql import functions as func

In [151]:
trainDF.show(2)

+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|User_ID|Product_ID|Gender| Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|1000001| P00069042|     F|0-17|        10|            A|                         2|             0|                 3|              null|              null|    8370|          1|
|1000001| P00248942|     F|0-17|        10|            A|                         2|             0|                 1|                 6|                14|   15200|          1|
+-------+----------+------+----+----------+-------------+--------------------------+--------------+-----------

In [155]:
trainDF.agg(func.min('Age'), func.avg('Purchase')).show()

+--------+-----------------+
|min(Age)|    avg(Purchase)|
+--------+-----------------+
|    0-17|9263.968712959126|
+--------+-----------------+



In [156]:
trainDF.groupBy('Age').count().show()

+-----+------+
|  Age| count|
+-----+------+
|18-25| 99660|
|26-35|219587|
| 0-17| 15102|
|46-50| 45701|
|51-55| 38501|
|36-45|110013|
|  55+| 21504|
+-----+------+



In [157]:
from pyspark.sql.functions import approx_count_distinct

In [158]:
?approx_count_distinct

In [159]:
trainDF.select(approx_count_distinct('Age',0.1)).show()

+--------------------------+
|approx_count_distinct(Age)|
+--------------------------+
|                         7|
+--------------------------+



In [170]:
from pyspark.sql.functions import first, last
trainDF.select(first('Product_ID',True),last('Product_ID',True)).show()

+-----------------+----------------+
|first(Product_ID)|last(Product_ID)|
+-----------------+----------------+
|        P00069042|       P00371644|
+-----------------+----------------+



In [172]:
from pyspark.sql.functions import min, max
trainDF.select(min('Purchase'),max('Purchase')).show()

+-------------+-------------+
|min(Purchase)|max(Purchase)|
+-------------+-------------+
|           12|        23961|
+-------------+-------------+



In [173]:
from pyspark.sql.functions import sum, sum_distinct

In [174]:
trainDF.select(sum('Purchase')).show()

+-------------+
|sum(Purchase)|
+-------------+
|   5095812742|
+-------------+



In [176]:
trainDF.select(sum_distinct('Purchase')).show()

+----------------------+
|sum(DISTINCT Purchase)|
+----------------------+
|             208520914|
+----------------------+



In [177]:
from pyspark.sql.functions import sum, count, avg, expr
trainDF.select(
    count('Purchase').alias('total_transactions'),
    sum('Purchase').alias('total_purchases'),
    avg('Purchase').alias('avg_purchases'),
    expr('mean(Purchase)').alias('mean_purchases')
).selectExpr(
    'total_purchases/total_transactions', 'avg_purchases', 'mean_purchases'
).show()

+--------------------------------------+-----------------+-----------------+
|(total_purchases / total_transactions)|    avg_purchases|   mean_purchases|
+--------------------------------------+-----------------+-----------------+
|                     9263.968712959126|9263.968712959126|9263.968712959126|
+--------------------------------------+-----------------+-----------------+



In [178]:
from pyspark.sql.functions import var_pop, var_samp, stddev_pop, stddev_samp
trainDF.select(var_pop('Purchase'), var_samp('Purchase'), stddev_pop('Purchase'), stddev_samp('Purchase')).show()

+--------------------+-------------------+--------------------+---------------------+
|   var_pop(Purchase)| var_samp(Purchase)|stddev_pop(Purchase)|stddev_samp(Purchase)|
+--------------------+-------------------+--------------------+---------------------+
|2.5231140081385408E7|2.523118595059785E7|   5023.060827959921|    5023.065393820575|
+--------------------+-------------------+--------------------+---------------------+



In [179]:
from pyspark.sql.functions import collect_list, collect_set
trainDF.agg(collect_set('Age'), collect_list('Age')).show()

+--------------------+--------------------+
|    collect_set(Age)|   collect_list(Age)|
+--------------------+--------------------+
|[55+, 51-55, 0-17...|[0-17, 0-17, 0-17...|
+--------------------+--------------------+



In [180]:
spark.sql('select collect_set(Age), collect_list(Age) from trainDFTable').show()

+--------------------+--------------------+
|    collect_set(Age)|   collect_list(Age)|
+--------------------+--------------------+
|[55+, 51-55, 0-17...|[0-17, 0-17, 0-17...|
+--------------------+--------------------+



In [181]:
trainDF.groupBy('Age').agg(count('Purchase').alias('quan'),
                          expr('count(Purchase)')).show()

+-----+------+---------------+
|  Age|  quan|count(Purchase)|
+-----+------+---------------+
|18-25| 99660|          99660|
|26-35|219587|         219587|
| 0-17| 15102|          15102|
|46-50| 45701|          45701|
|51-55| 38501|          38501|
|36-45|110013|         110013|
|  55+| 21504|          21504|
+-----+------+---------------+



In [182]:
trainDF.groupBy('Age').agg({'Purchase':'mean'}).show()

+-----+-----------------+
|  Age|    avg(Purchase)|
+-----+-----------------+
|18-25|9169.663606261289|
|26-35|9252.690632869888|
| 0-17|8933.464640444974|
|46-50|9208.625697468327|
|51-55|9534.808030960236|
|36-45|9331.350694917874|
|  55+|9336.280459449405|
+-----+-----------------+



In [183]:
trainDF.groupBy('Age').agg({'Purchase':'sum'}).show()

+-----+-------------+
|  Age|sum(Purchase)|
+-----+-------------+
|18-25|    913848675|
|26-35|   2031770578|
| 0-17|    134913183|
|46-50|    420843403|
|51-55|    367099644|
|36-45|   1026569884|
|  55+|    200767375|
+-----+-------------+



In [184]:
exprs={x: 'sum' for x in trainDF.columns}
trainDF.groupBy('Age').agg(exprs).show()

+-----+------------------+-----------------------+-------------------+-------------+------------+---------------+-------------------------------+-----------------------+--------+----------------+-----------+-----------------------+---------------+
|  Age|sum(City_Category)|sum(Product_Category_3)|sum(Marital_Status)|sum(Purchase)|sum(User_ID)|sum(Occupation)|sum(Stay_In_Current_City_Years)|sum(Product_Category_1)|sum(Age)|sum(ConstantOne)|sum(Gender)|sum(Product_Category_2)|sum(Product_ID)|
+-----+------------------+-----------------------+-------------------+-------------+------------+---------------+-------------------------------+-----------------------+--------+----------------+-----------+-----------------------+---------------+
|18-25|              null|                 388041|              21116|    913848675| 99939196632|         671348|                       116997.0|                 509371|    null|           99660|       null|                 654936|           null|
|26-35| 

In [186]:
### Joins

person=spark.createDataFrame(
    [
        (0,'Sundar Pichai',0,[250,100]),
        (1,'Sergery',1,[500,250,100]),
        (2,'William Oak',2,[100])
    ]
).toDF('id', 'name', 'graduate_program', 'role_status')


In [187]:
person

DataFrame[id: bigint, name: string, graduate_program: bigint, role_status: array<bigint>]

In [188]:
person.show()

+---+-------------+----------------+---------------+
| id|         name|graduate_program|    role_status|
+---+-------------+----------------+---------------+
|  0|Sundar Pichai|               0|     [250, 100]|
|  1|      Sergery|               1|[500, 250, 100]|
|  2|  William Oak|               2|          [100]|
+---+-------------+----------------+---------------+



In [189]:
graduateProgram = spark.createDataFrame(
    [
        (0, 'MBA', 'School of MBA', 'Penn State University'),
        (1, 'Ph.D', 'Computer Science', 'Stanford University'),
        (2, 'Ph.D', 'School of Information', 'Oklhama University')
    ]
).toDF('id', 'degree', 'dept', 'school')

In [190]:
graduateProgram

DataFrame[id: bigint, degree: string, dept: string, school: string]

In [193]:
graduateProgram.show(truncate=False)

+---+------+---------------------+---------------------+
|id |degree|dept                 |school               |
+---+------+---------------------+---------------------+
|0  |MBA   |School of MBA        |Penn State University|
|1  |Ph.D  |Computer Science     |Stanford University  |
|2  |Ph.D  |School of Information|Oklhama University   |
+---+------+---------------------+---------------------+



In [194]:
roleStatus=spark.createDataFrame(
    [
        (500, 'President'),
        (250, 'Founder'),
        (100, 'Director')
    ]
).toDF('id', 'status')

In [195]:
roleStatus

DataFrame[id: bigint, status: string]

In [196]:
roleStatus.show(truncate=False)

+---+---------+
|id |status   |
+---+---------+
|500|President|
|250|Founder  |
|100|Director |
+---+---------+



In [197]:
# tempView is for one spark session
# globalView is when you have multiple spark sessions
person.createOrReplaceTempView('personTbl')
graduateProgram.createOrReplaceTempView('graduateProgramTbl')
roleStatus.createOrReplaceTempView('roleStatusTbl')

In [200]:
joinExpression=person['graduate_program'] == graduateProgram['id']
person.join(graduateProgram, joinExpression).show(truncate=False)

+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|id |name         |graduate_program|role_status    |id |degree|dept                 |school               |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|0  |Sundar Pichai|0               |[250, 100]     |0  |MBA   |School of MBA        |Penn State University|
|1  |Sergery      |1               |[500, 250, 100]|1  |Ph.D  |Computer Science     |Stanford University  |
|2  |William Oak  |2               |[100]          |2  |Ph.D  |School of Information|Oklhama University   |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+



In [201]:
spark.sql('''
    select * from personTbl join graduateProgramTbl
    on personTbl.graduate_program=graduateProgramTbl.id
''').show(truncate=False)

+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|id |name         |graduate_program|role_status    |id |degree|dept                 |school               |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|0  |Sundar Pichai|0               |[250, 100]     |0  |MBA   |School of MBA        |Penn State University|
|1  |Sergery      |1               |[500, 250, 100]|1  |Ph.D  |Computer Science     |Stanford University  |
|2  |William Oak  |2               |[100]          |2  |Ph.D  |School of Information|Oklhama University   |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+



In [202]:
joinType='inner'
person.join(graduateProgram, joinExpression, joinType).show(truncate=False)

+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|id |name         |graduate_program|role_status    |id |degree|dept                 |school               |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|0  |Sundar Pichai|0               |[250, 100]     |0  |MBA   |School of MBA        |Penn State University|
|1  |Sergery      |1               |[500, 250, 100]|1  |Ph.D  |Computer Science     |Stanford University  |
|2  |William Oak  |2               |[100]          |2  |Ph.D  |School of Information|Oklhama University   |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+



In [203]:
joinType='outer'
person.join(graduateProgram, joinExpression, joinType).show(truncate=False)

+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|id |name         |graduate_program|role_status    |id |degree|dept                 |school               |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|0  |Sundar Pichai|0               |[250, 100]     |0  |MBA   |School of MBA        |Penn State University|
|1  |Sergery      |1               |[500, 250, 100]|1  |Ph.D  |Computer Science     |Stanford University  |
|2  |William Oak  |2               |[100]          |2  |Ph.D  |School of Information|Oklhama University   |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+



In [204]:
joinType='left_outer'
person.join(graduateProgram, joinExpression, joinType).show(truncate=False)

+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|id |name         |graduate_program|role_status    |id |degree|dept                 |school               |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|0  |Sundar Pichai|0               |[250, 100]     |0  |MBA   |School of MBA        |Penn State University|
|1  |Sergery      |1               |[500, 250, 100]|1  |Ph.D  |Computer Science     |Stanford University  |
|2  |William Oak  |2               |[100]          |2  |Ph.D  |School of Information|Oklhama University   |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+



In [205]:
joinType='right_outer'
person.join(graduateProgram, joinExpression, joinType).show(truncate=False)

+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|id |name         |graduate_program|role_status    |id |degree|dept                 |school               |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+
|0  |Sundar Pichai|0               |[250, 100]     |0  |MBA   |School of MBA        |Penn State University|
|1  |Sergery      |1               |[500, 250, 100]|1  |Ph.D  |Computer Science     |Stanford University  |
|2  |William Oak  |2               |[100]          |2  |Ph.D  |School of Information|Oklhama University   |
+---+-------------+----------------+---------------+---+------+---------------------+---------------------+



In [207]:
from pyspark.sql.functions import expr
person.withColumnRenamed('id', 'personId').join(roleStatus, expr('array_contains(role_status, id)')).show(truncate=False)

+--------+-------------+----------------+---------------+---+---------+
|personId|name         |graduate_program|role_status    |id |status   |
+--------+-------------+----------------+---------------+---+---------+
|0       |Sundar Pichai|0               |[250, 100]     |250|Founder  |
|0       |Sundar Pichai|0               |[250, 100]     |100|Director |
|1       |Sergery      |1               |[500, 250, 100]|500|President|
|1       |Sergery      |1               |[500, 250, 100]|250|Founder  |
|1       |Sergery      |1               |[500, 250, 100]|100|Director |
|2       |William Oak  |2               |[100]          |100|Director |
+--------+-------------+----------------+---------------+---+---------+



In [209]:
spark.sql('''
    select * from 
    (select id as personId, name, graduate_program, role_status from personTbl)
    inner join roleStatusTbl on array_contains(role_status, id)
''').show(truncate=False)

+--------+-------------+----------------+---------------+---+---------+
|personId|name         |graduate_program|role_status    |id |status   |
+--------+-------------+----------------+---------------+---+---------+
|0       |Sundar Pichai|0               |[250, 100]     |250|Founder  |
|0       |Sundar Pichai|0               |[250, 100]     |100|Director |
|1       |Sergery      |1               |[500, 250, 100]|500|President|
|1       |Sergery      |1               |[500, 250, 100]|250|Founder  |
|1       |Sergery      |1               |[500, 250, 100]|100|Director |
|2       |William Oak  |2               |[100]          |100|Director |
+--------+-------------+----------------+---------------+---+---------+



In [210]:
trainDF.count()

550068

In [211]:
sample1=trainDF.sample(False, 0.1, 1234)
sample2=trainDF.sample(False, 0.1, 2345)

In [212]:
sample1.count()

55488

In [213]:
sample2.count()

54712

In [214]:
splitDF=trainDF.randomSplit([0.7, 0.3], seed=8787)

In [216]:
type(splitDF)

list

In [217]:
splitDF

[DataFrame[User_ID: int, Product_ID: string, Gender: string, Age: string, Occupation: int, City_Category: string, Stay_In_Current_City_Years: string, Marital_Status: int, Product_Category_1: int, Product_Category_2: int, Product_Category_3: int, Purchase: int, ConstantOne: int],
 DataFrame[User_ID: int, Product_ID: string, Gender: string, Age: string, Occupation: int, City_Category: string, Stay_In_Current_City_Years: string, Marital_Status: int, Product_Category_1: int, Product_Category_2: int, Product_Category_3: int, Purchase: int, ConstantOne: int]]

In [218]:
trainDF.select('User_ID').rdd.map(lambda x: (x, 1)).take(5)

[(Row(User_ID=1000001), 1),
 (Row(User_ID=1000001), 1),
 (Row(User_ID=1000001), 1),
 (Row(User_ID=1000001), 1),
 (Row(User_ID=1000002), 1)]

In [219]:
trainDF.orderBy(trainDF.Purchase.desc()).show()

+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|User_ID|Product_ID|Gender|  Age|Occupation|City_Category|Stay_In_Current_City_Years|Marital_Status|Product_Category_1|Product_Category_2|Product_Category_3|Purchase|ConstantOne|
+-------+----------+------+-----+----------+-------------+--------------------------+--------------+------------------+------------------+------------------+--------+-----------+
|1002272| P00052842|     M|26-35|         0|            C|                         1|             0|                10|                15|              null|   23961|          1|
|1003160| P00052842|     M|26-35|        17|            C|                         3|             0|                10|                15|              null|   23961|          1|
|1001474| P00052842|     M|26-35|         4|            A|                         2|             1|     

In [221]:
not_count_cat = diff_cat_in_test_train.rdd.map(lambda x: x[0]).collect()

In [222]:
not_count_cat

['P00300142',
 'P00077642',
 'P00092742',
 'P00082142',
 'P00062542',
 'P00013042',
 'P00279042',
 'P00227242',
 'P00359842',
 'P00061642',
 'P0099542',
 'P00306842',
 'P00140842',
 'P00165542',
 'P00268942',
 'P00236842',
 'P00172942',
 'P00012642',
 'P00336842',
 'P00105742',
 'P00309842',
 'P00100242',
 'P00315342',
 'P00168242',
 'P00156942',
 'P00039042',
 'P00056942',
 'P00322642',
 'P00249942',
 'P00294942',
 'P00106242',
 'P00239542',
 'P00074942',
 'P00030342',
 'P00063942',
 'P00042642',
 'P00322842',
 'P00038942',
 'P00270342',
 'P00312642',
 'P00166542',
 'P00082642',
 'P00253842',
 'P00062242',
 'P00058842',
 'P00204642']

In [230]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

Function1=udf(lambda x: '-1' if x in not_count_cat else x, StringType())

In [231]:
k=testDF.withColumn('NewProductId', Function1(testDF['ProductId'])).select('NewProductId')

In [232]:
k.where(k['NewProductId']==-1).show(3)

+------------+
|NewProductId|
+------------+
|          -1|
|          -1|
|          -1|
+------------+
only showing top 3 rows



In [244]:
from pyspark.sql.functions import lit, round, bround
trainDF.select(round(lit('2.5')), bround(lit('2.5'))).show(2)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 2 rows



In [243]:
spark.sql('select round(2.9), round(2.5), bround(2.4), bround(2.9)').show(3)

+-------------+-------------+--------------+--------------+
|round(2.9, 0)|round(2.5, 0)|bround(2.4, 0)|bround(2.9, 0)|
+-------------+-------------+--------------+--------------+
|            3|            3|             2|             3|
+-------------+-------------+--------------+--------------+



In [245]:
spark.sql('select round(2.5), bround(2.5)').show(2)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|            3|             2|
+-------------+--------------+



In [252]:
# String Manipulations

from pyspark.sql.functions import lit, ltrim, rtrim, lpad, rpad, trim
trainDF.select(
    ltrim(lit(" HELLO ")).alias('ltrim'),
    rtrim(lit(" HELLO ")).alias('rtrim'),
    trim(lit(" HELLO ")).alias('trim'),
    lpad(lit("HELLO"),7," ").alias('lpad'),
    rpad(lit("HELLO"),7," ").alias('rpad')
).show(2)

+------+------+-----+-------+-------+
| ltrim| rtrim| trim|   lpad|   rpad|
+------+------+-----+-------+-------+
|HELLO | HELLO|HELLO|  HELLO|HELLO  |
|HELLO | HELLO|HELLO|  HELLO|HELLO  |
+------+------+-----+-------+-------+
only showing top 2 rows



In [254]:
spark.sql('''
    select 
    ltrim('  HELLO  '),
    rtrim('  HELLO  '),
    trim('  HELLO  '),
    lpad('HELLO',3," "),
    rpad('HELLO',10," ")
    from trainDFTable
''').show(3)

+----------------+----------------+---------------+-----------------+------------------+
|ltrim(  HELLO  )|rtrim(  HELLO  )|trim(  HELLO  )|lpad(HELLO, 3,  )|rpad(HELLO, 10,  )|
+----------------+----------------+---------------+-----------------+------------------+
|         HELLO  |           HELLO|          HELLO|              HEL|        HELLO     |
|         HELLO  |           HELLO|          HELLO|              HEL|        HELLO     |
|         HELLO  |           HELLO|          HELLO|              HEL|        HELLO     |
+----------------+----------------+---------------+-----------------+------------------+
only showing top 3 rows



In [255]:
from pyspark.sql.functions import regexp_replace
regex_string='F|M'

trainDF.select(
    regexp_replace(col('Gender'), regex_string, 'MALE_OR_FEMALE').alias('GENDER_DECODE'), col('GENDER')
).show(5)

+--------------+------+
| GENDER_DECODE|GENDER|
+--------------+------+
|MALE_OR_FEMALE|     F|
|MALE_OR_FEMALE|     F|
|MALE_OR_FEMALE|     F|
|MALE_OR_FEMALE|     F|
|MALE_OR_FEMALE|     M|
+--------------+------+
only showing top 5 rows



In [265]:
spark.sql('''
    select 
    regex_replace(Gender, "F|M", "MALE_OR_FEMALE") as DECODE_GENDER, GENDER from trainDFTable
''').show(4)

Py4JJavaError: An error occurred while calling o25.sql.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:735)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:270)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:286)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:978)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkOneDirWithMode(RawLocalFileSystem.java:660)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:700)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.ChecksumFileSystem.mkdirs(ChecksumFileSystem.java:788)
	at org.apache.spark.sql.catalyst.catalog.InMemoryCatalog.liftedTree1$1(InMemoryCatalog.scala:123)
	at org.apache.spark.sql.catalyst.catalog.InMemoryCatalog.createDatabase(InMemoryCatalog.scala:120)
	at org.apache.spark.sql.internal.SharedState.externalCatalog$lzycompute(SharedState.scala:153)
	at org.apache.spark.sql.internal.SharedState.externalCatalog(SharedState.scala:140)
	at org.apache.spark.sql.internal.BaseSessionStateBuilder.$anonfun$catalog$1(BaseSessionStateBuilder.scala:154)
	at org.apache.spark.sql.catalyst.catalog.SessionCatalog.externalCatalog$lzycompute(SessionCatalog.scala:121)
	at org.apache.spark.sql.catalyst.catalog.SessionCatalog.externalCatalog(SessionCatalog.scala:121)
	at org.apache.spark.sql.catalyst.catalog.SessionCatalog.databaseExists(SessionCatalog.scala:290)
	at org.apache.spark.sql.catalyst.catalog.SessionCatalog.isPersistentFunction(SessionCatalog.scala:1547)
	at org.apache.spark.sql.execution.datasources.v2.V2SessionCatalog.functionExists(V2SessionCatalog.scala:330)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$LookupFunctions$$anonfun$apply$23.applyOrElse(Analyzer.scala:2041)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$LookupFunctions$$anonfun$apply$23.applyOrElse(Analyzer.scala:2032)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$3(TreeNode.scala:589)
	at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren(TreeNode.scala:1228)
	at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren$(TreeNode.scala:1227)
	at org.apache.spark.sql.catalyst.expressions.UnaryExpression.mapChildren(Expression.scala:498)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:589)
	at org.apache.spark.sql.catalyst.plans.QueryPlan.$anonfun$transformExpressionsDownWithPruning$1(QueryPlan.scala:159)
	at org.apache.spark.sql.catalyst.plans.QueryPlan.$anonfun$mapExpressions$1(QueryPlan.scala:200)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
	at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpression$1(QueryPlan.scala:200)
	at org.apache.spark.sql.catalyst.plans.QueryPlan.recursiveTransform$1(QueryPlan.scala:211)
	at org.apache.spark.sql.catalyst.plans.QueryPlan.$anonfun$mapExpressions$3(QueryPlan.scala:216)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.immutable.List.foreach(List.scala:431)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.immutable.List.map(List.scala:305)
	at org.apache.spark.sql.catalyst.plans.QueryPlan.recursiveTransform$1(QueryPlan.scala:216)
	at org.apache.spark.sql.catalyst.plans.QueryPlan.$anonfun$mapExpressions$4(QueryPlan.scala:221)
	at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:427)
	at org.apache.spark.sql.catalyst.plans.QueryPlan.mapExpressions(QueryPlan.scala:221)
	at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionsDownWithPruning(QueryPlan.scala:159)
	at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionsWithPruning(QueryPlan.scala:130)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$$anonfun$resolveExpressionsWithPruning$1.applyOrElse(AnalysisHelper.scala:245)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$$anonfun$resolveExpressionsWithPruning$1.applyOrElse(AnalysisHelper.scala:244)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$2(AnalysisHelper.scala:170)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$1(AnalysisHelper.scala:170)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:323)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning(AnalysisHelper.scala:168)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning$(AnalysisHelper.scala:164)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsWithPruning(AnalysisHelper.scala:99)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsWithPruning$(AnalysisHelper.scala:96)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveExpressionsWithPruning(AnalysisHelper.scala:244)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveExpressionsWithPruning$(AnalysisHelper.scala:242)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveExpressionsWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$LookupFunctions$.apply(Analyzer.scala:2032)
	at org.apache.spark.sql.catalyst.analysis.Analyzer$LookupFunctions$.apply(Analyzer.scala:2028)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$2(RuleExecutor.scala:211)
	at scala.collection.IndexedSeqOptimized.foldLeft(IndexedSeqOptimized.scala:60)
	at scala.collection.IndexedSeqOptimized.foldLeft$(IndexedSeqOptimized.scala:68)
	at scala.collection.mutable.WrappedArray.foldLeft(WrappedArray.scala:38)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1(RuleExecutor.scala:208)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1$adapted(RuleExecutor.scala:200)
	at scala.collection.immutable.List.foreach(List.scala:431)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:200)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.org$apache$spark$sql$catalyst$analysis$Analyzer$$executeSameContext(Analyzer.scala:231)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$execute$1(Analyzer.scala:227)
	at org.apache.spark.sql.catalyst.analysis.AnalysisContext$.withNewAnalysisContext(Analyzer.scala:173)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:227)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:188)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$executeAndTrack$1(RuleExecutor.scala:179)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:88)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:179)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$executeAndCheck$1(Analyzer.scala:212)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:330)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:211)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$analyzed$1(QueryExecution.scala:76)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$2(QueryExecution.scala:185)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:510)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:185)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:184)
	at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:76)
	at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:74)
	at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:66)
	at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:99)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:97)
	at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:622)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:617)
	at jdk.internal.reflect.GeneratedMethodAccessor97.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.fileNotFoundException(Shell.java:547)
	at org.apache.hadoop.util.Shell.getHadoopHomeDir(Shell.java:568)
	at org.apache.hadoop.util.Shell.getQualifiedBin(Shell.java:591)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:688)
	at org.apache.hadoop.util.StringUtils.<clinit>(StringUtils.java:79)
	at org.apache.hadoop.conf.Configuration.getTimeDurationHelper(Configuration.java:1907)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1867)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1840)
	at org.apache.hadoop.util.ShutdownHookManager.getShutdownTimeout(ShutdownHookManager.java:183)
	at org.apache.hadoop.util.ShutdownHookManager$HookEntry.<init>(ShutdownHookManager.java:207)
	at org.apache.hadoop.util.ShutdownHookManager.addShutdownHook(ShutdownHookManager.java:304)
	at org.apache.spark.util.SparkShutdownHookManager.install(ShutdownHookManager.scala:181)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks$lzycompute(ShutdownHookManager.scala:50)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks(ShutdownHookManager.scala:48)
	at org.apache.spark.util.ShutdownHookManager$.addShutdownHook(ShutdownHookManager.scala:153)
	at org.apache.spark.util.ShutdownHookManager$.<init>(ShutdownHookManager.scala:58)
	at org.apache.spark.util.ShutdownHookManager$.<clinit>(ShutdownHookManager.scala)
	at org.apache.spark.util.Utils$.createTempDir(Utils.scala:343)
	at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:344)
	at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:901)
	at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)
	at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)
	at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)
	at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1046)
	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1055)
	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset.
	at org.apache.hadoop.util.Shell.checkHadoopHomeInner(Shell.java:467)
	at org.apache.hadoop.util.Shell.checkHadoopHome(Shell.java:438)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:515)
	... 22 more


In [266]:
from pyspark.sql.functions import translate
trainDF.select(
    translate(col("Gender"),"FM","01"),
    col("Gender")
).show(10)

SyntaxError: invalid non-printable character U+00A0 (2032349683.py, line 3)

In [272]:
from pyspark.sql.functions import current_date,current_timestamp
df_date =spark.range(10).withColumn("today",current_date()).withColumn("now",current_timestamp())
df_date.show(3,truncate=False)



+---+----------+--------------------------+
|id |today     |now                       |
+---+----------+--------------------------+
|0  |2023-04-13|2023-04-13 10:08:40.529102|
|1  |2023-04-13|2023-04-13 10:08:40.529102|
|2  |2023-04-13|2023-04-13 10:08:40.529102|
+---+----------+--------------------------+
only showing top 3 rows



In [268]:
df_date.createOrReplaceTempView("dataDFTable")

In [269]:
from pyspark.sql.functions import col
from pyspark.sql.functions import date_add,date_sub
df_date.select(date_sub(col("today"),5),date_add(col("today"),5)).show(5)

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
+------------------+------------------+
only showing top 5 rows



In [270]:
spark.sql('''
    select 
    date_sub(today, 5),
    date_add(today, 5)
    from dataDFTable
''').show()

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
|        2023-04-08|        2023-04-18|
+------------------+------------------+



In [271]:
from pyspark.sql.functions import datediff, months_between, to_date
df_date.withColumn('week_ago', date_sub(col('today'),7)).select(datediff(col('week_ago'),col('today'))).show(4)

+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
|                       -7|
|                       -7|
|                       -7|
+-------------------------+
only showing top 4 rows

