In [None]:
#  Last amended:06th Sep, 2022
#  Myfolder: /home/ashok/Documents/spark
# Ref:
# Tutorials (slightly dated):
#      https://changhsinlee.com/pyspark-dataframe-basics/
#      https://www.analyticsvidhya.com/blog/2016/10/spark-dataframe-and-operations/
# Cheat Sheet
#      https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PySpark_SQL_Cheat_Sheet_Python.pdf

#  Objectives:
#           Dataframe operations in spark cluster

pyspark APIs<br>
> i)  [DataFrame APIs](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#dataframe-apis)<br>
>> df.select(columnName).where(colObject > 30).orderBy(desc(columnName))<br>
>> df.select(columnName).where("colName > 30").orderBy(desc(columnName))<br>

> ii) [Column APIs](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#column-apis)<br>
>> df.select(df.age.isNull())<br>
>> df.select(df["age"].isNull())<br>
>> df.select(col("age").isNull())<br>

> iii) [Data Types](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#data-types)<br>
> iv) [Functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#functions)<br>
>> df.select(sum("age"))<br>
>> df.select(sum(col(booleanColumn).cast("int")))<br>
>> <u>but you must import the functions</u>

> v)  [Grouping](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#grouping)<br>

A. Initial operations:
1.0 Start hadoop in a terminal:

            ./allstart.sh
            OR
            ./quick_allstart.sh

## Transfer files to hadoop

In [2]:
# 1.1 Transfer data file 'blackfridayless.csv' to hadoop
#     Linux File folder:  /cdata/misc_datasets/black_friday
#     In Hadoop first make a folder: /user/ashok/datadir 
#     and then transfer the file 'blackfridayless.csv' to 
#     this folder: /user/ashok/datadir

"""

cd ~
hdfs dfs -rm -f -r  /user/ashok/datadir
hdfs dfs -mkdir /user/ashok/datadir
hdfs dfs -put /cdata/misc_datasets/black_friday/blackfridayless.csv  /user/ashok/datadir
hdfs dfs -ls /user/ashok/datadir


"""

!cd ~
!hdfs dfs -rm -f -r  /user/ashok/datadir
!hdfs dfs -mkdir /user/ashok/datadir
!hdfs dfs -put /cdata/misc_datasets/black_friday/blackfridayless.csv  /user/ashok/datadir
!hdfs dfs -ls /user/ashok/datadir




Deleted /user/ashok/datadir
Found 1 items
-rw-r--r--   1 ashok supergroup    4582364 2022-09-06 15:46 /user/ashok/datadir/blackfridayless.csv


## Set jupyter notebook options
Start pyspark with jupyter notebook interface. There is no need to create SparkContext and Spark session. pyspark creates them when starting.

In [3]:
# 1.2 Display multiple outputs from a cell:
#     with "all", the value of every expression statement in a cell
#     is shown, instead of only that of the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
# 1.3 Increase cell width to display wide columnar output.
#     'display' and 'HTML' are imported from IPython.display: importing
#     'display' from IPython.core.display is deprecated and makes the
#     cell print a DeprecationWarning.
from IPython.display import display, HTML
# Inject a CSS rule that stretches the notebook container to full width.
display(HTML("<style>.container { width:100% !important; }</style>"))

  from IPython.core.display import display, HTML


### Read the csv file from hadoop

In [None]:
###### Read file 'blackfridayless.csv' from hadoop

# 2.0 What is the URL of the hadoop folder that holds the blackfriday file?
#     url: "hdfs://localhost:9000/<folderPath>/"
#     (hdfs:// scheme and port 9000, as in the other HDFS paths of this
#      notebook; end with '/' because the file name is appended directly)
#     Fill in the URL below - it is left empty here.


URL_of_folder= ""

In [None]:
# 2.1 Read the file blackfridayless.csv. Takes time.
#      Use 'spark.read.csv' session object to read file:
#       Here is reading template:

blackfriday = spark.read.csv(
                             path = URL_of_folder + <filename> ,
                             inferSchema =                  # True or False
                             header =                       # True or False
                             sep =                          # Which one: , ;, | etc
                             ignoreLeadingWhiteSpace =      # True or False
                             ignoreTrailingWhiteSpace =     # True of False
    
                            )

## Explore the dataframe

In [None]:
# 2.2 Show five rows of data:




In [None]:
# 2.3 Show data columns




In [None]:
# 2.4 Show dtypes:



In [None]:
# 2.5 Print schema of blackfriday:



In [None]:
# 3.0 Describe the statistics of data, few columns at a time:



In [None]:
# 3.1 Count How many distinct userids are there 
#     Use distinct() and count()




In [None]:
# 3.2 Count how many distinct age-groups exist



In [None]:
# 4.0 How many null values occur in each column
#     Caution: this import rebinds the names 'sum' and 'max' to the
#     pyspark column functions, so from here on they no longer refer
#     to the Python builtins of the same name.

from pyspark.sql.functions import isnan, isnull,col, sum, max

In [None]:
# 4.1 Count the nulls of every column in ONE pass over the data
#     (a loop with one select().show() per column starts a separate
#     Spark job for each column).
#     isNull() gives a boolean per row; cast to int (1/0) and add up.
#     'sum' and 'col' are the pyspark.sql.functions imported in 4.0,
#     not the Python builtin sum.
null_counts = [sum(col(c).isNull().cast("int")).alias(c)
               for c in blackfriday.columns]
# One output row: each column shows its own number of nulls.
blackfriday.select(null_counts).show()

These columns have null values. Most probably it means that there is no sub-category or sub-categories present. <br>
How would you plan to fill them?<br>
productCat1 :  0 <br>
productCat2 :  31429 <br>
productCat3 :  70100 <br>

In [None]:
# 5.0 Get a list of all integer columns and string columns
#     Use list comprehension along with dtypes:



In [None]:
# 5.1 Display maximum of productCat2 and productCat3
#     Use select() along with 'max' function



In [None]:
# 5.2 Find minimum and max values of 'occupation' column



In [None]:
# 5.3 Fill null values in productCat2 and productCat3 with 999
#     Use df.na.fill()



In [None]:
# 6.0 Transform spark dataframe to pandas dataframe:
#     Use df.toPandas()




In [None]:
# 6.1 Show a value count of levels of column 'cityCategory':
#      Use groupby and count




In [None]:
# 7.0 Perform a stratified sampling of data.
#      Strata are defined by column: 'cityCategory'
#        Keep about 80% of the 'B' rows and about 20% of the 'C' rows.
#        A stratum that is not listed in 'fractions' is treated as
#        fraction 0, i.e. none of its rows are sampled.
#        The fixed seed makes the sample repeatable from run to run.

sample = blackfriday.sampleBy(
                      "cityCategory",                        # column that defines strata
                      fractions = {'B' : 0.8, 'C' : 0.2},    # sampling fraction for each stratum
                      seed = 42)                             # fixed seed => reproducible sample

## Using verbs
>select, <br>
><i>select(x).where()</i>,<br>
><i>select().distinct()</i>,<br>
>filter,<br>
>groupby

### select syntax
> DataFrame.select(\*cols)<br>
> cols: column names (string) or expressions (Column). If one of the column names is ‘*’, that column is expanded to include all columns in the current DataFrame.


In [None]:
# 8.0 Show columns 3rd till 5th



### filter syntax
>DataFrame.filter(condition)<br>
>condition: <i>columnObject > 34</i> or string format: <i>"age > 34"</i>
>>  df.age > 3 or col("age") > 3<br>
>>  "age > 3" <br>
>>Logical Operators<br>
>>> If string: AND OR NOT<br>
>>> If columnObject: &, |, ~ <br>

In [None]:
# 8.1 Filter purchases less than 9000
#      Use filter()



In [None]:
# 8.2 Filter purchases less than 9000 and maritalStatus is 0



In [None]:
# 8.3 Filter purchases less than 9000 or maritalStatus is 0



In [None]:
# 9.0 Combining verbs: select, filter and distinct
# NOTE(review): 'airports_df' is not created in any cell shown in this
# notebook - it has to exist before this cell is run.

# 9.1 Keep two columns, then only the rows whose tz equals -5
(
    airports_df
    .select('dst', 'tz')
    .filter(airports_df.tz == -5)
    .show(3)
)

# 9.2 Same query, with duplicate rows removed before display
(
    airports_df
    .select('dst', 'tz')
    .filter(airports_df.tz == -5)
    .distinct()
    .show(3)
)

## Aggregation with groupby
Use: <i>.agg({'colName1' : 'mean', 'colName2' : 'sum'})</i> <br>
>With the dictionary form of <i>agg()</i>, each value must be the name (a string) of a builtin aggregate function; other <i>pyspark.sql.functions</i> cannot be used this way.<br>
Some common functions are: <i>mean, avg, sum, count, first, last, stddev</i>. There is no need to import these builtin functions in advance.<br>
For a complete list of builtin functions see [here](https://sparkbyexamples.com/pyspark/pyspark-aggregate-functions/).


In [None]:
# Find max of 'maritalStatus' and max of 'purchase'



In [None]:
# 12. groupby. Can apply sum, min, max, count

# 12.0 Number of rows in each 'tz' group
(
    airports_df
    .groupby('tz')
    .count()
    .show(3)
)

# 12.1 Mean of 'lat' within each 'tz' group
(
    airports_df
    .groupby('tz')
    .agg({'lat' : 'mean'})
    .show(3)
)

In [None]:
# Illustration only: the text below is a plain string, so none of it is executed.
# If typed in, fox(m) would raise a TypeError because the list is passed as the
# single argument 'a' and 'b' is missing, whereas fox(*m) spreads the list over
# both parameters and returns 12.
"""
Unpacking operator in python (*) : 
Ref: https://codeyarns.com/2012/04/26/unpack-operator-in-python/ 

def fox(a,b):  
    return (a *b)  

m = [3,4]  
fox(m)    
fox(*m)   

"""

In [None]:
# 12.2 Group averages of several columns at once: the list of
#       column names is unpacked (*) into separate arguments of avg().

avg_cols = ['lat', 'lon']
grObject = airports_df.groupby('tz')

grObject.avg(*avg_cols).show(3)

In [None]:
# 12.3 Several aggregations in one call: pass a dictionary.
#         Each 'key' is handed as argument to the function named by its 'value':
#                             count(*)        avg(lat)      sum(lon)

(
    grObject
    .agg({
        '*'   : 'count',
        'lat' : 'avg',
        'lon' : 'sum',
    })
    .show(2)
)

## Column manipulation

In [None]:
# 8. Create new columns in Spark using .withColumn() --mutate
#      New column: altInThousands . 
#      Product of two columns:  'alt' and  'lon' 




In [None]:
# 9. Save the dataframe with the additional column in parquet form.
#    mode("overwrite") replaces the output of an earlier run; with the
#    default save mode the write raises an error when the target path
#    already exists, so the cell could be run only once.
#    NOTE(review): despite its name, 'altInThousands' holds alt * lon.

xyz = airports_df.withColumn('altInThousands', airports_df.alt*airports_df.lon)
xyz.write.mode("overwrite").parquet("hdfs://localhost:9000/user/ashok/data_files/airports_extra.parquet")

In [None]:
# 9.1 Drop the Python name 'xyz'
#     'del' removes only the notebook variable and gc.collect() runs
#     Python's garbage collector; neither call is a Spark cache operation.
import gc
del xyz
gc.collect()    # force a Python garbage-collection pass

In [None]:
# 9.2 Load the stored parquet file back into a dataframe
#     and display its first three rows
df = spark.read.parquet(
    "hdfs://localhost:9000/user/ashok/data_files/airports_extra.parquet"
)
df.show(3)

## Joining tables

In [None]:
# 9.3 Load 'weather.csv' from hadoop into a spark dataframe.
#     The first line of the file supplies the column names and
#     the column types are inferred from the data.

URL_of_file= "hdfs://localhost:9000/user/ashok/data_files/nycflights/"
weather_df = spark.read.csv(
    URL_of_file + "weather.csv",
    header = True,
    inferSchema = True,
)
weather_df.show(3)

In [None]:
# 10. Joins
# Refer: http://www.learnbymarketing.com/1100/pyspark-joins-by-example/
# Every join below matches rows where airports_df.faa equals weather_df.origin.

# 10.1 No 'how' argument: the default join type (inner) is used
airports_df.join(
    weather_df,
    airports_df.faa == weather_df.origin,
).show(3)

# 10.2 Inner join, requested explicitly
airports_df.join(
    weather_df,
    airports_df.faa == weather_df.origin,
    how = 'inner',
).show(3)

# 10.3 Left join; 'left_outer', 'right' and 'full' are further choices for 'how'
airports_df.join(
    weather_df,
    airports_df.faa == weather_df.origin,
    how = 'left',
).show(3)


## SQL queries against DataFrame

In [None]:
# 11. Many of the operations can be accessed by writing SQL queries in spark.sql().
# To make an existing Spark dataframe usable for spark.sql(), one needs to
#   register said dataframe as a temporary view.

# 11.1 As an example, we can register the two dataframes as temp views then
#      join them through spark.sql().
#      The names given here ('dfa_temp', 'dfw_temp') are the table names
#      to use inside the SQL text.

airports_df.createOrReplaceTempView('dfa_temp')
weather_df.createOrReplaceTempView('dfw_temp')

In [None]:
# 11.2 Simple SQL query. SQLContext is no longer needed. 'spark'
#            session object can be used.
#      spark.sql() returns a dataframe; here it holds all columns
#      of the view 'dfa_temp'.

dfj = spark.sql('select * from dfa_temp' )
dfj.show(3)

In [None]:
# 11.3 Now the SQL join.
#      The two views are listed comma-separated and matched in the
#      WHERE clause: rows pair up when a.faa equals b.origin.
#      Note that 'dfj' from the previous query is reassigned here.

dfj = spark.sql('select * from dfa_temp a, dfw_temp b where a.faa = b.origin' )
dfj.show(3)


## Misc

In [None]:
# 12. Dropping columns (the result is assigned or shown; airports_df itself is not reassigned)

# 12.0 A single column, named directly
(
    airports_df
    .drop('name')
    .show(3)
)

# 12.1 Several columns: unpack a list of names into drop()
columns_to_drop = ['name', 'lat']
xx =airports_df.drop(*columns_to_drop)
xx.show(3)

In [None]:
########### I am done ####################